From 67f6a90fba62b7af9653d4c185039d84573d235a Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 19 Jun 2025 06:58:58 +0000 Subject: [PATCH 001/224] refactor: update dj package to use new name --- src/orcapod/dj/mapper.py | 6 +++--- src/orcapod/dj/operation.py | 4 ++-- src/orcapod/dj/source.py | 12 ++++++------ src/orcapod/dj/stream.py | 2 +- src/orcapod/dj/tracker.py | 12 ++++++------ src/orcapod/pod/core.py | 4 ++++ 6 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/orcapod/dj/mapper.py b/src/orcapod/dj/mapper.py index a38fdaf..efec07c 100644 --- a/src/orcapod/dj/mapper.py +++ b/src/orcapod/dj/mapper.py @@ -1,18 +1,18 @@ import warnings from typing import Optional -from orcapod.mappers import Join, MapPackets, Mapper, MapTags +from orcapod.core.operators import Join, MapPackets, MapTags, Operator from .operation import QueryOperation from .stream import QueryStream -class QueryMapper(QueryOperation, Mapper): +class QueryMapper(QueryOperation, Operator): """ A special type of mapper that returns and works with QueryStreams """ -def convert_to_query_mapper(operation: Mapper) -> QueryMapper: +def convert_to_query_mapper(operation: Operator) -> QueryMapper: """ Convert a generic mapper to an equivalent, Query mapper """ diff --git a/src/orcapod/dj/operation.py b/src/orcapod/dj/operation.py index d4d5a81..70b218e 100644 --- a/src/orcapod/dj/operation.py +++ b/src/orcapod/dj/operation.py @@ -1,8 +1,8 @@ -from ..base import Operation +from orcapod.core.base import Kernel from .stream import QueryStream -class QueryOperation(Operation): +class QueryOperation(Kernel): """ A special type of operation that returns and works with QueryStreams diff --git a/src/orcapod/dj/source.py b/src/orcapod/dj/source.py index 8af3f23..0eaa6dc 100644 --- a/src/orcapod/dj/source.py +++ b/src/orcapod/dj/source.py @@ -6,12 +6,12 @@ from orcapod.hashing import hash_to_uuid -from orcapod.sources import Source -from orcapod.streams import SyncStream -from ..utils.name import pascal_to_snake, snake_to_pascal -from ..utils.stream_utils import common_elements -from .operation import QueryOperation -from .stream import QueryStream, TableCachedStream, TableStream +from orcapod.core.sources import Source +from orcapod.core.streams import SyncStream +from orcapod.utils.name import pascal_to_snake, snake_to_pascal +from orcapod.utils.stream_utils import common_elements +from orcapod.dj.operation import QueryOperation +from orcapod.dj.stream import QueryStream, TableCachedStream, TableStream logger = logging.getLogger(__name__) diff --git a/src/orcapod/dj/stream.py b/src/orcapod/dj/stream.py index 3e4eb08..e8e7195 100644 --- a/src/orcapod/dj/stream.py +++ b/src/orcapod/dj/stream.py @@ -5,7 +5,7 @@ from datajoint.expression import QueryExpression from datajoint.table import Table -from orcapod.streams import SyncStream +from orcapod.core.streams import SyncStream logger = logging.getLogger(__name__) diff --git a/src/orcapod/dj/tracker.py b/src/orcapod/dj/tracker.py index b137e54..24df900 100644 --- a/src/orcapod/dj/tracker.py +++ b/src/orcapod/dj/tracker.py @@ -6,10 +6,10 @@ import networkx as nx from datajoint import Schema -from orcapod.base import Operation, Source -from orcapod.mappers import Mapper, Merge +from orcapod.core.base import Kernel, Source +from orcapod.core.operators import Operator, Merge from orcapod.pod import FunctionPod -from orcapod.pipeline import GraphTracker +from orcapod.core.tracker import GraphTracker from .mapper import convert_to_query_mapper from .operation import 
QueryOperation @@ -19,7 +19,7 @@ def convert_to_query_operation( - operation: Operation, + operation: Kernel, schema: Schema, table_name: str = None, table_postfix: str = "", @@ -68,7 +68,7 @@ def convert_to_query_operation( True, ) - if isinstance(operation, Mapper): + if isinstance(operation, Operator): return convert_to_query_mapper(operation), True # operation conversion is not supported, raise an error @@ -102,7 +102,7 @@ def generate_tables( for invocation in nx.topological_sort(G): streams = [edge_lut.get(stream, stream) for stream in invocation.streams] new_node, converted = convert_to_query_operation( - invocation.operation, + invocation.kernel, schema, table_name=None, table_postfix=invocation.content_hash_int(), diff --git a/src/orcapod/pod/core.py b/src/orcapod/pod/core.py index a82944f..5c3ee67 100644 --- a/src/orcapod/pod/core.py +++ b/src/orcapod/pod/core.py @@ -303,6 +303,10 @@ def generator() -> Iterator[tuple[Tag, Packet]]: elif self.error_handling == "warn": warnings.warn(f"Error processing packet {packet}: {e}") continue + else: + raise ValueError( + f"Unknown error handling mode: {self.error_handling} encountered while handling error:" + ) from e output_packet: Packet = { k: v for k, v in zip(self.output_keys, output_values) From ac228b02e22b64b5e17d095f6d78b58bcea3cc55 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 19 Jun 2025 20:48:35 +0000 Subject: [PATCH 002/224] feat: add ability to skip computation in pod --- src/orcapod/pod/core.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/orcapod/pod/core.py b/src/orcapod/pod/core.py index 5c3ee67..069e0de 100644 --- a/src/orcapod/pod/core.py +++ b/src/orcapod/pod/core.py @@ -186,6 +186,7 @@ def __init__( function_hash_mode: Literal["signature", "content", "name", "custom"] = "name", custom_hash: int | None = None, label: str | None = None, + skip_computation: bool = False, force_computation: bool = False, skip_memoization_lookup: bool = False, skip_memoization: bool = False, @@ -209,6 +210,7 @@ def __init__( self.store_name = store_name or function_name self.function_hash_mode = function_hash_mode self.custom_hash = custom_hash + self.skip_computation = skip_computation self.force_computation = force_computation self.skip_memoization_lookup = skip_memoization_lookup self.skip_memoization = skip_memoization @@ -277,6 +279,9 @@ def generator() -> Iterator[tuple[Tag, Packet]]: logger.info("Memoized packet found, skipping computation") yield tag, memoized_packet continue + if self.skip_computation: + logger.info("Skipping computation as per configuration") + continue values = self.function(**packet) if len(self.output_keys) == 0: From 450ec90aa8e0b8e33eac9433635bde99dcc5b763 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 24 Jun 2025 20:18:09 +0000 Subject: [PATCH 003/224] refactor: major change of structure and implementation of pipeline --- pyproject.toml | 2 +- src/orcapod/__init__.py | 4 +- src/orcapod/core/__init__.py | 13 + src/orcapod/core/base.py | 58 +- src/orcapod/core/operators.py | 48 +- src/orcapod/core/pod.py | 311 +++++++ src/orcapod/core/pod_legacy.py | 373 ++++++++ src/orcapod/core/tracker.py | 48 +- src/orcapod/dj/pod.py | 2 +- src/orcapod/dj/tracker.py | 2 +- src/orcapod/hashing/core.py | 35 +- src/orcapod/hashing/hashing_legacy.py | 269 ------ src/orcapod/hashing/types.py | 4 +- src/orcapod/pipeline/pipeline.py | 706 ++------------ src/orcapod/pipeline/wrappers.py | 667 +++++++++++++ src/orcapod/pod/__init__.py | 9 - src/orcapod/pod/core.py | 877 ------------------ src/orcapod/store/__init__.py | 5 +- src/orcapod/store/arrow_data_stores.py | 587 ++++++++++-- .../store/{core.py => dict_data_stores.py} | 0 src/orcapod/store/file.py | 159 ---- src/orcapod/store/file_ops.py | 158 +++- src/orcapod/store/optimized_memory_store.py | 433 +++++++++ .../{transfer.py => transfer_data_store.py} | 0 src/orcapod/store/types.py | 1 + src/orcapod/types/__init__.py | 79 +- src/orcapod/types/core.py | 49 +- src/orcapod/types/default.py | 18 - src/orcapod/types/registry.py | 10 +- .../types/{inference.py => typespec.py} | 9 +- uv.lock | 18 +- 31 files changed, 2773 insertions(+), 2181 deletions(-) create mode 100644 src/orcapod/core/pod.py create mode 100644 src/orcapod/core/pod_legacy.py delete mode 100644 src/orcapod/hashing/hashing_legacy.py create mode 100644 src/orcapod/pipeline/wrappers.py delete mode 100644 src/orcapod/pod/__init__.py delete mode 100644 src/orcapod/pod/core.py rename src/orcapod/store/{core.py => dict_data_stores.py} (100%) delete mode 100644 src/orcapod/store/file.py create mode 100644 src/orcapod/store/optimized_memory_store.py rename src/orcapod/store/{transfer.py => transfer_data_store.py} (100%) delete mode 100644 src/orcapod/types/default.py rename src/orcapod/types/{inference.py => typespec.py} (98%) diff --git a/pyproject.toml b/pyproject.toml index ca1c20c..aa23332 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "pandas>=2.2.3", "pyyaml>=6.0.2", "pyarrow>=20.0.0", - "polars>=1.30.0", + "polars>=1.31.0", "beartype>=0.21.0", ] readme = "README.md" diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index a84492a..db457e9 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,8 +1,8 @@ from .core import operators, sources, streams from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator -from . import hashing, pod, store +from . 
import hashing, store from .core.operators import Join, MapPackets, MapTags, packet, tag -from .pod import FunctionPod, function_pod +from .core.pod import FunctionPod, function_pod from .core.sources import GlobSource from .store import DirDataStore, SafeDirDataStore from .core.tracker import GraphTracker diff --git a/src/orcapod/core/__init__.py b/src/orcapod/core/__init__.py index e69de29..d236681 100644 --- a/src/orcapod/core/__init__.py +++ b/src/orcapod/core/__init__.py @@ -0,0 +1,13 @@ +from .base import Kernel, Invocation, Stream, SyncStream, Source +from .operators import Operator +from .pod import Pod + +__all__ = [ + "Kernel", + "Operator", + "Invocation", + "Stream", + "SyncStream", + "Source", + "Pod", +] diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 0b1ed63..664352d 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -2,7 +2,7 @@ import threading from abc import ABC, abstractmethod from collections.abc import Callable, Collection, Iterator -from typing import Any, TypeVar, Hashable +from typing import Any from orcapod.hashing import HashableMixin @@ -27,9 +27,10 @@ class Kernel(ABC, HashableMixin): for computational graph tracking. """ - def __init__(self, label: str | None = None, **kwargs) -> None: + def __init__(self, label: str | None = None, skip_tracking: bool = False, **kwargs) -> None: super().__init__(**kwargs) self._label = label + self._skip_tracking = skip_tracking @property def label(self) -> str: @@ -40,29 +41,51 @@ def label(self) -> str: if self._label: return self._label return self.__class__.__name__ - + @label.setter def label(self, label: str) -> None: self._label = label + def pre_forward_hook( + self, *streams: "SyncStream", **kwargs + ) -> tuple["SyncStream", ...]: + """ + A hook that is called before the forward method is invoked. + This can be used to perform any pre-processing or validation on the input streams. + Subclasses can override this method to provide custom behavior. + """ + return streams + + def post_forward_hook(self, output_stream: "SyncStream", **kwargs) -> "SyncStream": + """ + A hook that is called after the forward method is invoked. + This can be used to perform any post-processing on the output stream. + Subclasses can override this method to provide custom behavior. 
+ """ + return output_stream + + def __call__(self, *streams: "SyncStream", **kwargs) -> "SyncStream": # Special handling of Source: trigger call on source if passed as stream normalized_streams = [ stream() if isinstance(stream, Source) else stream for stream in streams ] - output_stream = self.forward(*normalized_streams, **kwargs) + pre_processed_streams = self.pre_forward_hook(*normalized_streams, **kwargs) + output_stream = self.forward(*pre_processed_streams, **kwargs) + post_processed_stream = self.post_forward_hook(output_stream, **kwargs) # create an invocation instance - invocation = Invocation(self, normalized_streams) + invocation = Invocation(self, pre_processed_streams) # label the output_stream with the invocation that produced the stream - output_stream.invocation = invocation + post_processed_stream.invocation = invocation - # register the invocation to all active trackers - active_trackers = Tracker.get_active_trackers() - for tracker in active_trackers: - tracker.record(invocation) + if not self._skip_tracking: + # register the invocation to all active trackers + active_trackers = Tracker.get_active_trackers() + for tracker in active_trackers: + tracker.record(invocation) - return output_stream + return post_processed_stream @abstractmethod def forward(self, *streams: "SyncStream") -> "SyncStream": @@ -98,7 +121,7 @@ def identity_structure(self, *streams: "SyncStream") -> Any: logger.warning( f"Identity structure not implemented for {self.__class__.__name__}" ) - return (self.__class__.__name__,) + tuple(streams) + return (self.__class__.__name__,) + streams def keys( self, *streams: "SyncStream", trigger_run: bool = False @@ -365,6 +388,8 @@ def keys( tag_keys, packet_keys = self.invocation.keys() if tag_keys is not None and packet_keys is not None: return tag_keys, packet_keys + if not trigger_run: + return None, None # otherwise, use the keys from the first packet in the stream # note that this may be computationally expensive tag, packet = next(iter(self)) @@ -386,6 +411,8 @@ def types(self, *, trigger_run=False) -> tuple[TypeSpec | None, TypeSpec | None] tag_types, packet_types = self.invocation.types() if not trigger_run or (tag_types is not None and packet_types is not None): return tag_types, packet_types + if not trigger_run: + return None, None # otherwise, use the keys from the first packet in the stream # note that this may be computationally expensive tag, packet = next(iter(self)) @@ -488,13 +515,6 @@ def claims_unique_tags(self, *, trigger_run=False) -> bool | None: return True -class Operator(Kernel): - """ - A Mapper is an operation that does NOT generate new file content. - It is used to control the flow of data in the pipeline without modifying or creating data content. - """ - - class Source(Kernel, SyncStream): """ A base class for all sources in the system. 
A source can be seen as a special diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index 093167b..84a31f3 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -3,9 +3,9 @@ from itertools import chain from typing import Any - -from orcapod.core.base import Operator, SyncStream +from orcapod.types import Packet, Tag, TypeSpec from orcapod.hashing import function_content_hash, hash_function +from orcapod.core.base import Kernel, SyncStream from orcapod.core.streams import SyncStreamFromGenerator from orcapod.utils.stream_utils import ( batch_packet, @@ -16,7 +16,12 @@ merge_typespecs, ) -from orcapod.types import Packet, Tag, TypeSpec + +class Operator(Kernel): + """ + A Mapper is an operation that does NOT generate new file content. + It is used to control the flow of data in the pipeline without modifying or creating data content. + """ class Repeat(Operator): @@ -186,12 +191,43 @@ def claims_unique_tags( return True +def union_lists(left, right): + if left is None or right is None: + return None + output = list(left) + for item in right: + if item not in output: + output.append(item) + return output + class Join(Operator): def identity_structure(self, *streams): # Join does not depend on the order of the streams -- convert it onto a set return (self.__class__.__name__, set(streams)) + def keys( + self, *streams: SyncStream, trigger_run=False + ) -> tuple[Collection[str] | None, Collection[str] | None]: + """ + Returns the types of the operation. + The first list contains the keys of the tags, and the second list contains the keys of the packets. + The keys are returned if it is feasible to do so, otherwise a tuple + (None, None) is returned to signify that the keys are not known. + """ + if len(streams) != 2: + raise ValueError("Join operation requires exactly two streams") + + left_stream, right_stream = streams + left_tag_keys, left_packet_keys = left_stream.keys(trigger_run=trigger_run) + right_tag_keys, right_packet_keys = right_stream.keys(trigger_run=trigger_run) + + # TODO: do error handling when merge fails + joined_tag_keys = union_lists(left_tag_keys, right_tag_keys) + joined_packet_keys = union_lists(left_packet_keys, right_packet_keys) + + return joined_tag_keys, joined_packet_keys + def types( self, *streams: SyncStream, trigger_run=False ) -> tuple[TypeSpec | None, TypeSpec | None]: @@ -225,8 +261,10 @@ def forward(self, *streams: SyncStream) -> SyncStream: left_stream, right_stream = streams def generator() -> Iterator[tuple[Tag, Packet]]: - for left_tag, left_packet in left_stream: - for right_tag, right_packet in right_stream: + left_stream_buffered = list(left_stream) + right_stream_buffered = list(right_stream) + for left_tag, left_packet in left_stream_buffered: + for right_tag, right_packet in right_stream_buffered: if (joined_tag := join_tags(left_tag, right_tag)) is not None: if not check_packet_compatibility(left_packet, right_packet): raise ValueError( diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py new file mode 100644 index 0000000..582fa85 --- /dev/null +++ b/src/orcapod/core/pod.py @@ -0,0 +1,311 @@ +import logging +import warnings +import sys +from collections.abc import Callable, Collection, Iterable, Iterator, Sequence +from typing import ( + Any, + Literal, +) + +from orcapod.types import Packet, Tag, TypeSpec, default_registry +from orcapod.types.typespec import extract_function_typespecs +from orcapod.types.registry import PacketConverter + +from orcapod.hashing import ( + 
FunctionInfoExtractor, + get_function_signature, +) +from orcapod.core import Kernel +from orcapod.core.operators import Join +from orcapod.core.streams import ( + SyncStream, + SyncStreamFromGenerator, +) + +logger = logging.getLogger(__name__) + + +class Pod(Kernel): + """ + An (abstract) base class for all pods. A pod can be seen as a special type of operation that + only operates on the packet content without reading tags. Consequently, no operation + of Pod can depend on the tags of the packets. This is a design choice to ensure that + the pods act as pure functions which is a necessary condition to guarantee reproducibility. + """ + + def __init__( + self, error_handling: Literal["raise", "ignore", "warn"] = "raise", **kwargs + ): + super().__init__(**kwargs) + self._active = True + self.error_handling = error_handling + + def is_active(self) -> bool: + """ + Check if the pod is active. If not, it will not process any packets. + """ + return self._active + + def set_active(self, active: bool) -> None: + """ + Set the active state of the pod. If set to False, the pod will not process any packets. + """ + self._active = active + + + def process_stream(self, *streams: SyncStream) -> tuple[SyncStream, ...]: + """ + Prepare the incoming streams for execution in the pod. This default implementation + joins all the input streams together. + """ + # if multiple streams are provided, join them + # otherwise, return as is + combined_streams = list(streams) + if len(streams) > 1: + stream = streams[0] + for next_stream in streams[1:]: + stream = Join()(stream, next_stream) + combined_streams = [stream] + return tuple(combined_streams) + + def pre_forward_hook( + self, *streams: SyncStream, **kwargs + ) -> tuple[SyncStream, ...]: + return self.process_stream(*streams) + + def generator_completion_hook(self, n_computed: int) -> None: + """ + Hook that is called when the generator is completed. This can be used to + perform any finalization steps, such as closing resources or logging. + """ + logger.debug(f"Generator completed with {n_computed} items processed.") + + def forward(self, *streams: SyncStream) -> SyncStream: + # at this point, streams should have been joined into one + assert len(streams) == 1, "Only one stream is supported in forward() of Pod" + stream = streams[0] + + def generator() -> Iterator[tuple[Tag, Packet]]: + n_computed = 0 + for tag, packet in stream: + try: + tag, output_packet = self.call(tag, packet) + if output_packet is None: + logger.debug( + f"Call returned None as output for tag {tag}. Skipping..." + ) + continue + n_computed += 1 + logger.debug(f"Computed item {n_computed}") + yield tag, output_packet + + except Exception as e: + logger.error(f"Error processing packet {packet}: {e}") + if self.error_handling == "raise": + raise e + elif self.error_handling == "warn": + warnings.warn(f"Error processing packet {packet}: {e}") + continue + elif self.error_handling == "ignore": + continue + else: + raise ValueError( + f"Unknown error handling mode: {self.error_handling} encountered while handling error:" + ) from e + self.generator_completion_hook(n_computed) + + return SyncStreamFromGenerator(generator) + + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: ... + + +def function_pod( + output_keys: str | Collection[str] | None = None, + function_name: str | None = None, + label: str | None = None, + **kwargs, +) -> Callable[..., "FunctionPod"]: + """ + Decorator that wraps a function in a FunctionPod instance. 
+ + Args: + output_keys: Keys for the function output(s) + function_name: Name of the function pod; if None, defaults to the function name + **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details. + + Returns: + FunctionPod instance wrapping the decorated function + """ + + def decorator(func) -> FunctionPod: + if func.__name__ == "<lambda>": + raise ValueError("Lambda functions cannot be used with function_pod") + + if not hasattr(func, "__module__") or func.__module__ is None: + raise ValueError( + f"Function {func.__name__} must be defined at module level" + ) + + # Store the original function in the module for pickling purposes + # and make sure to change the name of the function + module = sys.modules[func.__module__] + base_function_name = func.__name__ + new_function_name = f"_original_{func.__name__}" + setattr(module, new_function_name, func) + # rename the function to be consistent and make it pickleable + setattr(func, "__name__", new_function_name) + setattr(func, "__qualname__", new_function_name) + + # Create a simple typed function pod + pod = FunctionPod( + function=func, + output_keys=output_keys, + function_name=function_name or base_function_name, + label=label, + **kwargs, + ) + return pod + + return decorator + + +class FunctionPod(Pod): + def __init__( + self, + function: Callable[..., Any], + output_keys: str | Collection[str] | None = None, + function_name=None, + input_types: TypeSpec | None = None, + output_types: TypeSpec | Sequence[type] | None = None, + label: str | None = None, + packet_type_registry=None, + function_info_extractor: FunctionInfoExtractor | None = None, + **kwargs, + ) -> None: + self.function = function + if output_keys is None: + output_keys = [] + if isinstance(output_keys, str): + output_keys = [output_keys] + self.output_keys = output_keys + if function_name is None: + if hasattr(self.function, "__name__"): + function_name = getattr(self.function, "__name__") + else: + raise ValueError( + "function_name must be provided if function has no __name__ attribute" + ) + self.function_name = function_name + super().__init__(label=label or self.function_name, **kwargs) + + if packet_type_registry is None: + # TODO: reconsider the use of default registry here + packet_type_registry = default_registry + + self.registry = packet_type_registry + self.function_info_extractor = function_info_extractor + + # extract input and output types from the function signature + self.function_input_typespec, self.function_output_typespec = ( + extract_function_typespecs( + self.function, + self.output_keys, + input_types=input_types, + output_types=output_types, + ) + ) + + self.input_converter = PacketConverter(self.function_input_typespec, self.registry) + self.output_converter = PacketConverter( + self.function_output_typespec, self.registry + ) + + def get_function_typespecs(self) -> tuple[TypeSpec, TypeSpec]: + return self.function_input_typespec, self.function_output_typespec + + + def __repr__(self) -> str: + return f"FunctionPod:{self.function!r}" + + def __str__(self) -> str: + func_sig = get_function_signature(self.function) + return f"FunctionPod:{func_sig} ⇒ {self.output_keys}" + + def call(self, tag, packet) -> tuple[Tag, Packet | None]: + if not self.is_active(): + logger.info( + f"Pod is not active: skipping computation on input packet {packet}" + ) + return tag, None + output_values = [] + + values = self.function(**packet) + + if len(self.output_keys) == 0: + output_values = 
[] + elif len(self.output_keys) == 1: + output_values = [values] # type: ignore + elif isinstance(values, Iterable): + output_values = list(values) # type: ignore + elif len(self.output_keys) > 1: + raise ValueError( + "Values returned by function must be a pathlike or a sequence of pathlikes" + ) + + if len(output_values) != len(self.output_keys): + raise ValueError( + f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" + ) + + output_packet: Packet = {k: v for k, v in zip(self.output_keys, output_values)} + return tag, output_packet + + def identity_structure(self, *streams) -> Any: + # construct identity structure for the function + # if function_info_extractor is available, use that but substitute the function_name + if self.function_info_extractor is not None: + function_info = self.function_info_extractor.extract_function_info( + self.function, + function_name=self.function_name, + input_typespec=self.function_input_typespec, + output_typespec=self.function_output_typespec, + ) + else: + # use basic information only + function_info = { + "name": self.function_name, + "input_typespec": self.function_input_typespec, + "output_typespec": self.function_output_typespec, + } + function_info["output_keys"] = tuple(self.output_keys) + + return ( + self.__class__.__name__, + function_info, + ) + streams + + def keys( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[Collection[str] | None, Collection[str] | None]: + stream = self.process_stream(*streams) + if len(stream) < 1: + tag_keys = None + else: + tag_keys, _ = stream[0].keys(trigger_run=trigger_run) + return tag_keys, tuple(self.output_keys) + + def types( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[TypeSpec | None, TypeSpec | None]: + stream = self.process_stream(*streams) + if len(stream) < 1: + tag_typespec = None + else: + tag_typespec, _ = stream[0].types(trigger_run=trigger_run) + return tag_typespec, self.function_output_typespec + + def claims_unique_tags( + self, *streams: SyncStream, trigger_run: bool = False + ) -> bool | None: + stream = self.process_stream(*streams) + return stream[0].claims_unique_tags(trigger_run=trigger_run) diff --git a/src/orcapod/core/pod_legacy.py b/src/orcapod/core/pod_legacy.py new file mode 100644 index 0000000..32c8efb --- /dev/null +++ b/src/orcapod/core/pod_legacy.py @@ -0,0 +1,373 @@ +import logging +import warnings +import sys +from collections.abc import Callable, Collection, Iterable, Iterator +from typing import ( + Any, + Literal, +) + +from orcapod.types import Packet, PathSet, PodFunction, Tag + +from orcapod.hashing import ( + get_function_signature, + hash_function, +) +from orcapod.core.base import Kernel +from orcapod.core.operators import Join +from orcapod.core.streams import SyncStream, SyncStreamFromGenerator +from orcapod.store import DataStore, NoOpDataStore + + +logger = logging.getLogger(__name__) + + +class Pod(Kernel): + """ + An (abstract) base class for all pods. A pod can be seen as a special type of operation that + only operates on the packet content without reading tags. Consequently, no operation + of Pod can dependent on the tags of the packets. This is a design choice to ensure that + the pods act as pure functions which is a necessary condition to guarantee reproducibility. 
+ """ + + def __init__( + self, error_handling: Literal["raise", "ignore", "warn"] = "raise", **kwargs + ): + super().__init__(**kwargs) + self.error_handling = error_handling + self._active = True + + def set_active(self, active=True): + self._active = active + + def is_active(self) -> bool: + return self._active + + def process_stream(self, *streams: SyncStream) -> tuple[SyncStream, ...]: + """ + Prepare the incoming streams for execution in the pod. This default implementation + joins all the streams together and raises and error if no streams are provided. + """ + # if multiple streams are provided, join them + # otherwise, return as is + combined_streams = list(streams) + if len(streams) > 1: + stream = streams[0] + for next_stream in streams[1:]: + stream = Join()(stream, next_stream) + combined_streams = [stream] + return tuple(combined_streams) + + def pre_forward_hook( + self, *streams: SyncStream, **kwargs + ) -> tuple[SyncStream, ...]: + return self.process_stream(*streams) + + def forward(self, *streams: SyncStream) -> SyncStream: + # if multiple streams are provided, join them + if len(streams) > 1: + raise ValueError("Multiple streams should be joined before calling forward") + if len(streams) == 0: + raise ValueError("No streams provided to forward") + stream = streams[0] + + def generator() -> Iterator[tuple[Tag, Packet]]: + n_computed = 0 + for tag, packet in stream: + try: + tag, output_packet = self.call(tag, packet) + if output_packet is None: + logger.info( + f"Call returned None as output for tag {tag}. Skipping..." + ) + continue + n_computed += 1 + logger.info(f"Computed item {n_computed}") + yield tag, output_packet + + except Exception as e: + logger.error(f"Error processing packet {packet}: {e}") + if self.error_handling == "raise": + raise e + elif self.error_handling == "warn": + warnings.warn(f"Error processing packet {packet}: {e}") + continue + elif self.error_handling == "ignore": + continue + else: + raise ValueError( + f"Unknown error handling mode: {self.error_handling} encountered while handling error:" + ) from e + + return SyncStreamFromGenerator(generator) + + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: ... + + +def function_pod( + output_keys: Collection[str] | None = None, + function_name: str | None = None, + data_store: DataStore | None = None, + store_name: str | None = None, + function_hash_mode: Literal["signature", "content", "name", "custom"] = "name", + custom_hash: int | None = None, + force_computation: bool = False, + skip_memoization: bool = False, + error_handling: Literal["raise", "ignore", "warn"] = "raise", + **kwargs, +) -> Callable[..., "FunctionPod"]: + """ + Decorator that wraps a function in a FunctionPod instance. 
+ + Args: + output_keys: Keys for the function output + force_computation: Whether to force computation + skip_memoization: Whether to skip memoization + + Returns: + FunctionPod instance wrapping the decorated function + """ + + def decorator(func) -> FunctionPod: + if func.__name__ == "<lambda>": + raise ValueError("Lambda functions cannot be used with function_pod") + + if not hasattr(func, "__module__") or func.__module__ is None: + raise ValueError( + f"Function {func.__name__} must be defined at module level" + ) + + # Store the original function in the module for pickling purposes + # and make sure to change the name of the function + module = sys.modules[func.__module__] + base_function_name = func.__name__ + new_function_name = f"_original_{func.__name__}" + setattr(module, new_function_name, func) + # rename the function to be consistent and make it pickleable + setattr(func, "__name__", new_function_name) + setattr(func, "__qualname__", new_function_name) + + # Create the FunctionPod + pod = FunctionPod( + function=func, + output_keys=output_keys, + function_name=function_name or base_function_name, + data_store=data_store, + store_name=store_name, + function_hash_mode=function_hash_mode, + custom_hash=custom_hash, + force_computation=force_computation, + skip_memoization=skip_memoization, + error_handling=error_handling, + **kwargs, + ) + + return pod + + return decorator + + +class FunctionPod(Pod): + """ + A pod that wraps a function and allows it to be used as an operation in a stream. + This pod can be used to apply a function to the packets in a stream, with optional memoization + and caching of results. It can also handle multiple output keys and error handling. + The function should accept keyword arguments that correspond to the keys in the packets. 
+ The output of the function should be a path or a collection of paths that correspond to the output keys.""" + + def __init__( + self, + function: PodFunction, + output_keys: Collection[str] | None = None, + function_name=None, + data_store: DataStore | None = None, + store_name: str | None = None, + function_hash_mode: Literal["signature", "content", "name", "custom"] = "name", + custom_hash: int | None = None, + label: str | None = None, + force_computation: bool = False, + skip_memoization_lookup: bool = False, + skip_memoization: bool = False, + error_handling: Literal["raise", "ignore", "warn"] = "raise", + _hash_function_kwargs: dict | None = None, + **kwargs, + ) -> None: + super().__init__(label=label, **kwargs) + self.function = function + self.output_keys = output_keys or [] + if function_name is None: + if hasattr(self.function, "__name__"): + function_name = getattr(self.function, "__name__") + else: + raise ValueError( + "function_name must be provided if function has no __name__ attribute" + ) + + self.function_name = function_name + self.data_store = data_store if data_store is not None else NoOpDataStore() + self.store_name = store_name or function_name + self.function_hash_mode = function_hash_mode + self.custom_hash = custom_hash + self.force_computation = force_computation + self.skip_memoization_lookup = skip_memoization_lookup + self.skip_memoization = skip_memoization + self.error_handling = error_handling + self._hash_function_kwargs = _hash_function_kwargs + + def __repr__(self) -> str: + func_sig = get_function_signature(self.function) + return f"FunctionPod:{func_sig} ⇒ {self.output_keys}" + + def keys( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[Collection[str] | None, Collection[str] | None]: + stream = self.process_stream(*streams) + tag_keys, _ = stream[0].keys(trigger_run=trigger_run) + return tag_keys, tuple(self.output_keys) + + def is_memoized(self, packet: Packet) -> bool: + return self.retrieve_memoized(packet) is not None + + def retrieve_memoized(self, packet: Packet) -> Packet | None: + """ + Retrieve a memoized packet from the data store. + Returns None if no memoized packet is found. + """ + return self.data_store.retrieve_memoized( + self.store_name, + self.content_hash(char_count=16), + packet, + ) + + def memoize( + self, + packet: Packet, + output_packet: Packet, + ) -> Packet: + """ + Memoize the output packet in the data store. + Returns the memoized packet. 
+ """ + return self.data_store.memoize( + self.store_name, + self.content_hash(char_count=16), # identity of this function pod + packet, + output_packet, + ) + + def forward(self, *streams: SyncStream) -> SyncStream: + # if multiple streams are provided, join them + if len(streams) > 1: + raise ValueError("Multiple streams should be joined before calling forward") + if len(streams) == 0: + raise ValueError("No streams provided to forward") + stream = streams[0] + + def generator() -> Iterator[tuple[Tag, Packet]]: + n_computed = 0 + for tag, packet in stream: + output_values: list["PathSet"] = [] + try: + if not self.skip_memoization_lookup: + memoized_packet = self.retrieve_memoized(packet) + else: + memoized_packet = None + if not self.force_computation and memoized_packet is not None: + logger.info("Memoized packet found, skipping computation") + yield tag, memoized_packet + continue + if not self.is_active(): + logger.info( + "Pod is not active: skipping computation of a new entry" + ) + continue + values = self.function(**packet) + + if len(self.output_keys) == 0: + output_values = [] + elif len(self.output_keys) == 1: + output_values = [values] # type: ignore + elif isinstance(values, Iterable): + output_values = list(values) # type: ignore + elif len(self.output_keys) > 1: + raise ValueError( + "Values returned by function must be a pathlike or a sequence of pathlikes" + ) + + if len(output_values) != len(self.output_keys): + raise ValueError( + f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" + ) + except Exception as e: + logger.error(f"Error processing packet {packet}: {e}") + if self.error_handling == "raise": + raise e + elif self.error_handling == "ignore": + continue + elif self.error_handling == "warn": + warnings.warn(f"Error processing packet {packet}: {e}") + continue + else: + raise ValueError( + f"Unknown error handling mode: {self.error_handling} encountered while handling error:" + ) from e + + output_packet: Packet = { + k: v for k, v in zip(self.output_keys, output_values) + } + + if not self.skip_memoization: + # output packet may be modified by the memoization process + # e.g. 
if the output is a file, the path may be changed + output_packet = self.memoize(packet, output_packet) # type: ignore + + n_computed += 1 + logger.info(f"Computed item {n_computed}") + yield tag, output_packet + + return SyncStreamFromGenerator(generator) + + def identity_structure(self, *streams) -> Any: + content_kwargs = self._hash_function_kwargs + if self.function_hash_mode == "content": + if content_kwargs is None: + content_kwargs = { + "include_name": False, + "include_module": False, + "include_declaration": False, + } + function_hash_value = hash_function( + self.function, + name_override=self.function_name, + function_hash_mode="content", + content_kwargs=content_kwargs, + ) + elif self.function_hash_mode == "signature": + function_hash_value = hash_function( + self.function, + name_override=self.function_name, + function_hash_mode="signature", + content_kwargs=content_kwargs, + ) + elif self.function_hash_mode == "name": + function_hash_value = hash_function( + self.function, + name_override=self.function_name, + function_hash_mode="name", + content_kwargs=content_kwargs, + ) + elif self.function_hash_mode == "custom": + if self.custom_hash is None: + raise ValueError("Custom hash function not provided") + function_hash_value = self.custom_hash + else: + raise ValueError( + f"Unknown function hash mode: {self.function_hash_mode}. " + "Must be one of 'content', 'signature', 'name', or 'custom'." + ) + + return ( + self.__class__.__name__, + function_hash_value, + tuple(self.output_keys), + ) + tuple(streams) diff --git a/src/orcapod/core/tracker.py b/src/orcapod/core/tracker.py index efc2c42..2532582 100644 --- a/src/orcapod/core/tracker.py +++ b/src/orcapod/core/tracker.py @@ -1,5 +1,39 @@ -from orcapod.core.base import Invocation, Kernel, Tracker +from orcapod.core.base import Invocation, Kernel, Tracker, SyncStream, TypeSpec +from collections.abc import Collection +from typing import Any +class StubKernel(Kernel): + def __init__(self, stream: SyncStream, **kwargs): + super().__init__(skip_tracking=True, **kwargs) + self.stream = stream + + def forward(self, *streams: SyncStream) -> SyncStream: + if len(streams) != 0: + raise ValueError( + "StubKernel does not support forwarding streams. " + "It generates its own stream from the file system." + ) + return self.stream + + def identity_structure(self, *streams) -> Any: + if len(streams) != 0: + raise ValueError( + "StubKernel does not support forwarding streams. " + "It generates its own stream from the file system." 
+ ) + + return (self.__class__.__name__, self.stream) + + def types(self, *streams: SyncStream, **kwargs) -> tuple[TypeSpec|None, TypeSpec|None]: + return self.stream.types() + + def keys(self, *streams: SyncStream, **kwargs) -> tuple[Collection[str]|None, Collection[str]|None]: + return self.stream.keys() + + + + + class GraphTracker(Tracker): """ @@ -44,6 +78,7 @@ def generate_namemap(self) -> dict[Invocation, str]: def generate_graph(self): import networkx as nx + G = nx.DiGraph() # Add edges for each invocation @@ -51,15 +86,20 @@ def generate_graph(self): for invocation in invocations: for upstream in invocation.streams: # if upstream.invocation is not in the graph, add it - if upstream.invocation not in G: - G.add_node(upstream.invocation) - G.add_edge(upstream.invocation, invocation, stream=upstream) + upstream_invocation = upstream.invocation + if upstream_invocation is None: + # If upstream is None, create a stub kernel + upstream_invocation = Invocation(StubKernel(upstream, label="StubInput"), []) + if upstream_invocation not in G: + G.add_node(upstream_invocation) + G.add_edge(upstream_invocation, invocation, stream=upstream) return G def draw_graph(self): import networkx as nx import matplotlib.pyplot as plt + G = self.generate_graph() labels = self.generate_namemap() diff --git a/src/orcapod/dj/pod.py b/src/orcapod/dj/pod.py index 815b2dc..7101090 100644 --- a/src/orcapod/dj/pod.py +++ b/src/orcapod/dj/pod.py @@ -5,7 +5,7 @@ from datajoint import Schema from datajoint.table import Table -from ..pod import FunctionPod, Pod +from orcapod.core.pod import FunctionPod, Pod from ..utils.name import pascal_to_snake, snake_to_pascal from .mapper import JoinQuery from .operation import QueryOperation diff --git a/src/orcapod/dj/tracker.py b/src/orcapod/dj/tracker.py index 24df900..3276ba9 100644 --- a/src/orcapod/dj/tracker.py +++ b/src/orcapod/dj/tracker.py @@ -8,7 +8,7 @@ from orcapod.core.base import Kernel, Source from orcapod.core.operators import Operator, Merge -from orcapod.pod import FunctionPod +from orcapod.core.pod import FunctionPod from orcapod.core.tracker import GraphTracker from .mapper import convert_to_query_mapper diff --git a/src/orcapod/hashing/core.py b/src/orcapod/hashing/core.py index c711f63..66b4e4d 100644 --- a/src/orcapod/hashing/core.py +++ b/src/orcapod/hashing/core.py @@ -5,7 +5,7 @@ A library for creating stable, content-based hashes that remain consistent across Python sessions, suitable for arbitrarily nested data structures and custom objects via HashableMixin. """ - +WARN_NONE_IDENTITY=False import hashlib import inspect import json @@ -175,11 +175,12 @@ def content_hash(self, char_count: Optional[int] = 16) -> str: # If no custom structure is provided, use the class name # We avoid using id() since it's not stable across sessions if structure is None: - logger.warning( - f"HashableMixin.content_hash called on {self.__class__.__name__} " - "instance that returned identity_structure() of None. " - "Using class name as default identity, which may not correctly reflect object uniqueness." - ) + if WARN_NONE_IDENTITY: + logger.warning( + f"HashableMixin.content_hash called on {self.__class__.__name__} " + "instance that returned identity_structure() of None. " + "Using class name as default identity, which may not correctly reflect object uniqueness." 
+ ) # Fall back to class name for consistent behavior return f"HashableMixin-DefaultIdentity-{self.__class__.__name__}" @@ -205,11 +206,12 @@ def content_hash_int(self, hexdigits: int = 16) -> int: # If no custom structure is provided, use the class name # We avoid using id() since it's not stable across sessions if structure is None: - logger.warning( - f"HashableMixin.content_hash_int called on {self.__class__.__name__} " - "instance without identity_structure() implementation. " - "Using class name as default identity, which may not correctly reflect object uniqueness." - ) + if WARN_NONE_IDENTITY: + logger.warning( + f"HashableMixin.content_hash_int called on {self.__class__.__name__} " + "instance that returned identity_structure() of None. " + "Using class name as default identity, which may not correctly reflect object uniqueness." + ) # Use the same default identity as content_hash for consistency default_identity = ( f"HashableMixin-DefaultIdentity-{self.__class__.__name__}" @@ -235,11 +237,12 @@ def content_hash_uuid(self) -> UUID: # If no custom structure is provided, use the class name # We avoid using id() since it's not stable across sessions if structure is None: - logger.warning( - f"HashableMixin.content_hash_uuid called on {self.__class__.__name__} " - "instance without identity_structure() implementation. " - "Using class name as default identity, which may not correctly reflect object uniqueness." - ) + if WARN_NONE_IDENTITY: + logger.warning( + f"HashableMixin.content_hash_uuid called on {self.__class__.__name__} " + "instance without identity_structure() implementation. " + "Using class name as default identity, which may not correctly reflect object uniqueness." + ) # Use the same default identity as content_hash for consistency default_identity = ( f"HashableMixin-DefaultIdentity-{self.__class__.__name__}" diff --git a/src/orcapod/hashing/hashing_legacy.py b/src/orcapod/hashing/hashing_legacy.py deleted file mode 100644 index 353a4f9..0000000 --- a/src/orcapod/hashing/hashing_legacy.py +++ /dev/null @@ -1,269 +0,0 @@ -# # a function to hash a dictionary of key value pairs into uuid -# from collections.abc import Collection, Mapping -# import hashlib -# import uuid -# from uuid import UUID -# from typing import Any, Dict, Optional, Union -# import inspect -# import json - -# import hashlib - -# # arbitrary depth of nested dictionaries -# T = Dict[str, Union[str, "T"]] - - -# # TODO: implement proper recursive hashing - - -# def hash_dict(d: T) -> UUID: -# # Convert the dictionary to a string representation -# dict_str = str(sorted(d.items())) - -# # Create a hash of the string representation -# hash_object = hashlib.sha256(dict_str.encode("utf-8")) - -# # Convert the hash to a UUID -# hash_uuid = uuid.UUID(hash_object.hexdigest()) - -# return hash_uuid - - -# def stable_hash(s): -# """Create a stable hash that returns the same integer value across sessions.""" -# # Convert input to bytes if it's not already -# if not isinstance(s, bytes): -# s = str(s).encode("utf-8") - -# hash_hex = hashlib.sha256(s).hexdigest() -# return int(hash_hex[:16], 16) - - -# def hash_function(function, function_hash_mode: str = "content", hasher_kwargs=None) -> str: -# """ -# Hash a function based on its content, signature, or name. 
-# -# Args: -# function: The function to hash -# function_hash_mode: The mode of hashing ('content', 'signature', 'name') -# function_name: Optional name for the function (if not provided, uses function's __name__) - -# Returns: -# A string representing the hash of the function -# """ -# if hasher_kwargs is None: -# hasher_kwargs = {} - -# if function_hash_mode == "content": -# function_hash = function_content_hash(function, **hasher_kwargs) -# elif function_hash_mode == "signature": -# function_hash = stable_hash(get_function_signature(function, **hasher_kwargs)) -# elif function_hash_mode == "name": -# function_hash = stable_hash(function.__name__) - -# return function_hash - - -# def function_content_hash( -# func, exclude_name=False, exclude_module=False, exclude_declaration=False, return_components=False -# ): -# """ -# Compute a hash based on the function's source code, name, module, and closure variables. -# """ -# components = [] - -# # Add function name -# if not exclude_name: -# components.append(f"name:{func.__name__}") - -# # Add module -# if not exclude_module: -# components.append(f"module:{func.__module__}") - -# # Get the function's source code -# try: -# source = inspect.getsource(func) -# # Clean up the source code -# source = source.strip() -# # Remove the function definition line -# if exclude_declaration: -# # find the line that starts with def and remove it -# # TODO: consider dealing with more sophisticated cases like decorators -# source = "\n".join(line for line in source.split("\n") if not line.startswith("def ")) -# components.append(f"source:{source}") -# except (IOError, TypeError): -# # If we can't get the source (e.g., built-in function), use the function's string representation -# components.append(f"repr:{repr(func)}") - -# # Add closure variables if any -# if func.__closure__: -# closure_values = [] -# for cell in func.__closure__: -# # Try to get a stable representation of the cell content -# try: -# # For simple immutable objects -# if isinstance(cell.cell_contents, (int, float, str, bool, type(None))): -# closure_values.append(repr(cell.cell_contents)) -# # For other objects, we'll use their string representation -# else: -# closure_values.append(str(cell.cell_contents)) -# except: -# # If we can't get a stable representation, use the cell's id -# closure_values.append(f"cell_id:{id(cell)}") - -# components.append(f"closure:{','.join(closure_values)}") - -# # Add function attributes that affect behavior -# if hasattr(func, "__defaults__") and func.__defaults__: -# defaults_str = ",".join(repr(d) for d in func.__defaults__) -# components.append(f"defaults:{defaults_str}") - -# if hasattr(func, "__kwdefaults__") and func.__kwdefaults__: -# kwdefaults_str = ",".join(f"{k}={repr(v)}" for k, v in func.__kwdefaults__.items()) -# components.append(f"kwdefaults:{kwdefaults_str}") - -# # Function's code object properties (excluding filename and line numbers) -# code = func.__code__ -# code_props = { -# "co_argcount": code.co_argcount, -# "co_posonlyargcount": getattr(code, "co_posonlyargcount", 0), # Python 3.8+ -# "co_kwonlyargcount": code.co_kwonlyargcount, -# "co_nlocals": code.co_nlocals, -# "co_stacksize": code.co_stacksize, -# "co_flags": code.co_flags, -# "co_code": code.co_code, -# "co_names": code.co_names, -# "co_varnames": code.co_varnames, -# } -# components.append(f"code_properties:{repr(code_props)}") -# if return_components: -# return components - -# # Join all components and compute hash -# combined = "\n".join(components) -# return 
hashlib.sha256(combined.encode("utf-8")).hexdigest() - - -# class HashableMixin: -# """A mixin that provides content-based hashing functionality.""" - -# def identity_structure(self) -> Any: -# """ -# Return a structure that represents the identity of this object. -# By default, returns None to indicate that no custom structure is provided. -# Subclasses should override this method to provide meaningful representations. - -# Returns: -# None to indicate no custom structure (use default hash) -# """ -# return None - -# def content_hash(self, char_count: Optional[int] = 16) -> str: -# """ -# Generate a stable string hash based on the object's content. - -# Returns: -# str: A hexadecimal digest representing the object's content -# """ -# # Get the identity structure -# structure = self.identity_structure() - -# # TODO: consider returning __hash__ based value if structure is None - -# # Generate a hash from the identity structure -# return self._hash_structure(structure, char_count=char_count) - -# def content_hash_int(self, hexdigits=16) -> int: -# """ -# Generate a stable integer hash based on the object's content. - -# Returns: -# int: An integer representing the object's content -# """ -# return int(self.content_hash(char_count=None)[:hexdigits], 16) - -# def __hash__(self) -> int: -# """ -# Hash implementation that uses the identity structure if provided, -# otherwise falls back to the superclass's hash method. - -# Returns: -# int: A hash value based on either content or identity -# """ -# # Get the identity structure -# structure = self.identity_structure() - -# # If no custom structure is provided, use the superclass's hash -# if structure is None: -# return super().__hash__() - -# # Generate a hash and convert to integer -# hash_hex = self._hash_structure(structure, char_count=None) -# return int(hash_hex[:16], 16) - -# def _hash_structure(self, structure: Any, char_count: Optional[int] = 16) -> str: -# """ -# Helper method to compute a hash string from a structure. - -# Args: -# structure: The structure to hash - -# Returns: -# str: A hexadecimal hash digest of the structure -# """ -# processed = self._process_structure(structure) -# json_str = json.dumps(processed, sort_keys=True).encode() -# return hashlib.sha256(json_str).hexdigest()[:char_count] - -# def _process_structure(self, obj: Any) -> Any: -# """ -# Recursively process a structure to prepare it for hashing. 
- -# Args: -# obj: The object or structure to process - -# Returns: -# A processed version of the structure with HashableMixin objects replaced by their hashes -# """ -# # Handle None -# if obj is None: -# return "None" - -# # If the object is a HashableMixin, use its content_hash -# if isinstance(obj, HashableMixin): -# # Don't call content_hash on self to avoid cycles -# if obj is self: -# # TODO: carefully consider this case -# # Use the superclass's hash for self -# return str(super(HashableMixin, self).__hash__()) -# return obj.content_hash() - -# # Handle basic types -# if isinstance(obj, (str, int, float, bool)): -# return str(obj) - -# # Handle named tuples (which are subclasses of tuple) -# if hasattr(obj, "_fields") and isinstance(obj, tuple): -# # For namedtuples, convert to dict and then process -# return self._process_structure({field: value for field, value in zip(obj._fields, obj)}) - -# # Handle mappings (dict-like objects) -# if isinstance(obj, Mapping): -# return {str(k): self._process_structure(v) for k, v in sorted(obj.items(), key=lambda x: str(x[0]))} - -# # Handle sets and frozensets specifically -# if isinstance(obj, (set, frozenset)): -# # Process each item first, then sort the processed results -# processed_items = [self._process_structure(item) for item in obj] -# return sorted(processed_items, key=str) - -# # Handle collections (list-like objects) -# if isinstance(obj, Collection): -# return [self._process_structure(item) for item in obj] - -# # For bytes and bytearray, convert to hex representation -# if isinstance(obj, (bytes, bytearray)): -# return obj.hex() - -# # For other objects, just use their string representation -# return str(obj) diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index 5e8b07c..36155bb 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -137,6 +137,6 @@ def extract_function_info( self, func: Callable[..., Any], function_name: str | None = None, - input_types: TypeSpec | None = None, - output_types: TypeSpec | None = None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | None = None, ) -> dict[str, Any]: ... diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index f160f2b..5df050f 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -1,3 +1,5 @@ +from collections import defaultdict +from collections.abc import Collection, Iterator import json import logging import pickle @@ -7,12 +9,21 @@ from pathlib import Path from typing import Any, Protocol, runtime_checkable -import networkx as nx import pandas as pd -from orcapod.core.base import Invocation, Kernel +from orcapod.core import Invocation, Kernel, SyncStream +from orcapod.core.pod import FunctionPod +from orcapod.pipeline.wrappers import KernelNode, FunctionPodNode, Node + from orcapod.hashing import hash_to_hex from orcapod.core.tracker import GraphTracker +from orcapod.hashing import ObjectHasher, ArrowHasher +from orcapod.types import TypeSpec, Tag, Packet +from orcapod.core.streams import SyncStreamFromGenerator +from orcapod.store import ArrowDataStore +from orcapod.types.registry import PacketConverter, TypeRegistry +from orcapod.types import default_registry +from orcapod.utils.stream_utils import merge_typespecs, get_typespec logger = logging.getLogger(__name__) @@ -29,12 +40,12 @@ class Pipeline(GraphTracker): Replaces the old Tracker with better persistence and view capabilities. 
""" - def __init__(self, name: str | None = None): + def __init__(self, name: str, results_store: ArrowDataStore, pipeline_store: ArrowDataStore) -> None: super().__init__() self.name = name or f"pipeline_{id(self)}" - self._view_registry: dict[str, "PipelineView"] = {} - self._cache_dir = Path(".pipeline_cache") / self.name - self._cache_dir.mkdir(parents=True, exist_ok=True) + self.results_store = results_store + self.pipeline_store = pipeline_store + self.labels_to_nodes = {} # Core Pipeline Operations def save(self, path: Path | str) -> None: @@ -66,6 +77,62 @@ def save(self, path: Path | str) -> None: temp_path.unlink() raise + def wrap_invocation( + self, kernel: Kernel, input_nodes: Collection[Node] + ) -> Node: + if isinstance(kernel, FunctionPod): + return FunctionPodNode(kernel, input_nodes, output_store=self.results_store, tag_store=self.pipeline_store) + return KernelNode(kernel, input_nodes, output_store=self.pipeline_store) + + def compile(self): + import networkx as nx + G = self.generate_graph() + + # Proposed labels for each Kernel in the graph + # If name collides, unique name is generated by appending an index + proposed_labels = defaultdict(list) + node_lut = {} + edge_lut : dict[SyncStream, Node]= {} + for invocation in nx.topological_sort(G): + # map streams to the new streams based on Nodes + input_nodes = [edge_lut[stream] for stream in invocation.streams] + new_node = self.wrap_invocation(invocation.kernel, input_nodes) + + # register the new node against the original invocation + node_lut[invocation] = new_node + # register the new node in the proposed labels -- if duplicates occur, will resolve later + proposed_labels[new_node.label].append(new_node) + + for edge in G.out_edges(invocation): + edge_lut[G.edges[edge]["stream"]] = new_node + + # resolve duplicates in proposed_labels + labels_to_nodes = {} + for label, nodes in proposed_labels.items(): + if len(nodes) > 1: + # If multiple nodes have the same label, append index to make it unique + for idx, node in enumerate(nodes): + node.label = f"{label}_{idx}" + labels_to_nodes[node.label] = node + else: + # If only one node, keep the original label + nodes[0].label = label + labels_to_nodes[label] = nodes[0] + + self.labels_to_nodes = labels_to_nodes + return node_lut, edge_lut, proposed_labels, labels_to_nodes + + def __getattr__(self, item: str) -> Any: + """Allow direct access to pipeline attributes""" + if item in self.labels_to_nodes: + return self.labels_to_nodes[item] + raise AttributeError(f"Pipeline has no attribute '{item}'") + + def __dir__(self): + # Include both regular attributes and dynamic ones + return list(super().__dir__()) + list(self.labels_to_nodes.keys()) + + @classmethod def load(cls, path: Path | str) -> "Pipeline": """Load complete pipeline state""" @@ -74,7 +141,7 @@ def load(cls, path: Path | str) -> "Pipeline": with open(path, "rb") as f: state = pickle.load(f) - pipeline = cls(state["name"]) + pipeline = cls(state["name"], state["output_store"]) pipeline.invocation_lut = state["invocation_lut"] logger.info(f"Pipeline '{pipeline.name}' loaded from {path}") @@ -103,628 +170,5 @@ def _validate_serializable(self) -> None: + "\n".join(f" - {issue}" for issue in issues) + "\n\nOnly named functions are supported for serialization." 
) + - # View Management - def as_view( - self, renderer: "ViewRenderer", view_id: str | None = None, **kwargs - ) -> "PipelineView": - """Get a view of this pipeline using the specified renderer""" - view_id = ( - view_id - or f"{renderer.__class__.__name__.lower()}_{len(self._view_registry)}" - ) - - if view_id not in self._view_registry: - self._view_registry[view_id] = renderer.create_view( - self, view_id=view_id, **kwargs - ) - return self._view_registry[view_id] - - def as_dataframe(self, view_id: str = "default", **kwargs) -> "PandasPipelineView": - """Convenience method for pandas DataFrame view""" - return self.as_view(PandasViewRenderer(), view_id=view_id, **kwargs) - - def as_graph(self) -> nx.DiGraph: - """Get the computation graph""" - return self.generate_graph() - - # Combined save/load with views - def save_with_views(self, base_path: Path | str) -> dict[str, Path]: - """Save pipeline and all its views together""" - base_path = Path(base_path) - base_path.mkdir(parents=True, exist_ok=True) - - saved_files = {} - - # Save pipeline itself - pipeline_path = base_path / "pipeline.pkl" - self.save(pipeline_path) - saved_files["pipeline"] = pipeline_path - - # Save all views - for view_id, view in self._view_registry.items(): - view_path = base_path / f"view_{view_id}.pkl" - view.save(view_path, include_pipeline=False) - saved_files[f"view_{view_id}"] = view_path - - # Save manifest - manifest = { - "pipeline_file": "pipeline.pkl", - "views": { - view_id: f"view_{view_id}.pkl" for view_id in self._view_registry.keys() - }, - "created_at": time.time(), - "pipeline_name": self.name, - } - - manifest_path = base_path / "manifest.json" - with open(manifest_path, "w") as f: - json.dump(manifest, f, indent=2) - saved_files["manifest"] = manifest_path - - return saved_files - - @classmethod - def load_with_views( - cls, base_path: Path | str - ) -> tuple["Pipeline", dict[str, "PipelineView"]]: - """Load pipeline and all its views""" - base_path = Path(base_path) - - # Load manifest - manifest_path = base_path / "manifest.json" - with open(manifest_path, "r") as f: - manifest = json.load(f) - - # Load pipeline - pipeline_path = base_path / manifest["pipeline_file"] - pipeline = cls.load(pipeline_path) - - # Load views with appropriate renderers - renderers = { - "PandasPipelineView": PandasViewRenderer(), - "DataJointPipelineView": DataJointViewRenderer(None), # Would need schema - } - - views = {} - for view_id, view_file in manifest["views"].items(): - view_path = base_path / view_file - - # Load view data to determine type - with open(view_path, "rb") as f: - view_data = pickle.load(f) - - # Find appropriate renderer - view_type = view_data.get("view_type", "PandasPipelineView") - if view_type in renderers and renderers[view_type].can_load_view(view_data): - # Load with appropriate view class - if view_type == "PandasPipelineView": - view = PandasPipelineView.load(view_path, pipeline) - else: - view = DataJointPipelineView.load(view_path, pipeline) - else: - # Default to pandas view - view = PandasPipelineView.load(view_path, pipeline) - - views[view_id] = view - pipeline._view_registry[view_id] = view - - return pipeline, views - - def get_stats(self) -> dict[str, Any]: - """Get pipeline statistics""" - total_operations = len(self.invocation_lut) - total_invocations = sum(len(invs) for invs in self.invocation_lut.values()) - - operation_types = {} - for operation in self.invocation_lut.keys(): - op_type = operation.__class__.__name__ - operation_types[op_type] = 
operation_types.get(op_type, 0) + 1 - - return { - "name": self.name, - "total_operations": total_operations, - "total_invocations": total_invocations, - "operation_types": operation_types, - "views": list(self._view_registry.keys()), - } - - -# View Renderer Protocol -@runtime_checkable -class ViewRenderer(Protocol): - """Protocol for all view renderers - uses structural typing""" - - def create_view( - self, pipeline: "Pipeline", view_id: str, **kwargs - ) -> "PipelineView": - """Create a view for the given pipeline""" - ... - - def can_load_view(self, view_data: dict[str, Any]) -> bool: - """Check if this renderer can load the given view data""" - ... - - -class PandasViewRenderer: - """Renderer for pandas DataFrame views""" - - def create_view( - self, pipeline: "Pipeline", view_id: str, **kwargs - ) -> "PandasPipelineView": - return PandasPipelineView(pipeline, view_id=view_id, **kwargs) - - def can_load_view(self, view_data: dict[str, Any]) -> bool: - return view_data.get("view_type") == "PandasPipelineView" - - -class DataJointViewRenderer: - """Renderer for DataJoint views""" - - def __init__(self, schema): - self.schema = schema - - def create_view( - self, pipeline: "Pipeline", view_id: str, **kwargs - ) -> "DataJointPipelineView": - return DataJointPipelineView(pipeline, self.schema, view_id=view_id, **kwargs) - - def can_load_view(self, view_data: dict[str, Any]) -> bool: - return view_data.get("view_type") == "DataJointPipelineView" - - -# Base class for all views -class PipelineView(ABC): - """Base class for all pipeline views""" - - def __init__(self, pipeline: Pipeline, view_id: str): - self.pipeline = pipeline - self.view_id = view_id - self._cache_dir = pipeline._cache_dir / "views" - self._cache_dir.mkdir(parents=True, exist_ok=True) - - @abstractmethod - def save(self, path: Path | str, include_pipeline: bool = True) -> None: - """Save the view""" - pass - - @classmethod - @abstractmethod - def load(cls, path: Path | str, pipeline: Pipeline | None = None) -> "PipelineView": - """Load the view""" - pass - - def _compute_pipeline_hash(self) -> str: - """Compute hash of current pipeline state for validation""" - pipeline_state = [] - for operation, invocations in self.pipeline.invocation_lut.items(): - for invocation in invocations: - pipeline_state.append(invocation.content_hash()) - return hash_to_hex(sorted(pipeline_state)) - - -# Pandas DataFrame-like view -class PandasPipelineView(PipelineView): - """ - Provides a pandas DataFrame-like interface to pipeline metadata. - Focuses on tag information for querying and filtering. 
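For reference on the view API being removed in this hunk: because ViewRenderer is a runtime_checkable Protocol, any object providing the two methods below satisfies it structurally, without inheriting from it. A sketch (class name and method bodies are illustrative):

class MinimalRenderer:
    def create_view(self, pipeline, view_id, **kwargs):
        return PandasPipelineView(pipeline, view_id=view_id, **kwargs)

    def can_load_view(self, view_data):
        return view_data.get("view_type") == "PandasPipelineView"

assert isinstance(MinimalRenderer(), ViewRenderer)  # structural check, no subclassing required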
- """ - - def __init__( - self, - pipeline: Pipeline, - view_id: str = "pandas_view", - max_records: int = 10000, - sample_size: int = 100, - ): - super().__init__(pipeline, view_id) - self.max_records = max_records - self.sample_size = sample_size - self._cached_data: pd.DataFrame | None = None - self._build_options = {"max_records": max_records, "sample_size": sample_size} - self._hash_to_data_map: dict[str, Any] = {} - - @property - def df(self) -> pd.DataFrame: - """Access the underlying DataFrame, building if necessary""" - if self._cached_data is None: - # Try to load from cache first - cache_path = self._cache_dir / f"{self.view_id}.pkl" - if cache_path.exists(): - try: - loaded_view = self.load(cache_path, self.pipeline) - if self._is_cache_valid(loaded_view): - self._cached_data = loaded_view._cached_data - self._hash_to_data_map = loaded_view._hash_to_data_map - logger.info(f"Loaded view '{self.view_id}' from cache") - return self._cached_data - except Exception as e: - logger.warning(f"Failed to load cached view: {e}") - - # Build from scratch - logger.info(f"Building view '{self.view_id}' from pipeline") - self._cached_data = self._build_metadata() - - # Auto-save after building - try: - self.save(cache_path, include_pipeline=False) - except Exception as e: - logger.warning(f"Failed to cache view: {e}") - - return self._cached_data - - def _build_metadata(self) -> pd.DataFrame: - """Build the metadata DataFrame from pipeline operations""" - metadata_records = [] - total_records = 0 - - for operation, invocations in self.pipeline.invocation_lut.items(): - if total_records >= self.max_records: - logger.warning(f"Hit max_records limit ({self.max_records})") - break - - for invocation in invocations: - try: - # Get sample of outputs, not all - records = self._extract_metadata_from_invocation( - invocation, operation - ) - for record in records: - metadata_records.append(record) - total_records += 1 - if total_records >= self.max_records: - break - - if total_records >= self.max_records: - break - - except Exception as e: - logger.warning(f"Skipping {operation.__class__.__name__}: {e}") - # Create placeholder record - placeholder = self._create_placeholder_record(invocation, operation) - metadata_records.append(placeholder) - total_records += 1 - - if not metadata_records: - # Return empty DataFrame with basic structure - return pd.DataFrame( - columns=[ - "operation_name", - "operation_hash", - "invocation_id", - "created_at", - "packet_keys", - ] - ) - - return pd.DataFrame(metadata_records) - - def _extract_metadata_from_invocation( - self, invocation: Invocation, operation: Kernel - ) -> list[dict[str, Any]]: - """Extract metadata records from a single invocation""" - records = [] - - # Try to get sample outputs from the invocation - try: - # This is tricky - we need to reconstruct the output stream - # For now, we'll create a basic record from what we know - base_record = { - "operation_name": operation.label or operation.__class__.__name__, - "operation_hash": invocation.content_hash(), - "invocation_id": hash(invocation), - "created_at": time.time(), - "operation_type": operation.__class__.__name__, - } - - # Try to get tag and packet info from the operation - try: - tag_keys, packet_keys = invocation.keys() - base_record.update( - { - "tag_keys": list(tag_keys) if tag_keys else [], - "packet_keys": list(packet_keys) if packet_keys else [], - } - ) - except Exception: - base_record.update( - { - "tag_keys": [], - "packet_keys": [], - } - ) - - records.append(base_record) - - 
except Exception as e: - logger.debug(f"Could not extract detailed metadata from {operation}: {e}") - records.append(self._create_placeholder_record(invocation, operation)) - - return records - - def _create_placeholder_record( - self, invocation: Invocation, operation: Kernel - ) -> dict[str, Any]: - """Create a placeholder record when extraction fails""" - return { - "operation_name": operation.label or operation.__class__.__name__, - "operation_hash": invocation.content_hash(), - "invocation_id": hash(invocation), - "created_at": time.time(), - "operation_type": operation.__class__.__name__, - "tag_keys": [], - "packet_keys": [], - "is_placeholder": True, - } - - # DataFrame-like interface - def __getitem__(self, condition) -> "FilteredPipelineView": - """Enable pandas-like filtering: view[condition]""" - df = self.df - if isinstance(condition, pd.Series): - filtered_df = df[condition] - elif callable(condition): - filtered_df = df[condition(df)] - else: - filtered_df = df[condition] - - return FilteredPipelineView(self.pipeline, filtered_df, self._hash_to_data_map) - - def query(self, expr: str) -> "FilteredPipelineView": - """SQL-like querying: view.query('operation_name == "MyOperation"')""" - df = self.df - filtered_df = df.query(expr) - return FilteredPipelineView(self.pipeline, filtered_df, self._hash_to_data_map) - - def groupby(self, *args, **kwargs) -> "GroupedPipelineView": - """Group operations similar to pandas groupby""" - df = self.df - grouped = df.groupby(*args, **kwargs) - return GroupedPipelineView(self.pipeline, grouped, self._hash_to_data_map) - - def head(self, n: int = 5) -> pd.DataFrame: - """Return first n rows""" - return self.df.head(n) - - def info(self) -> None: - """Display DataFrame info""" - return self.df.info() - - def describe(self) -> pd.DataFrame: - """Generate descriptive statistics""" - return self.df.describe() - - # Persistence methods - def save(self, path: Path | str, include_pipeline: bool = True) -> None: - """Save view, optionally with complete pipeline state""" - path = Path(path) - - # Build the view data if not cached - df = self.df - - view_data = { - "view_id": self.view_id, - "view_type": self.__class__.__name__, - "dataframe": df, - "build_options": self._build_options, - "hash_to_data_map": self._hash_to_data_map, - "created_at": time.time(), - "pipeline_hash": self._compute_pipeline_hash(), - } - - if include_pipeline: - view_data["pipeline_state"] = { - "name": self.pipeline.name, - "invocation_lut": self.pipeline.invocation_lut, - } - view_data["has_pipeline"] = True - else: - view_data["pipeline_name"] = self.pipeline.name - view_data["has_pipeline"] = False - - with open(path, "wb") as f: - pickle.dump(view_data, f, protocol=pickle.HIGHEST_PROTOCOL) - - @classmethod - def load( - cls, path: Path | str, pipeline: Pipeline | None = None - ) -> "PandasPipelineView": - """Load view, reconstructing pipeline if needed""" - with open(path, "rb") as f: - view_data = pickle.load(f) - - # Handle pipeline reconstruction - if view_data["has_pipeline"]: - pipeline = Pipeline(view_data["pipeline_state"]["name"]) - pipeline.invocation_lut = view_data["pipeline_state"]["invocation_lut"] - elif pipeline is None: - raise ValueError( - "View was saved without pipeline state. " - "You must provide a pipeline parameter." 
- ) - - # Reconstruct view - build_options = view_data.get("build_options", {}) - view = cls( - pipeline, - view_id=view_data["view_id"], - max_records=build_options.get("max_records", 10000), - sample_size=build_options.get("sample_size", 100), - ) - view._cached_data = view_data["dataframe"] - view._hash_to_data_map = view_data.get("hash_to_data_map", {}) - - return view - - def _is_cache_valid(self, cached_view: "PandasPipelineView") -> bool: - """Check if cached view is still valid""" - try: - cached_hash = getattr(cached_view, "_pipeline_hash", None) - current_hash = self._compute_pipeline_hash() - return cached_hash == current_hash - except Exception: - return False - - def invalidate(self) -> None: - """Force re-rendering on next access""" - self._cached_data = None - cache_path = self._cache_dir / f"{self.view_id}.pkl" - if cache_path.exists(): - cache_path.unlink() - - -class FilteredPipelineView: - """Represents a filtered subset of pipeline metadata""" - - def __init__( - self, pipeline: Pipeline, filtered_df: pd.DataFrame, data_map: dict[str, Any] - ): - self.pipeline = pipeline - self.df = filtered_df - self._data_map = data_map - - def __getitem__(self, condition): - """Further filtering""" - further_filtered = self.df[condition] - return FilteredPipelineView(self.pipeline, further_filtered, self._data_map) - - def query(self, expr: str): - """Apply additional query""" - further_filtered = self.df.query(expr) - return FilteredPipelineView(self.pipeline, further_filtered, self._data_map) - - def to_pandas(self) -> pd.DataFrame: - """Convert to regular pandas DataFrame""" - return self.df.copy() - - def head(self, n: int = 5) -> pd.DataFrame: - """Return first n rows""" - return self.df.head(n) - - def __len__(self) -> int: - return len(self.df) - - def __repr__(self) -> str: - return f"FilteredPipelineView({len(self.df)} records)" - - -class GroupedPipelineView: - """Represents grouped pipeline metadata""" - - def __init__(self, pipeline: Pipeline, grouped_df, data_map: dict[str, Any]): - self.pipeline = pipeline - self.grouped = grouped_df - self._data_map = data_map - - def apply(self, func): - """Apply function to each group""" - return self.grouped.apply(func) - - def agg(self, *args, **kwargs): - """Aggregate groups""" - return self.grouped.agg(*args, **kwargs) - - def size(self): - """Get group sizes""" - return self.grouped.size() - - def get_group(self, name): - """Get specific group""" - group_df = self.grouped.get_group(name) - return FilteredPipelineView(self.pipeline, group_df, self._data_map) - - -# Basic DataJoint View (simplified implementation) -class DataJointPipelineView(PipelineView): - """ - Basic DataJoint view - creates tables for pipeline operations - This is a simplified version - you can expand based on your existing DJ code - """ - - def __init__(self, pipeline: Pipeline, schema, view_id: str = "dj_view"): - super().__init__(pipeline, view_id) - self.schema = schema - self._tables = {} - - def save(self, path: Path | str, include_pipeline: bool = True) -> None: - """Save DataJoint view metadata""" - view_data = { - "view_id": self.view_id, - "view_type": self.__class__.__name__, - "schema_database": self.schema.database, - "table_names": list(self._tables.keys()), - "created_at": time.time(), - } - - if include_pipeline: - view_data["pipeline_state"] = { - "name": self.pipeline.name, - "invocation_lut": self.pipeline.invocation_lut, - } - view_data["has_pipeline"] = True - - with open(path, "wb") as f: - pickle.dump(view_data, f) - - @classmethod - 
def load( - cls, path: Path | str, pipeline: Pipeline | None = None - ) -> "DataJointPipelineView": - """Load DataJoint view""" - with open(path, "rb") as f: - view_data = pickle.load(f) - - # This would need actual DataJoint schema reconstruction - # For now, return a basic instance - if pipeline is None: - raise ValueError("Pipeline required for DataJoint view loading") - - # You'd need to reconstruct the schema here - view = cls(pipeline, None, view_id=view_data["view_id"]) # schema=None for now - return view - - def generate_tables(self): - """Generate DataJoint tables from pipeline - placeholder implementation""" - # This would use your existing DataJoint generation logic - # from your dj/tracker.py file - pass - - -# Utility functions -def validate_pipeline_serializability(pipeline: Pipeline) -> None: - """Helper to check if pipeline can be saved""" - try: - pipeline._validate_serializable() - print("✅ Pipeline is ready for serialization") - - # Additional performance warnings - stats = pipeline.get_stats() - if stats["total_invocations"] > 1000: - print( - f"⚠️ Large pipeline ({stats['total_invocations']} invocations) - views may be slow to build" - ) - - except SerializationError as e: - print("❌ Pipeline cannot be serialized:") - print(str(e)) - print("\n💡 Convert lambda functions to named functions:") - print(" lambda x: x > 0.8 → def filter_func(x): return x > 0.8") - - -def create_example_pipeline() -> Pipeline: - """Create an example pipeline for testing""" - from orcapod import GlobSource, function_pod - - @function_pod - def example_function(input_file): - return f"processed_{input_file}" - - pipeline = Pipeline("example") - - with pipeline: - # This would need actual operations to be meaningful - # source = GlobSource('data', './test_data', '*.txt')() - # results = source >> example_function - pass - - return pipeline diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py new file mode 100644 index 0000000..55207c2 --- /dev/null +++ b/src/orcapod/pipeline/wrappers.py @@ -0,0 +1,667 @@ +from orcapod.core.pod import Pod, FunctionPod +from orcapod.core import SyncStream, Source, Kernel +from orcapod.store import ArrowDataStore +from orcapod.types import Tag, Packet, TypeSpec, default_registry +from orcapod.types.typespec import extract_function_typespecs +from orcapod.hashing import ObjectHasher, ArrowHasher +from orcapod.hashing.defaults import get_default_object_hasher, get_default_arrow_hasher +from typing import Any, Literal +from collections.abc import Collection, Iterator +from orcapod.types.registry import TypeRegistry, PacketConverter +import pyarrow as pa +import polars as pl +from orcapod.core.streams import SyncStreamFromGenerator +from orcapod.utils.stream_utils import get_typespec, merge_typespecs + +import logging +logger = logging.getLogger(__name__) + +def tag_to_arrow_table_with_metadata(tag, metadata: dict | None = None): + """ + Convert a tag dictionary to PyArrow table with metadata on each column. 
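The effect of the helper defined below, shown on a toy tag (values illustrative): a table is first built to infer column types, then rebuilt with per-field metadata so downstream code can recognize tag columns.

import pyarrow as pa

tag = {"session": "s1", "trial": 3}
inferred = pa.Table.from_pylist([tag])  # infer the column types first
fields = [
    pa.field(f.name, f.type, nullable=f.nullable, metadata={"source": "tag"})
    for f in inferred.schema
]
tagged = pa.Table.from_pylist([tag], schema=pa.schema(fields))
print(tagged.schema.field("session").metadata)  # {b'source': b'tag'}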
+ + Args: + tag: Dictionary with string keys and any Python data type values + metadata_key: The metadata key to add to each column + metadata_value: The metadata value to indicate this column came from tag + """ + if metadata is None: + metadata = {} + + # First create the table to infer types + temp_table = pa.Table.from_pylist([tag]) + + # Create new fields with metadata + fields_with_metadata = [] + for field in temp_table.schema: + # Add metadata to each field + field_metadata = metadata + new_field = pa.field( + field.name, field.type, nullable=field.nullable, metadata=field_metadata + ) + fields_with_metadata.append(new_field) + + # Create schema with metadata + schema_with_metadata = pa.schema(fields_with_metadata) + + # Create the final table with the metadata-enriched schema + table = pa.Table.from_pylist([tag], schema=schema_with_metadata) + + return table + +def get_columns_with_metadata(df: pl.DataFrame, key: str, value: str|None = None) -> list[str]: + """Get column names with specific metadata using list comprehension. If value is given, only + columns matching that specific value for the desginated metadata key will be returned. + Otherwise, all columns that contains the key as metadata will be returned regardless of the value""" + return [ + col_name for col_name, dtype in df.schema.items() + if hasattr(dtype, "metadata") and (value is None or getattr(dtype, "metadata") == value) + ] + + +class PolarsSource(Source): + def __init__(self, df: pl.DataFrame, tag_keys: Collection[str]|None = None): + self.df = df + self.tag_keys = tag_keys + + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: + if len(streams) != 0: + raise ValueError( + "PolarsSource does not support forwarding streams. " + "It generates its own stream from the DataFrame." 
+ ) + return PolarsStream(self.df, self.tag_keys) + + +class PolarsStream(SyncStream): + def __init__(self, df: pl.DataFrame, tag_keys: Collection[str]|None = None): + self.df = df + if tag_keys is None: + # extract tag_keys by picking columns with metadata source=tag + tag_keys = get_columns_with_metadata(df, "source", "tag") + self.tag_keys = tag_keys + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + for row in self.df.iter_rows(named=True): + tag = {key: row[key] for key in self.tag_keys} + packet = {key: val for key, val in row.items() if key not in self.tag_keys} + yield tag, packet + +class EmptyStream(SyncStream): + def __init__(self, tag_keys: Collection[str]|None = None, packet_keys: Collection[str]|None = None, tag_typespec: TypeSpec | None = None, packet_typespec:TypeSpec|None = None): + if tag_keys is None and tag_typespec is not None: + tag_keys = tag_typespec.keys() + self.tag_keys = list(tag_keys) if tag_keys else [] + + if packet_keys is None and packet_typespec is not None: + packet_keys = packet_typespec.keys() + self.packet_keys = list(packet_keys) if packet_keys else [] + + self.tag_typespec = tag_typespec + self.packet_typespec = packet_typespec + + def keys(self, *streams: SyncStream, trigger_run: bool = False) -> tuple[Collection[str] | None, Collection[str] | None]: + return self.tag_keys, self.packet_keys + + def types(self, *streams: SyncStream, trigger_run: bool = False) -> tuple[TypeSpec | None, TypeSpec | None]: + return self.tag_typespec, self.packet_typespec + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + # Empty stream, no data to yield + return iter([]) + + + + +class KernelInvocationWrapper(Kernel): + def __init__(self, kernel: Kernel, input_streams: Collection[SyncStream], **kwargs) -> None: + super().__init__(**kwargs) + self.kernel = kernel + self.input_streams = list(input_streams) + + + def __repr__(self): + return f"{self.__class__.__name__}<{self.kernel!r}>" + + def __str__(self): + return f"{self.__class__.__name__}<{self.kernel}>" + + @property + def label(self) -> str: + return self._label or self.kernel.label + + @label.setter + def label(self, label: str) -> None: + self._label = label + + def resolve_input_streams(self, *input_streams) -> Collection[SyncStream]: + if input_streams: + raise ValueError( + "Wrapped pod with specified streams cannot be invoked with additional streams" + ) + return self.input_streams + + def identity_structure(self, *streams: SyncStream) -> Any: + """ + Identity structure that includes the wrapped kernel's identity structure. 
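A small sketch of the PolarsStream contract defined above (toy frame, column names illustrative): the listed tag columns become the tag of each row and all remaining columns become the packet.

import polars as pl

df = pl.DataFrame({"session": ["s1", "s2"], "path": ["a.bin", "b.bin"]})
for tag, packet in PolarsStream(df, tag_keys=["session"]):
    print(tag, packet)
# {'session': 's1'} {'path': 'a.bin'}
# {'session': 's2'} {'path': 'b.bin'}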
+ """ + resolved_streams = self.resolve_input_streams(*streams) + return self.kernel.identity_structure(*resolved_streams) + + def keys( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[Collection[str] | None, Collection[str] | None]: + resolved_streams = self.resolve_input_streams(*streams) + return self.kernel.keys(*resolved_streams, trigger_run=trigger_run) + + def types( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[TypeSpec | None, TypeSpec | None]: + resolved_streams = self.resolve_input_streams(*streams) + return self.kernel.types(*resolved_streams, trigger_run=trigger_run) + + def claims_unique_tags( + self, *streams: SyncStream, trigger_run: bool = False + ) -> bool | None: + resolved_streams = self.resolve_input_streams(*streams) + return self.kernel.claims_unique_tags( + *resolved_streams, trigger_run=trigger_run + ) + + +class CachedKernelWrapper(KernelInvocationWrapper, Source): + """ + A Kernel wrapper that wraps a kernel and stores the outputs of the kernel. + If the class is instantiated with input_streams that is not None, then this wrapper + will strictly represent the invocation of the wrapped Kernel on the given input streams. + Passing in an empty list into input_streams would still be registered as a specific invocation. + If input_streams is None, the class instance largely acts as a proxy of the underlying kernel + but will try to save all results. Note that depending on the storage type passed in, the saving + may error out if you invoke the instance on input streams with non-compatible schema (e.g., tags with + different keys). + """ + + def __init__( + self, + kernel: Kernel, + input_streams: Collection[SyncStream], + output_store: ArrowDataStore, + _object_hasher: ObjectHasher | None = None, + _arrow_hasher: ArrowHasher | None = None, + _registry: TypeRegistry | None = None, + **kwargs, + ) -> None: + super().__init__(kernel, input_streams,**kwargs) + + self.output_store = output_store + self.tag_keys, self.packet_keys = self.keys(trigger_run=False) + self.output_converter = None + + # These are configurable but are not expected to be modified except for special circumstances + if _object_hasher is None: + _object_hasher = get_default_object_hasher() + self.object_hasher = _object_hasher + if _arrow_hasher is None: + _arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = _arrow_hasher + if _registry is None: + _registry = default_registry + self.registry = _registry + self.source_info = self.label, self.object_hasher.hash_to_hex(self.kernel) + + self._cache_computed = False + + + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: + if self._cache_computed: + logger.info(f"Returning cached outputs for {self}") + if self.df is not None: + return PolarsStream(self.df, tag_keys=self.tag_keys) + else: + return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.packet_keys) + + resolved_streams = self.resolve_input_streams(*streams) + + output_stream = self.kernel.forward(*resolved_streams, **kwargs) + + tag_type, packet_type = output_stream.types(trigger_run=False) + if tag_type is not None and packet_type is not None: + joined_type = merge_typespecs(tag_type, packet_type) + assert joined_type is not None, "Joined typespec should not be None" + self.output_converter = PacketConverter(joined_type, registry=self.registry) + + # Cache the output stream of the underlying kernel + # This is a no-op if the output stream is already cached + def generator() -> Iterator[tuple[Tag, Packet]]: + 
logger.info(f"Computing and caching outputs for {self}") + for tag, packet in output_stream: + merged_info = {**tag, **packet} + if self.output_converter is None: + joined_type = get_typespec(merged_info) + assert joined_type is not None, "Joined typespec should not be None" + self.output_converter = PacketConverter( + joined_type, registry=self.registry + ) + + output_table = self.output_converter.to_arrow_table(merged_info) + output_id = self.arrow_hasher.hash_table(output_table) + if not self.output_store.get_record(*self.source_info, output_id): + self.output_store.add_record( + *self.source_info, + output_id, + output_table, + ) + yield tag, packet + self._cache_computed = True + + return SyncStreamFromGenerator(generator) + + @property + def lazy_df(self) -> pl.LazyFrame | None: + return self.output_store.get_all_records_as_polars(*self.source_info) + + @property + def df(self) -> pl.DataFrame | None: + lazy_df = self.lazy_df + if lazy_df is None: + return None + return lazy_df.collect() + + + def reset_cache(self): + self._cache_computed = False + + + +class FunctionPodInvocationWrapper(KernelInvocationWrapper, Pod): + """ + Convenience class to wrap a function pod, providing default pass-through + implementations + """ + def __init__(self, function_pod: FunctionPod, input_streams: Collection[SyncStream], **kwargs): + + # note that this would be an alias to the self.kernel but here explicitly taken as function_pod + # for better type hints + # MRO will be KernelInvocationWrapper -> Pod -> Kernel + super().__init__(function_pod, input_streams, **kwargs) + self.function_pod = function_pod + + + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: + resolved_streams = self.resolve_input_streams(*streams) + return super().forward(*resolved_streams, **kwargs) + + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + return self.function_pod.call(tag, packet) + + + # =============pass through methods/properties to the underlying function pod============= + + def set_active(self, active=True): + """ + Set the active state of the function pod. + """ + self.function_pod.set_active(active) + + def is_active(self) -> bool: + """ + Check if the function pod is active. 
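A usage sketch of the caching behaviour implemented in CachedKernelWrapper.forward above; cached_kernel stands for an already-constructed wrapper with its input streams bound (illustrative name):

first = list(cached_kernel())   # computes, writing one content-addressed record per (tag, packet)
again = list(cached_kernel())   # replayed from the stored records once the first pass completed
cached_kernel.reset_cache()     # forces recomputation on the next call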
+ """ + return self.function_pod.is_active() + + + + + + +class CachedFunctionPodWrapper(FunctionPodInvocationWrapper, Source): + def __init__( + self, + function_pod: FunctionPod, + input_streams: Collection[SyncStream], + output_store: ArrowDataStore, + tag_store: ArrowDataStore | None = None, + label: str | None = None, + skip_memoization_lookup: bool = False, + skip_memoization: bool = False, + skip_tag_record: bool = False, + error_handling: Literal["raise", "ignore", "warn"] = "raise", + _object_hasher: ObjectHasher | None = None, + _arrow_hasher: ArrowHasher | None = None, + _registry: TypeRegistry | None = None, + **kwargs, + ) -> None: + super().__init__( + function_pod, + input_streams, + label=label, + error_handling=error_handling, + **kwargs, + ) + self.output_store = output_store + self.tag_store = tag_store + + self.skip_memoization_lookup = skip_memoization_lookup + self.skip_memoization = skip_memoization + self.skip_tag_record = skip_tag_record + + # These are configurable but are not expected to be modified except for special circumstances + if _object_hasher is None: + _object_hasher = get_default_object_hasher() + self.object_hasher = _object_hasher + if _arrow_hasher is None: + _arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = _arrow_hasher + if _registry is None: + _registry = default_registry + self.registry = _registry + + # TODO: consider making this dynamic + self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod) + self.tag_keys, self.output_keys = self.keys(trigger_run=False) + + + # prepare packet converters + input_typespec, output_typespec = self.function_pod.get_function_typespecs() + + self.input_converter = PacketConverter(input_typespec, self.registry) + self.output_converter = PacketConverter(output_typespec, self.registry) + + self._cache_computed = False + + def reset_cache(self): + self._cache_computed = False + + def generator_completion_hook(self, n_computed: int) -> None: + """ + Hook to be called when the generator is completed. + """ + logger.info(f"Results cached for {self}") + self._cache_computed = True + + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: + if self._cache_computed: + logger.info(f"Returning cached outputs for {self}") + if self.df is not None: + return PolarsStream(self.df, self.tag_keys) + else: + return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.output_keys) + logger.info(f"Computing and caching outputs for {self}") + return super().forward(*streams, **kwargs) + + + def get_packet_key(self, packet: Packet) -> str: + # TODO: reconsider the logic around input/output converter -- who should own this? + return self.arrow_hasher.hash_table( + self.input_converter.to_arrow_table(packet) + ) + + @property + def source_info(self): + return self.function_pod.function_name, self.function_pod_hash + + def is_memoized(self, packet: Packet) -> bool: + return self.retrieve_memoized(packet) is not None + + def add_tag_record(self, tag: Tag, packet: Packet) -> Tag: + """ + Record the tag for the packet in the record store. + This is used to keep track of the tags associated with memoized packets. 
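The memoization key produced by get_packet_key above is content-based: identical input packets hash to the same key, so a repeated invocation can be answered from the output store. A sketch (cached_pod and the packet contents are illustrative):

key_a = cached_pod.get_packet_key({"input_file": "data/a.txt"})
key_b = cached_pod.get_packet_key({"input_file": "data/a.txt"})
assert key_a == key_b  # same packet content, same key
hit = cached_pod.retrieve_memoized({"input_file": "data/a.txt"})  # None until the first computation is stored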
+ """ + return self._add_tag_record_with_packet_key(tag, self.get_packet_key(packet)) + + def _add_tag_record_with_packet_key(self, tag: Tag, packet_key: str) -> Tag: + if self.tag_store is None: + raise ValueError("Recording of tag requires tag_store but none provided") + + tag = dict(tag) # ensure we don't modify the original tag + tag["__packet_key"] = packet_key + + # TODO: consider making this more efficient + # convert tag to arrow table - columns are labeled with metadata source=tag + table = tag_to_arrow_table_with_metadata(tag, {"source": "tag"}) + + entry_hash = self.arrow_hasher.hash_table(table) + + # TODO: add error handling + # check if record already exists: + retrieved_table = self.tag_store.get_record(*self.source_info, entry_hash) + if retrieved_table is None: + self.tag_store.add_record(*self.source_info, entry_hash, table) + + return tag + + def retrieve_memoized(self, packet: Packet) -> Packet | None: + """ + Retrieve a memoized packet from the data store. + Returns None if no memoized packet is found. + """ + logger.debug("Retrieving memoized packet") + return self._retrieve_memoized_with_packet_key(self.get_packet_key(packet)) + + def _retrieve_memoized_with_packet_key(self, packet_key: str) -> Packet | None: + """ + Retrieve a memoized result packet from the data store, looking up by the packet key + Returns None if no memoized packet is found. + """ + logger.debug(f"Retrieving memoized packet with key {packet_key}") + arrow_table = self.output_store.get_record( + self.function_pod.function_name, + self.function_pod_hash, + packet_key, + ) + if arrow_table is None: + return None + packets = self.function_pod.output_converter.from_arrow_table(arrow_table) + # since memoizing single packet, it should only contain one packet + assert len(packets) == 1, ( + f"Memoizing single packet return {len(packets)} packets!" + ) + return packets[0] + + def memoize( + self, + packet: Packet, + output_packet: Packet, + ) -> Packet: + """ + Memoize the output packet in the data store. + Returns the memoized packet. + """ + logger.debug("Memoizing packet") + return self._memoize_with_packet_key(self.get_packet_key(packet), output_packet) + + def _memoize_with_packet_key( + self, packet_key: str, output_packet: Packet + ) -> Packet: + """ + Memoize the output packet in the data store, looking up by packet key. + Returns the memoized packet. + """ + logger.debug(f"Memoizing packet with key {packet_key}") + # TODO: this logic goes through the entire store and retrieve cycle with two conversions + # consider simpler alternative + packets = self.output_converter.from_arrow_table( + self.output_store.add_record( + self.function_pod.function_name, + self.function_pod_hash, + packet_key, + self.output_converter.to_arrow_table(output_packet), + ) + ) + # since passed in a single packet, it should only return a single packet + assert len(packets) == 1, ( + f"Memoizing single packet returned {len(packets)} packets!" 
+ ) + return packets[0] + + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + packet_key = "" + if ( + not self.skip_tag_record + or not self.skip_memoization_lookup + or not self.skip_memoization + ): + packet_key = self.get_packet_key(packet) + + output_packet = None + if not self.skip_memoization_lookup: + output_packet = self._retrieve_memoized_with_packet_key(packet_key) + if output_packet is not None: + logger.debug( + f"Memoized output for {packet} with {packet_key} found, skipping computation" + ) + else: + logger.debug( + f"Memoized output for packet {packet} with {packet_key} not found" + ) + + if output_packet is None: + # TODO: revisit the logic around active state and how to use it + tag, output_packet = self.function_pod.call(tag, packet) + if output_packet is not None and not self.skip_memoization: + # output packet may be modified by the memoization process + # e.g. if the output is a file, the path may be changed + output_packet = self._memoize_with_packet_key(packet_key, output_packet) # type: ignore + + if output_packet is None: + if self.is_active(): + logger.warning( + f"Function pod {self.function_pod.function_name} returned None for packet {packet} despite being active" + ) + return tag, None + + # result was successfully computed -- save the tag + if not self.skip_tag_record and self.tag_store is not None: + self._add_tag_record_with_packet_key(tag, packet_key) + + return tag, output_packet + + def get_all_outputs(self) -> pl.LazyFrame | None: + return self.output_store.get_all_records_as_polars(*self.source_info) + + def get_all_tags(self, with_packet_id: bool = False) -> pl.LazyFrame | None: + if self.tag_store is None: + raise ValueError("Tag store is not set, no tag record can be retrieved") + data = self.tag_store.get_all_records_as_polars(*self.source_info) + if not with_packet_id: + return data.drop("__packet_key") if data is not None else None + return data + + def get_all_entries_with_tags(self) -> pl.LazyFrame | None: + """ + Retrieve all entries from the tag store with their associated tags. + Returns a DataFrame with columns for tag and packet key. + """ + if self.tag_store is None: + raise ValueError("Tag store is not set, no tag record can be retrieved") + + tag_records = self.tag_store.get_all_records_as_polars(*self.source_info) + if tag_records is None: + return None + result_packets = self.output_store.get_records_by_ids_as_polars( + *self.source_info, + tag_records.collect()["__packet_key"], + preserve_input_order=True, + ) + if result_packets is None: + return None + + return pl.concat([tag_records, result_packets], how="horizontal").drop( + ["__packet_key"] + ) + + @property + def df(self) -> pl.DataFrame | None: + lazy_df = self.lazy_df + if lazy_df is None: + return None + return lazy_df.collect() + + @property + def lazy_df(self) -> pl.LazyFrame | None: + return self.get_all_entries_with_tags() + + @property + def tags(self) -> pl.DataFrame | None: + data = self.get_all_tags() + if data is None: + return None + + return data.collect() + + @property + def outputs(self) -> pl.DataFrame | None: + """ + Retrieve all outputs from the result store as a DataFrame. + Returns None if no outputs are available. 
+ """ + data = self.get_all_outputs() + if data is None: + return None + + return data.collect() + + +class DummyFunctionPod(Pod): + def __init__(self, function_name="dummy", **kwargs): + super().__init__(**kwargs) + self.function_name = function_name + + def set_active(self, active: bool = True): + # no-op + pass + + def is_active(self) -> bool: + return False + + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + raise NotImplementedError( + "DummyFunctionPod cannot be called, it is only used to access previously stored tags and outputs." + ) + + +# TODO: Create this instead using compositional pattern +class DummyCachedFunctionPod(CachedFunctionPodWrapper): + """ + Dummy for a cached function pod. This is convenient to just allow the user to access + previously stored function pod tags and outputs without requiring instantiating the identical + function used for computation. + + Consequently, this function pod CANNOT be used to compute and insert new entries into the storage. + """ + + def __init__(self, source_pod: CachedFunctionPodWrapper): + self._source_info = source_pod.source_info + self.output_store = source_pod.output_store + self.tag_store = source_pod.tag_store + self.function_pod = DummyFunctionPod(source_pod.function_pod.function_name) + + @property + def source_info(self) -> tuple[str, str]: + return self._source_info + + +class Node(KernelInvocationWrapper, Source): + def __init__(self, kernel: Kernel, input_nodes: Collection["Node"], **kwargs): + """ + Create a node that wraps a kernel and provides a Node interface. + This is useful for creating nodes in a pipeline that can be executed. + """ + return super().__init__(kernel, input_nodes, **kwargs) + + def reset_cache(self) -> None: ... + + + +class KernelNode(Node, CachedKernelWrapper): + """ + A node that wraps a Kernel and provides a Node interface. + This is useful for creating nodes in a pipeline that can be executed. + """ + +class FunctionPodNode(Node, CachedFunctionPodWrapper): + """ + A node that wraps a FunctionPod and provides a Node interface. + This is useful for creating nodes in a pipeline that can be executed. 
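A minimal sketch of what the node classes above combine (constructor arguments are illustrative): a compiled node is at once an invocation wrapper, a storage-backed executor, and a Source that can replay its own cached stream.

node = FunctionPodNode(my_pod, input_nodes, output_store=results_store, tag_store=pipeline_store)
assert isinstance(node, (Node, CachedFunctionPodWrapper, Source))
stream = node()  # recomputes on the first call, replays from the stores afterwards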
+ """ \ No newline at end of file diff --git a/src/orcapod/pod/__init__.py b/src/orcapod/pod/__init__.py deleted file mode 100644 index 8567c2a..0000000 --- a/src/orcapod/pod/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .core import Pod, FunctionPod, function_pod, TypedFunctionPod, typed_function_pod - -__all__ = [ - "Pod", - "FunctionPod", - "function_pod", - "TypedFunctionPod", - "typed_function_pod", -] diff --git a/src/orcapod/pod/core.py b/src/orcapod/pod/core.py deleted file mode 100644 index 069e0de..0000000 --- a/src/orcapod/pod/core.py +++ /dev/null @@ -1,877 +0,0 @@ -import functools -import logging -import pickle -import warnings -from abc import abstractmethod -import pyarrow as pa -import sys -from collections.abc import Callable, Collection, Iterable, Iterator, Sequence -from typing import ( - Any, - Literal, -) - -from orcapod.types.registry import PacketConverter - -from orcapod.core.base import Kernel -from orcapod.hashing import ( - ObjectHasher, - ArrowHasher, - FunctionInfoExtractor, - get_function_signature, - hash_function, - get_default_object_hasher, - get_default_arrow_hasher, -) -from orcapod.core.operators import Join -from orcapod.store import DataStore, ArrowDataStore, NoOpDataStore -from orcapod.core.streams import SyncStream, SyncStreamFromGenerator -from orcapod.types import Packet, PathSet, PodFunction, Tag, TypeSpec - -from orcapod.types.default import default_registry -from orcapod.types.inference import ( - extract_function_data_types, - verify_against_typespec, - check_typespec_compatibility, -) -from orcapod.types.registry import is_packet_supported -import polars as pl - -logger = logging.getLogger(__name__) - - -def function_pod( - output_keys: Collection[str] | None = None, - function_name: str | None = None, - data_store: DataStore | None = None, - store_name: str | None = None, - function_hash_mode: Literal["signature", "content", "name", "custom"] = "name", - custom_hash: int | None = None, - force_computation: bool = False, - skip_memoization: bool = False, - error_handling: Literal["raise", "ignore", "warn"] = "raise", - **kwargs, -) -> Callable[..., "FunctionPod"]: - """ - Decorator that wraps a function in a FunctionPod instance. 
- - Args: - output_keys: Keys for the function output - force_computation: Whether to force computation - skip_memoization: Whether to skip memoization - - Returns: - FunctionPod instance wrapping the decorated function - """ - - def decorator(func) -> FunctionPod: - if func.__name__ == "": - raise ValueError("Lambda functions cannot be used with function_pod") - - if not hasattr(func, "__module__") or func.__module__ is None: - raise ValueError( - f"Function {func.__name__} must be defined at module level" - ) - - # Store the original function in the module for pickling purposes - # and make sure to change the name of the function - module = sys.modules[func.__module__] - base_function_name = func.__name__ - new_function_name = f"_original_{func.__name__}" - setattr(module, new_function_name, func) - # rename the function to be consistent and make it pickleable - setattr(func, "__name__", new_function_name) - setattr(func, "__qualname__", new_function_name) - - # Create the FunctionPod - pod = FunctionPod( - function=func, - output_keys=output_keys, - function_name=function_name or base_function_name, - data_store=data_store, - store_name=store_name, - function_hash_mode=function_hash_mode, - custom_hash=custom_hash, - force_computation=force_computation, - skip_memoization=skip_memoization, - error_handling=error_handling, - **kwargs, - ) - - return pod - - return decorator - - -class Pod(Kernel): - """ - An (abstract) base class for all pods. A pod can be seen as a special type of operation that - only operates on the packet content without reading tags. Consequently, no operation - of Pod can dependent on the tags of the packets. This is a design choice to ensure that - the pods act as pure functions which is a necessary condition to guarantee reproducibility. - """ - - def __init__( - self, error_handling: Literal["raise", "ignore", "warn"] = "raise", **kwargs - ): - super().__init__(**kwargs) - self.error_handling = error_handling - - def process_stream(self, *streams: SyncStream) -> list[SyncStream]: - """ - Prepare the incoming streams for execution in the pod. This default implementation - joins all the streams together and raises and error if no streams are provided. - """ - # if multiple streams are provided, join them - # otherwise, return as is - combined_streams = list(streams) - if len(streams) > 1: - stream = streams[0] - for next_stream in streams[1:]: - stream = Join()(stream, next_stream) - combined_streams = [stream] - return combined_streams - - def __call__(self, *streams: SyncStream, **kwargs) -> SyncStream: - stream = self.process_stream(*streams) - return super().__call__(*stream, **kwargs) - - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet]: ... 
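For reference on the Pod class being removed here (its counterpart now lives in orcapod.core.pod, which the new wrappers import): process_stream reduces any number of input streams to a single joined stream before forward runs, so a pod only ever sees one stream of packets. A sketch with illustrative stream objects:

# pod.process_stream(stream_a, stream_b, stream_c) is equivalent to:
combined = stream_a
for nxt in (stream_b, stream_c):
    combined = Join()(combined, nxt)
streams_for_forward = [combined]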
- - def forward(self, *streams: SyncStream) -> SyncStream: - # if multiple streams are provided, join them - if len(streams) > 1: - raise ValueError("Multiple streams should be joined before calling forward") - if len(streams) == 0: - raise ValueError("No streams provided to forward") - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - n_computed = 0 - for tag, packet in stream: - try: - tag, output_packet = self.call(tag, packet) - n_computed += 1 - logger.info(f"Computed item {n_computed}") - yield tag, output_packet - - except Exception as e: - logger.error(f"Error processing packet {packet}: {e}") - if self.error_handling == "raise": - raise e - elif self.error_handling == "ignore": - continue - elif self.error_handling == "warn": - warnings.warn(f"Error processing packet {packet}: {e}") - continue - - return SyncStreamFromGenerator(generator) - - -class FunctionPod(Pod): - """ - A pod that wraps a function and allows it to be used as an operation in a stream. - This pod can be used to apply a function to the packets in a stream, with optional memoization - and caching of results. It can also handle multiple output keys and error handling. - The function should accept keyword arguments that correspond to the keys in the packets. - The output of the function should be a path or a collection of paths that correspond to the output keys.""" - - def __init__( - self, - function: PodFunction, - output_keys: Collection[str] | None = None, - function_name=None, - data_store: DataStore | None = None, - store_name: str | None = None, - function_hash_mode: Literal["signature", "content", "name", "custom"] = "name", - custom_hash: int | None = None, - label: str | None = None, - skip_computation: bool = False, - force_computation: bool = False, - skip_memoization_lookup: bool = False, - skip_memoization: bool = False, - error_handling: Literal["raise", "ignore", "warn"] = "raise", - _hash_function_kwargs: dict | None = None, - **kwargs, - ) -> None: - super().__init__(label=label, **kwargs) - self.function = function - self.output_keys = output_keys or [] - if function_name is None: - if hasattr(self.function, "__name__"): - function_name = getattr(self.function, "__name__") - else: - raise ValueError( - "function_name must be provided if function has no __name__ attribute" - ) - - self.function_name = function_name - self.data_store = data_store if data_store is not None else NoOpDataStore() - self.store_name = store_name or function_name - self.function_hash_mode = function_hash_mode - self.custom_hash = custom_hash - self.skip_computation = skip_computation - self.force_computation = force_computation - self.skip_memoization_lookup = skip_memoization_lookup - self.skip_memoization = skip_memoization - self.error_handling = error_handling - self._hash_function_kwargs = _hash_function_kwargs - - def __repr__(self) -> str: - func_sig = get_function_signature(self.function) - return f"FunctionPod:{func_sig} ⇒ {self.output_keys}" - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - stream = self.process_stream(*streams) - tag_keys, _ = stream[0].keys(trigger_run=trigger_run) - return tag_keys, tuple(self.output_keys) - - def is_memoized(self, packet: Packet) -> bool: - return self.retrieve_memoized(packet) is not None - - def retrieve_memoized(self, packet: Packet) -> Packet | None: - """ - Retrieve a memoized packet from the data store. - Returns None if no memoized packet is found. 
- """ - return self.data_store.retrieve_memoized( - self.store_name, - self.content_hash(char_count=16), - packet, - ) - - def memoize( - self, - packet: Packet, - output_packet: Packet, - ) -> Packet: - """ - Memoize the output packet in the data store. - Returns the memoized packet. - """ - return self.data_store.memoize( - self.store_name, - self.content_hash(char_count=16), # identity of this function pod - packet, - output_packet, - ) - - def forward(self, *streams: SyncStream) -> SyncStream: - # if multiple streams are provided, join them - if len(streams) > 1: - raise ValueError("Multiple streams should be joined before calling forward") - if len(streams) == 0: - raise ValueError("No streams provided to forward") - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - n_computed = 0 - for tag, packet in stream: - output_values: list["PathSet"] = [] - try: - if not self.skip_memoization_lookup: - memoized_packet = self.retrieve_memoized(packet) - else: - memoized_packet = None - if not self.force_computation and memoized_packet is not None: - logger.info("Memoized packet found, skipping computation") - yield tag, memoized_packet - continue - if self.skip_computation: - logger.info("Skipping computation as per configuration") - continue - values = self.function(**packet) - - if len(self.output_keys) == 0: - output_values = [] - elif len(self.output_keys) == 1: - output_values = [values] # type: ignore - elif isinstance(values, Iterable): - output_values = list(values) # type: ignore - elif len(self.output_keys) > 1: - raise ValueError( - "Values returned by function must be a pathlike or a sequence of pathlikes" - ) - - if len(output_values) != len(self.output_keys): - raise ValueError( - f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" - ) - except Exception as e: - logger.error(f"Error processing packet {packet}: {e}") - if self.error_handling == "raise": - raise e - elif self.error_handling == "ignore": - continue - elif self.error_handling == "warn": - warnings.warn(f"Error processing packet {packet}: {e}") - continue - else: - raise ValueError( - f"Unknown error handling mode: {self.error_handling} encountered while handling error:" - ) from e - - output_packet: Packet = { - k: v for k, v in zip(self.output_keys, output_values) - } - - if not self.skip_memoization: - # output packet may be modified by the memoization process - # e.g. 
if the output is a file, the path may be changed - output_packet = self.memoize(packet, output_packet) # type: ignore - - n_computed += 1 - logger.info(f"Computed item {n_computed}") - yield tag, output_packet - - return SyncStreamFromGenerator(generator) - - def identity_structure(self, *streams) -> Any: - content_kwargs = self._hash_function_kwargs - if self.function_hash_mode == "content": - if content_kwargs is None: - content_kwargs = { - "include_name": False, - "include_module": False, - "include_declaration": False, - } - function_hash_value = hash_function( - self.function, - name_override=self.function_name, - function_hash_mode="content", - content_kwargs=content_kwargs, - ) - elif self.function_hash_mode == "signature": - function_hash_value = hash_function( - self.function, - name_override=self.function_name, - function_hash_mode="signature", - content_kwargs=content_kwargs, - ) - elif self.function_hash_mode == "name": - function_hash_value = hash_function( - self.function, - name_override=self.function_name, - function_hash_mode="name", - content_kwargs=content_kwargs, - ) - elif self.function_hash_mode == "custom": - if self.custom_hash is None: - raise ValueError("Custom hash function not provided") - function_hash_value = self.custom_hash - else: - raise ValueError( - f"Unknown function hash mode: {self.function_hash_mode}. " - "Must be one of 'content', 'signature', 'name', or 'custom'." - ) - - return ( - self.__class__.__name__, - function_hash_value, - tuple(self.output_keys), - ) + tuple(streams) - - -def typed_function_pod( - output_keys: str | Collection[str] | None = None, - function_name: str | None = None, - label: str | None = None, - result_store: ArrowDataStore | None = None, - tag_store: ArrowDataStore | None = None, - object_hasher: ObjectHasher | None = None, - arrow_hasher: ArrowHasher | None = None, - **kwargs, -) -> Callable[..., "TypedFunctionPod | CachedFunctionPod"]: - """ - Decorator that wraps a function in a FunctionPod instance. - - Args: - output_keys: Keys for the function output(s) - function_name: Name of the function pod; if None, defaults to the function name - **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details. 
- - Returns: - FunctionPod instance wrapping the decorated function - """ - - def decorator(func) -> TypedFunctionPod | CachedFunctionPod: - if func.__name__ == "": - raise ValueError("Lambda functions cannot be used with function_pod") - - if not hasattr(func, "__module__") or func.__module__ is None: - raise ValueError( - f"Function {func.__name__} must be defined at module level" - ) - - # Store the original function in the module for pickling purposes - # and make sure to change the name of the function - module = sys.modules[func.__module__] - base_function_name = func.__name__ - new_function_name = f"_original_{func.__name__}" - setattr(module, new_function_name, func) - # rename the function to be consistent and make it pickleable - setattr(func, "__name__", new_function_name) - setattr(func, "__qualname__", new_function_name) - - # Create a simple typed function pod - pod = TypedFunctionPod( - function=func, - output_keys=output_keys, - function_name=function_name or base_function_name, - label=label, - **kwargs, - ) - - if result_store is not None: - pod = CachedFunctionPod( - function_pod=pod, - object_hasher=object_hasher - if object_hasher is not None - else get_default_object_hasher(), - arrow_hasher=arrow_hasher - if arrow_hasher is not None - else get_default_arrow_hasher(), - result_store=result_store, - tag_store=tag_store, - ) - - return pod - - return decorator - - -class TypedFunctionPod(Pod): - """ - A type-aware pod that wraps a function and provides automatic type validation and inference. - - This pod extends the base Pod functionality by automatically extracting and validating - type information from function signatures and user-provided specifications. It ensures - type safety by verifying that both input and output types are supported by the - configured type registry before execution. - - The TypedFunctionPod analyzes the wrapped function's signature to determine: - - Parameter types (from annotations or user-provided input_types) - - Return value types (from annotations or user-provided output_types) - - Type compatibility with the packet type registry - - Key Features: - - Automatic type extraction from function annotations - - Type override support via input_types and output_types parameters - - Registry-based type validation ensuring data compatibility - - Memoization support with type-aware caching - - Multiple output key handling with proper type mapping - - Comprehensive error handling for type mismatches - - Type Resolution Priority: - 1. User-provided input_types/output_types override function annotations - 2. Function parameter annotations are used when available - 3. Function return annotations are parsed for output type inference - 4. Error raised if types cannot be determined or are unsupported - - Args: - function: The function to wrap. Must accept keyword arguments corresponding - to packet keys and return values compatible with output_keys. - output_keys: Collection of string keys for the function outputs. For functions - returning a single value, provide a single key. For multiple returns - (tuple/list), provide keys matching the number of return items. - function_name: Optional name for the function. Defaults to function.__name__. - input_types: Optional mapping of parameter names to their types. Overrides - function annotations for specified parameters. - output_types: Optional type specification for return values. 
Can be: - - A dict mapping output keys to types (TypeSpec) - - A sequence of types mapped to output_keys in order - These override inferred types from function return annotations. - data_store: DataStore instance for memoization. Defaults to NoOpDataStore. - function_hasher: Hasher function for creating function identity hashes. - Required parameter - no default implementation available. - label: Optional label for the pod instance. - skip_memoization_lookup: If True, skips checking for memoized results. - skip_memoization: If True, disables memoization entirely. - error_handling: How to handle execution errors: - - "raise": Raise exceptions (default) - - "ignore": Skip failed packets silently - - "warn": Issue warnings and continue - packet_type_registry: Registry for validating packet types. Defaults to - the default registry if None. - **kwargs: Additional arguments passed to the parent Pod class and above. - - Raises: - ValueError: When: - - function_name cannot be determined and is not provided - - Input types are not supported by the registry - - Output types are not supported by the registry - - Type extraction fails due to missing annotations/specifications - NotImplementedError: When function_hasher is None (required parameter). - - Examples: - Basic usage with annotated function: - - >>> def process_data(text: str, count: int) -> tuple[str, int]: - ... return text.upper(), count * 2 - >>> - >>> pod = TypedFunctionPod( - ... function=process_data, - ... output_keys=['upper_text', 'doubled_count'], - ... function_hasher=my_hasher - ... ) - - Override types for legacy function: - - >>> def legacy_func(x, y): # No annotations - ... return x + y - >>> - >>> pod = TypedFunctionPod( - ... function=legacy_func, - ... output_keys=['sum'], - ... input_types={'x': int, 'y': int}, - ... output_types={'sum': int}, - ... function_hasher=my_hasher - ... ) - - Multiple outputs with sequence override: - - >>> def analyze(data: list) -> tuple[int, float, str]: - ... return len(data), sum(data), str(data) - >>> - >>> pod = TypedFunctionPod( - ... function=analyze, - ... output_keys=['count', 'total', 'repr'], - ... output_types=[int, float, str], # Override with sequence - ... function_hasher=my_hasher - ... ) - - Attributes: - function: The wrapped function. - output_keys: List of output key names. - function_name: Name identifier for the function. - function_input_types: Resolved input type specification. - function_output_types: Resolved output type specification. - registry: Type registry for validation. - data_store: DataStore instance for memoization. - function_hasher: Function hasher for identity computation. - skip_memoization_lookup: Whether to skip memoization lookups. - skip_memoization: Whether to disable memoization entirely. - error_handling: Error handling strategy. - - Note: - The TypedFunctionPod requires a function_hasher to be provided as there - is no default implementation. This hasher is used to create stable - identity hashes for memoization and caching purposes. - - Type validation occurs during initialization, ensuring that any type - incompatibilities are caught early rather than during stream processing. 
- """ - - def __init__( - self, - function: Callable[..., Any], - output_keys: str | Collection[str] | None = None, - function_name=None, - input_types: TypeSpec | None = None, - output_types: TypeSpec | Sequence[type] | None = None, - label: str | None = None, - packet_type_registry=None, - function_info_extractor: FunctionInfoExtractor | None = None, - **kwargs, - ) -> None: - super().__init__(label=label, **kwargs) - self.function = function - if output_keys is None: - output_keys = [] - if isinstance(output_keys, str): - output_keys = [output_keys] - self.output_keys = output_keys - if function_name is None: - if hasattr(self.function, "__name__"): - function_name = getattr(self.function, "__name__") - else: - raise ValueError( - "function_name must be provided if function has no __name__ attribute" - ) - self.function_name = function_name - - if packet_type_registry is None: - packet_type_registry = default_registry - - self.registry = packet_type_registry - self.function_info_extractor = function_info_extractor - - # extract input and output types from the function signature - function_input_types, function_output_types = extract_function_data_types( - self.function, - self.output_keys, - input_types=input_types, - output_types=output_types, - ) - - self.function_input_types = function_input_types - self.function_output_types = function_output_types - - # TODO: include explicit check of support during PacketConverter creation - self.input_converter = PacketConverter(self.function_input_types, self.registry) - self.output_converter = PacketConverter( - self.function_output_types, self.registry - ) - - # TODO: prepare a separate str and repr methods - def __repr__(self) -> str: - func_sig = get_function_signature(self.function) - return f"FunctionPod:{func_sig} ⇒ {self.output_keys}" - - def call(self, tag, packet) -> tuple[Tag, Packet]: - output_values: list["PathSet"] = [] - - values = self.function(**packet) - - if len(self.output_keys) == 0: - output_values = [] - elif len(self.output_keys) == 1: - output_values = [values] # type: ignore - elif isinstance(values, Iterable): - output_values = list(values) # type: ignore - elif len(self.output_keys) > 1: - raise ValueError( - "Values returned by function must be a pathlike or a sequence of pathlikes" - ) - - if len(output_values) != len(self.output_keys): - raise ValueError( - f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" - ) - - output_packet: Packet = {k: v for k, v in zip(self.output_keys, output_values)} - return tag, output_packet - - def identity_structure(self, *streams) -> Any: - # construct identity structure for the function - # if function_info_extractor is available, use that but substitute the function_name - if self.function_info_extractor is not None: - function_info = self.function_info_extractor.extract_function_info( - self.function, - function_name=self.function_name, - input_types=self.function_input_types, - output_types=self.function_output_types, - ) - else: - # use basic information only - function_info = { - "name": self.function_name, - "input_types": self.function_input_types, - "output_types": self.function_output_types, - } - function_info["output_keys"] = tuple(self.output_keys) - - return ( - self.__class__.__name__, - function_info, - ) + tuple(streams) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - stream = 
self.process_stream(*streams) - tag_keys, _ = stream[0].keys(trigger_run=trigger_run) - return tag_keys, tuple(self.output_keys) - - -class CachedFunctionPod(Pod): - def __init__( - self, - function_pod: TypedFunctionPod, - object_hasher: ObjectHasher, - arrow_hasher: ArrowHasher, - result_store: ArrowDataStore, - tag_store: ArrowDataStore | None = None, - label: str | None = None, - skip_memoization_lookup: bool = False, - skip_memoization: bool = False, - skip_tag_record: bool = False, - error_handling: Literal["raise", "ignore", "warn"] = "raise", - **kwargs, - ) -> None: - super().__init__(label=label, error_handling=error_handling, **kwargs) - self.function_pod = function_pod - - self.object_hasher = object_hasher - self.arrow_hasher = arrow_hasher - self.result_store = result_store - self.tag_store = tag_store - - self.skip_memoization_lookup = skip_memoization_lookup - self.skip_memoization = skip_memoization - self.skip_tag_record = skip_tag_record - - # TODO: consider making this dynamic - self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod) - - def get_packet_key(self, packet: Packet) -> str: - return self.arrow_hasher.hash_table( - self.function_pod.input_converter.to_arrow_table(packet) - ) - - # TODO: prepare a separate str and repr methods - def __repr__(self) -> str: - return f"Cached:{self.function_pod}" - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - return self.function_pod.keys(*streams, trigger_run=trigger_run) - - def is_memoized(self, packet: Packet) -> bool: - return self.retrieve_memoized(packet) is not None - - def add_tag_record(self, tag: Tag, packet: Packet) -> Tag: - """ - Record the tag for the packet in the record store. - This is used to keep track of the tags associated with memoized packets. - """ - - return self._add_tag_record_with_packet_key(tag, self.get_packet_key(packet)) - - def _add_tag_record_with_packet_key(self, tag: Tag, packet_key: str) -> Tag: - if self.tag_store is None: - raise ValueError("Recording of tag requires tag_store but none provided") - - tag = dict(tag) # ensure we don't modify the original tag - tag["__packet_key"] = packet_key - - # convert tag to arrow table - table = pa.Table.from_pylist([tag]) - - entry_hash = self.arrow_hasher.hash_table(table) - - # TODO: add error handling - # check if record already exists: - retrieved_table = self.tag_store.get_record( - self.function_pod.function_name, self.function_pod_hash, entry_hash - ) - if retrieved_table is None: - self.tag_store.add_record( - self.function_pod.function_name, - self.function_pod_hash, - entry_hash, - table, - ) - - return tag - - def retrieve_memoized(self, packet: Packet) -> Packet | None: - """ - Retrieve a memoized packet from the data store. - Returns None if no memoized packet is found. - """ - logger.info("Retrieving memoized packet") - return self._retrieve_memoized_by_hash(self.get_packet_key(packet)) - - def _retrieve_memoized_by_hash(self, packet_hash: str) -> Packet | None: - """ - Retrieve a memoized result packet from the data store, looking up by hash - Returns None if no memoized packet is found. 
- """ - logger.info(f"Retrieving memoized packet with hash {packet_hash}") - arrow_table = self.result_store.get_record( - self.function_pod.function_name, - self.function_pod_hash, - packet_hash, - ) - if arrow_table is None: - return None - packets = self.function_pod.output_converter.from_arrow_table(arrow_table) - # since memoizing single packet, it should only contain one packet - assert len(packets) == 1, ( - f"Memoizing single packet return {len(packets)} packets!" - ) - return packets[0] - - def memoize( - self, - packet: Packet, - output_packet: Packet, - ) -> Packet: - """ - Memoize the output packet in the data store. - Returns the memoized packet. - """ - logger.info("Memoizing packet") - return self._memoize_by_hash(self.get_packet_key(packet), output_packet) - - def _memoize_by_hash(self, packet_hash: str, output_packet: Packet) -> Packet: - """ - Memoize the output packet in the data store, looking up by hash. - Returns the memoized packet. - """ - logger.info(f"Memoizing packet with hash {packet_hash}") - packets = self.function_pod.output_converter.from_arrow_table( - self.result_store.add_record( - self.function_pod.function_name, - self.function_pod_hash, - packet_hash, - self.function_pod.output_converter.to_arrow_table(output_packet), - ) - ) - # since memoizing single packet, it should only contain one packet - assert len(packets) == 1, ( - f"Memoizing single packet return {len(packets)} packets!" - ) - return packets[0] - - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet]: - packet_key = "" - if ( - not self.skip_tag_record - or not self.skip_memoization_lookup - or not self.skip_memoization - ): - packet_key = self.get_packet_key(packet) - - if not self.skip_tag_record and self.tag_store is not None: - self._add_tag_record_with_packet_key(tag, packet_key) - - if not self.skip_memoization_lookup: - memoized_packet = self._retrieve_memoized_by_hash(packet_key) - else: - memoized_packet = None - if memoized_packet is not None: - logger.info("Memoized packet found, skipping computation") - return tag, memoized_packet - - tag, output_packet = self.function_pod.call(tag, packet) - - if not self.skip_memoization: - # output packet may be modified by the memoization process - # e.g. if the output is a file, the path may be changed - output_packet = self.memoize(packet, output_packet) # type: ignore - - return tag, output_packet - - def get_all_entries_with_tags(self) -> pl.LazyFrame | None: - """ - Retrieve all entries from the tag store with their associated tags. - Returns a DataFrame with columns for tag and packet key. 
- """ - if self.tag_store is None: - raise ValueError("Tag store is not set, cannot retrieve entries") - - tag_records = self.tag_store.get_all_records_as_polars( - self.function_pod.function_name, self.function_pod_hash - ) - if tag_records is None: - return None - result_packets = self.result_store.get_records_by_ids_as_polars( - self.function_pod.function_name, - self.function_pod_hash, - tag_records.collect()["__packet_key"], - preserve_input_order=True, - ) - if result_packets is None: - return None - - return pl.concat([tag_records, result_packets], how="horizontal").drop( - ["__packet_key"] - ) - - def identity_structure(self, *streams) -> Any: - return self.function_pod.identity_structure(*streams) diff --git a/src/orcapod/store/__init__.py b/src/orcapod/store/__init__.py index f573c4d..281874b 100644 --- a/src/orcapod/store/__init__.py +++ b/src/orcapod/store/__init__.py @@ -1,5 +1,6 @@ from .types import DataStore, ArrowDataStore -from .core import DirDataStore, NoOpDataStore +from .arrow_data_stores import MockArrowDataStore, SimpleInMemoryDataStore +from .dict_data_stores import DirDataStore, NoOpDataStore from .safe_dir_data_store import SafeDirDataStore __all__ = [ @@ -8,4 +9,6 @@ "DirDataStore", "SafeDirDataStore", "NoOpDataStore", + "MockArrowDataStore", + "SimpleInMemoryDataStore", ] diff --git a/src/orcapod/store/arrow_data_stores.py b/src/orcapod/store/arrow_data_stores.py index 4be9698..e2c1376 100644 --- a/src/orcapod/store/arrow_data_stores.py +++ b/src/orcapod/store/arrow_data_stores.py @@ -23,17 +23,15 @@ class MockArrowDataStore: def __init__(self): logger.info("Initialized MockArrowDataStore") - def add_record(self, - source_name: str, - source_id: str, - entry_id: str, - arrow_data: pa.Table) -> pa.Table: + def add_record( + self, source_name: str, source_id: str, entry_id: str, arrow_data: pa.Table + ) -> pa.Table: """Add a record to the mock store.""" return arrow_data - def get_record(self, source_name: str, - source_id: str, - entry_id: str) -> pa.Table | None: + def get_record( + self, source_name: str, source_id: str, entry_id: str + ) -> pa.Table | None: """Get a specific record.""" return None @@ -76,7 +74,7 @@ def get_records_by_ids( Arrow table containing all found records, or None if no records found """ return None - + def get_records_by_ids_as_polars( self, source_name: str, @@ -88,21 +86,19 @@ def get_records_by_ids_as_polars( return None - - -class InMemoryArrowDataStore: +class SimpleInMemoryDataStore: """ - In-memory Arrow data store for testing purposes. + In-memory Arrow data store, primarily to be used for testing purposes. This class simulates the behavior of ParquetArrowDataStore without actual file I/O. It is useful for unit tests where you want to avoid filesystem dependencies. - + Uses dict of dict of Arrow tables for efficient storage and retrieval. """ def __init__(self, duplicate_entry_behavior: str = "error"): """ Initialize the InMemoryArrowDataStore. 
- + Args: duplicate_entry_behavior: How to handle duplicate entry_ids: - 'error': Raise ValueError when entry_id already exists @@ -112,10 +108,12 @@ def __init__(self, duplicate_entry_behavior: str = "error"): if duplicate_entry_behavior not in ["error", "overwrite"]: raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") self.duplicate_entry_behavior = duplicate_entry_behavior - + # Store Arrow tables: {source_key: {entry_id: arrow_table}} self._in_memory_store: dict[str, dict[str, pa.Table]] = {} - logger.info(f"Initialized InMemoryArrowDataStore with duplicate_entry_behavior='{duplicate_entry_behavior}'") + logger.info( + f"Initialized InMemoryArrowDataStore with duplicate_entry_behavior='{duplicate_entry_behavior}'" + ) def _get_source_key(self, source_name: str, source_id: str) -> str: """Generate key for source storage.""" @@ -127,40 +125,42 @@ def add_record( source_id: str, entry_id: str, arrow_data: pa.Table, + ignore_duplicate: bool = False, ) -> pa.Table: """ Add a record to the in-memory store. - + Args: source_name: Name of the data source source_id: ID of the specific dataset within the source entry_id: Unique identifier for this record arrow_data: The Arrow table data to store - + Returns: - The original arrow_data table - + arrow_data equivalent to having loaded the corresponding entry that was just saved + Raises: ValueError: If entry_id already exists and duplicate_entry_behavior is 'error' """ source_key = self._get_source_key(source_name, source_id) - + # Initialize source if it doesn't exist if source_key not in self._in_memory_store: self._in_memory_store[source_key] = {} - + local_data = self._in_memory_store[source_key] - + # Check for duplicate entry - if entry_id in local_data and self.duplicate_entry_behavior == "error": - raise ValueError( - f"Entry '{entry_id}' already exists in {source_name}/{source_id}. " - f"Use duplicate_entry_behavior='overwrite' to allow updates." - ) - + if entry_id in local_data: + if not ignore_duplicate and self.duplicate_entry_behavior == "error": + raise ValueError( + f"Entry '{entry_id}' already exists in {source_name}/{source_id}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." 
+ ) + # Store the record local_data[entry_id] = arrow_data - + action = "Updated" if entry_id in local_data else "Added" logger.debug(f"{action} record {entry_id} in {source_key}") return arrow_data @@ -173,24 +173,29 @@ def get_record( local_data = self._in_memory_store.get(source_key, {}) return local_data.get(entry_id) - def get_all_records(self, source_name: str, source_id: str) -> pa.Table | None: + def get_all_records( + self, source_name: str, source_id: str, add_entry_id_column: bool | str = False + ) -> pa.Table | None: """Retrieve all records for a given source as a single table.""" source_key = self._get_source_key(source_name, source_id) local_data = self._in_memory_store.get(source_key, {}) - + if not local_data: return None tables_with_keys = [] for key, table in local_data.items(): # Add entry_id column to each table - key_array = pa.array([key] * len(table), type=pa.string()) + key_array = pa.array([key] * len(table), type=pa.large_string()) table_with_key = table.add_column(0, "__entry_id", key_array) tables_with_keys.append(table_with_key) # Concatenate all tables if tables_with_keys: - return pa.concat_tables(tables_with_keys) + combined_table = pa.concat_tables(tables_with_keys) + if not add_entry_id_column: + combined_table = combined_table.drop(columns=["__entry_id"]) + return combined_table return None def get_all_records_as_polars( @@ -212,7 +217,7 @@ def get_records_by_ids( ) -> pa.Table | None: """ Retrieve records by entry IDs as a single table. - + Args: source_name: Name of the data source source_id: ID of the specific dataset within the source @@ -226,7 +231,7 @@ def get_records_by_ids( - str: Include entry ID column with custom name preserve_input_order: If True, return results in the same order as input entry_ids, with null rows for missing entries. If False, return in storage order. 
- + Returns: Arrow table containing all found records, or None if no records found """ @@ -250,18 +255,18 @@ def get_records_by_ids( source_key = self._get_source_key(source_name, source_id) local_data = self._in_memory_store.get(source_key, {}) - + if not local_data: return None # Collect matching tables found_tables = [] found_entry_ids = [] - + if preserve_input_order: # Preserve input order, include nulls for missing entries first_table_schema = None - + for entry_id in entry_ids_list: if entry_id in local_data: table = local_data[entry_id] @@ -270,7 +275,7 @@ def get_records_by_ids( table_with_key = table.add_column(0, "__entry_id", key_array) found_tables.append(table_with_key) found_entry_ids.append(entry_id) - + # Store schema for creating null rows if first_table_schema is None: first_table_schema = table_with_key.schema @@ -281,12 +286,14 @@ def get_records_by_ids( null_data = {} for field in first_table_schema: if field.name == "__entry_id": - null_data[field.name] = pa.array([entry_id], type=field.type) + null_data[field.name] = pa.array( + [entry_id], type=field.type + ) else: # Create null array with proper type null_array = pa.array([None], type=field.type) null_data[field.name] = null_array - + null_table = pa.table(null_data, schema=first_table_schema) found_tables.append(null_table) found_entry_ids.append(entry_id) @@ -315,12 +322,17 @@ def get_records_by_ids( # Remove the __entry_id column column_names = combined_table.column_names if "__entry_id" in column_names: - indices_to_keep = [i for i, name in enumerate(column_names) if name != "__entry_id"] + indices_to_keep = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] combined_table = combined_table.select(indices_to_keep) elif isinstance(add_entry_id_column, str): # Rename __entry_id to custom name schema = combined_table.schema - new_names = [add_entry_id_column if name == "__entry_id" else name for name in schema.names] + new_names = [ + add_entry_id_column if name == "__entry_id" else name + for name in schema.names + ] combined_table = combined_table.rename_columns(new_names) # If add_entry_id_column is True, keep __entry_id as is @@ -336,7 +348,7 @@ def get_records_by_ids_as_polars( ) -> pl.LazyFrame | None: """ Retrieve records by entry IDs as a single Polars LazyFrame. - + Args: source_name: Name of the data source source_id: ID of the specific dataset within the source @@ -350,7 +362,7 @@ def get_records_by_ids_as_polars( - str: Include entry ID column with custom name preserve_input_order: If True, return results in the same order as input entry_ids, with null rows for missing entries. If False, return in storage order. - + Returns: Polars LazyFrame containing all found records, or None if no records found """ @@ -358,42 +370,42 @@ def get_records_by_ids_as_polars( arrow_result = self.get_records_by_ids( source_name, source_id, entry_ids, add_entry_id_column, preserve_input_order ) - + if arrow_result is None: return None - + # Convert to Polars LazyFrame return pl.LazyFrame(arrow_result) def save_to_parquet(self, base_path: str | Path) -> None: """ Save all data to Parquet files in a directory structure. 
- + Directory structure: base_path/source_name/source_id/data.parquet - + Args: base_path: Base directory path where to save the Parquet files """ base_path = Path(base_path) base_path.mkdir(parents=True, exist_ok=True) - + saved_count = 0 - + for source_key, local_data in self._in_memory_store.items(): if not local_data: continue - + # Parse source_name and source_id from the key if ":" not in source_key: logger.warning(f"Invalid source key format: {source_key}, skipping") continue - + source_name, source_id = source_key.split(":", 1) - + # Create directory structure source_dir = base_path / source_name / source_id source_dir.mkdir(parents=True, exist_ok=True) - + # Combine all tables for this source with entry_id column tables_with_keys = [] for entry_id, table in local_data.items(): @@ -401,82 +413,89 @@ def save_to_parquet(self, base_path: str | Path) -> None: key_array = pa.array([entry_id] * len(table), type=pa.string()) table_with_key = table.add_column(0, "__entry_id", key_array) tables_with_keys.append(table_with_key) - + # Concatenate all tables if tables_with_keys: combined_table = pa.concat_tables(tables_with_keys) - + # Save as Parquet file + # TODO: perform safe "atomic" write parquet_path = source_dir / "data.parquet" import pyarrow.parquet as pq + pq.write_table(combined_table, parquet_path) - + saved_count += 1 - logger.debug(f"Saved {len(combined_table)} records for {source_key} to {parquet_path}") - + logger.debug( + f"Saved {len(combined_table)} records for {source_key} to {parquet_path}" + ) + logger.info(f"Saved {saved_count} sources to Parquet files in {base_path}") def load_from_parquet(self, base_path: str | Path) -> None: """ Load data from Parquet files with the expected directory structure. - + Expected structure: base_path/source_name/source_id/data.parquet - + Args: base_path: Base directory path containing the Parquet files """ base_path = Path(base_path) - + if not base_path.exists(): logger.warning(f"Base path {base_path} does not exist") return - + # Clear existing data self._in_memory_store.clear() - + loaded_count = 0 - + # Traverse directory structure: source_name/source_id/ for source_name_dir in base_path.iterdir(): if not source_name_dir.is_dir(): continue - + source_name = source_name_dir.name - + for source_id_dir in source_name_dir.iterdir(): if not source_id_dir.is_dir(): continue - + source_id = source_id_dir.name source_key = self._get_source_key(source_name, source_id) - + # Look for Parquet files in this directory parquet_files = list(source_id_dir.glob("*.parquet")) - + if not parquet_files: logger.debug(f"No Parquet files found in {source_id_dir}") continue - + # Load all Parquet files and combine them all_records = [] - + for parquet_file in parquet_files: try: import pyarrow.parquet as pq + table = pq.read_table(parquet_file) - + # Validate that __entry_id column exists if "__entry_id" not in table.column_names: - logger.warning(f"Parquet file {parquet_file} missing __entry_id column, skipping") + logger.warning( + f"Parquet file {parquet_file} missing __entry_id column, skipping" + ) continue - + all_records.append(table) logger.debug(f"Loaded {len(table)} records from {parquet_file}") - + except Exception as e: logger.error(f"Failed to load Parquet file {parquet_file}: {e}") continue - + # Process all records for this source if all_records: # Combine all tables @@ -484,44 +503,53 @@ def load_from_parquet(self, base_path: str | Path) -> None: combined_table = all_records[0] else: combined_table = pa.concat_tables(all_records) - + # Split 
back into individual records by entry_id local_data = {} entry_ids = combined_table.column("__entry_id").to_pylist() - + # Group records by entry_id entry_id_groups = {} for i, entry_id in enumerate(entry_ids): if entry_id not in entry_id_groups: entry_id_groups[entry_id] = [] entry_id_groups[entry_id].append(i) - + # Extract each entry_id's records for entry_id, indices in entry_id_groups.items(): # Take rows for this entry_id and remove __entry_id column entry_table = combined_table.take(indices) - + # Remove __entry_id column column_names = entry_table.column_names if "__entry_id" in column_names: - indices_to_keep = [i for i, name in enumerate(column_names) if name != "__entry_id"] + indices_to_keep = [ + i + for i, name in enumerate(column_names) + if name != "__entry_id" + ] entry_table = entry_table.select(indices_to_keep) - + local_data[entry_id] = entry_table - + self._in_memory_store[source_key] = local_data loaded_count += 1 - + record_count = len(combined_table) unique_entries = len(entry_id_groups) - logger.debug(f"Loaded {record_count} records ({unique_entries} unique entries) for {source_key}") - + logger.debug( + f"Loaded {record_count} records ({unique_entries} unique entries) for {source_key}" + ) + logger.info(f"Loaded {loaded_count} sources from Parquet files in {base_path}") - + # Log summary of loaded data - total_records = sum(len(local_data) for local_data in self._in_memory_store.values()) + total_records = sum( + len(local_data) for local_data in self._in_memory_store.values() + ) logger.info(f"Total records loaded: {total_records}") + @dataclass class RecordMetadata: """Metadata for a stored record.""" @@ -1634,7 +1662,7 @@ def create_multi_row_record(entry_id: str, num_rows: int = 3) -> pa.Table: store.add_record( "experiments", "dataset_A", valid_entries[0], overwrite_data ) - print(f"✓ Overwrote existing record") + print("✓ Overwrote existing record") # Verify overwrite updated_record = store.get_record( @@ -1648,7 +1676,7 @@ def create_multi_row_record(entry_id: str, num_rows: int = 3) -> pa.Table: # Sync and show final stats store.force_sync() stats = store.get_stats() - print(f"\n=== Final Statistics ===") + print("\n=== Final Statistics ===") print(f"Total records: {stats['total_records']}") print(f"Loaded caches: {stats['loaded_source_caches']}") print(f"Dirty caches: {stats['dirty_caches']}") @@ -1659,6 +1687,377 @@ def create_multi_row_record(entry_id: str, num_rows: int = 3) -> pa.Table: print("\n✓ Single-row constraint testing completed successfully!") +class InMemoryPolarsDataStore: + """ + In-memory Arrow data store using Polars DataFrames for efficient storage and retrieval. + This class provides the same interface as InMemoryArrowDataStore but uses Polars internally + for better performance with large datasets and complex queries. + + Uses dict of Polars DataFrames for efficient storage and retrieval. + Each DataFrame contains all records for a source with an __entry_id column. + """ + + def __init__(self, duplicate_entry_behavior: str = "error"): + """ + Initialize the InMemoryPolarsDataStore. 
+
+        Args:
+            duplicate_entry_behavior: How to handle duplicate entry_ids:
+                - 'error': Raise ValueError when entry_id already exists
+                - 'overwrite': Replace existing entry with new data
+        """
+        # Validate duplicate behavior
+        if duplicate_entry_behavior not in ["error", "overwrite"]:
+            raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'")
+        self.duplicate_entry_behavior = duplicate_entry_behavior
+
+        # Store Polars DataFrames: {source_key: polars_dataframe}
+        # Each DataFrame has an __entry_id column plus user data columns
+        self._in_memory_store: dict[str, pl.DataFrame] = {}
+        logger.info(
+            f"Initialized InMemoryPolarsDataStore with duplicate_entry_behavior='{duplicate_entry_behavior}'"
+        )
+
+    def _get_source_key(self, source_name: str, source_id: str) -> str:
+        """Generate key for source storage."""
+        return f"{source_name}:{source_id}"
+
+    def add_record(
+        self,
+        source_name: str,
+        source_id: str,
+        entry_id: str,
+        arrow_data: pa.Table,
+    ) -> pa.Table:
+        """
+        Add a record to the in-memory store.
+
+        Args:
+            source_name: Name of the data source
+            source_id: ID of the specific dataset within the source
+            entry_id: Unique identifier for this record
+            arrow_data: The Arrow table data to store
+
+        Returns:
+            arrow_data equivalent to having loaded the corresponding entry that was just saved
+
+        Raises:
+            ValueError: If entry_id already exists and duplicate_entry_behavior is 'error'
+        """
+        source_key = self._get_source_key(source_name, source_id)
+
+        # Convert Arrow table to Polars DataFrame and add entry_id column
+        polars_data = cast(pl.DataFrame, pl.from_arrow(arrow_data))
+
+        # Add __entry_id column
+        polars_data = polars_data.with_columns(pl.lit(entry_id).alias("__entry_id"))
+
+        # Check if source exists
+        if source_key not in self._in_memory_store:
+            # First record for this source
+            self._in_memory_store[source_key] = polars_data
+            logger.debug(f"Created new source {source_key} with entry {entry_id}")
+        else:
+            existing_df = self._in_memory_store[source_key]
+
+            # Check for duplicate entry
+            entry_exists = (
+                existing_df.filter(pl.col("__entry_id") == entry_id).shape[0] > 0
+            )
+
+            if entry_exists:
+                if self.duplicate_entry_behavior == "error":
+                    raise ValueError(
+                        f"Entry '{entry_id}' already exists in {source_name}/{source_id}. "
+                        f"Use duplicate_entry_behavior='overwrite' to allow updates."
+                    )
+                else:  # validity of value is checked in constructor so it must be "overwrite"
+                    # Remove existing entry and add new one
+                    existing_df = existing_df.filter(pl.col("__entry_id") != entry_id)
+                    self._in_memory_store[source_key] = pl.concat(
+                        [existing_df, polars_data]
+                    )
+                    logger.debug(f"Overwrote entry {entry_id} in {source_key}")
+            else:
+                # Append new entry
+                try:
+                    self._in_memory_store[source_key] = pl.concat(
+                        [existing_df, polars_data]
+                    )
+                    logger.debug(f"Added entry {entry_id} to {source_key}")
+                except Exception as e:
+                    # Handle schema mismatch
+                    existing_cols = set(existing_df.columns) - {"__entry_id"}
+                    new_cols = set(polars_data.columns) - {"__entry_id"}
+
+                    if existing_cols != new_cols:
+                        raise ValueError(
+                            f"Schema mismatch for {source_key}. "
" + f"Existing columns: {sorted(existing_cols)}, " + f"New columns: {sorted(new_cols)}" + ) from e + else: + raise e + + return arrow_data + + def get_record( + self, source_name: str, source_id: str, entry_id: str + ) -> pa.Table | None: + """Get a specific record.""" + source_key = self._get_source_key(source_name, source_id) + + if source_key not in self._in_memory_store: + return None + + df = self._in_memory_store[source_key] + + # Filter for the specific entry_id + filtered_df = df.filter(pl.col("__entry_id") == entry_id) + + if filtered_df.shape[0] == 0: + return None + + # Remove __entry_id column and convert to Arrow + result_df = filtered_df.drop("__entry_id") + return result_df.to_arrow() + + def get_all_records( + self, source_name: str, source_id: str, add_entry_id_column: bool | str = False + ) -> pa.Table | None: + """Retrieve all records for a given source as a single table.""" + df = self.get_all_records_as_polars( + source_name, source_id, add_entry_id_column=add_entry_id_column + ) + return df.collect().to_arrow() + + def get_all_records_as_polars( + self, source_name: str, source_id: str, add_entry_id_column: bool | str = False + ) -> pl.LazyFrame | None: + """Retrieve all records for a given source as a single Polars LazyFrame.""" + source_key = self._get_source_key(source_name, source_id) + + if source_key not in self._in_memory_store: + return None + + df = self._in_memory_store[source_key] + + if df.shape[0] == 0: + return None + + # perform column selection lazily + df = df.lazy() + + # Handle entry_id column based on parameter + if add_entry_id_column is False: + # Remove __entry_id column + result_df = df.drop("__entry_id") + elif add_entry_id_column is True: + # Keep __entry_id column as is + result_df = df + elif isinstance(add_entry_id_column, str): + # Rename __entry_id to custom name + result_df = df.rename({"__entry_id": add_entry_id_column}) + else: + raise ValueError( + f"add_entry_id_column must be a bool or str but {add_entry_id_column} was given" + ) + + return result_df + + def get_records_by_ids( + self, + source_name: str, + source_id: str, + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + ) -> pa.Table | None: + """ + Retrieve records by entry IDs as a single table. + + Args: + source_name: Name of the data source + source_id: ID of the specific dataset within the source + entry_ids: Entry IDs to retrieve. Can be: + - list[str]: List of entry ID strings + - pl.Series: Polars Series containing entry IDs + - pa.Array: PyArrow Array containing entry IDs + add_entry_id_column: Control entry ID column inclusion: + - False: Don't include entry ID column (default) + - True: Include entry ID column as "__entry_id" + - str: Include entry ID column with custom name + preserve_input_order: If True, return results in the same order as input entry_ids, + with null rows for missing entries. If False, return in storage order. 
+ + Returns: + Arrow table containing all found records, or None if no records found + """ + # Convert input to Polars Series + if isinstance(entry_ids, list): + if not entry_ids: + return None + entry_ids_series = pl.Series("entry_id", entry_ids) + elif isinstance(entry_ids, pl.Series): + if len(entry_ids) == 0: + return None + entry_ids_series = entry_ids + elif isinstance(entry_ids, pa.Array): + if len(entry_ids) == 0: + return None + entry_ids_series = pl.from_arrow(pa.table({"entry_id": entry_ids}))[ + "entry_id" + ] + else: + raise TypeError( + f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + ) + + source_key = self._get_source_key(source_name, source_id) + + if source_key not in self._in_memory_store: + return None + + df = self._in_memory_store[source_key] + + if preserve_input_order: + # Create DataFrame with input order and join to preserve order with nulls + ordered_df = pl.DataFrame({"__entry_id": entry_ids_series}) + result_df = ordered_df.join(df, on="__entry_id", how="left") + else: + # Filter for matching entry_ids (storage order) + result_df = df.filter(pl.col("__entry_id").is_in(entry_ids_series)) + + if result_df.shape[0] == 0: + return None + + # Handle entry_id column based on parameter + if add_entry_id_column is False: + # Remove __entry_id column + result_df = result_df.drop("__entry_id") + elif add_entry_id_column is True: + # Keep __entry_id column as is + pass + elif isinstance(add_entry_id_column, str): + # Rename __entry_id to custom name + result_df = result_df.rename({"__entry_id": add_entry_id_column}) + + return result_df.to_arrow() + + def get_records_by_ids_as_polars( + self, + source_name: str, + source_id: str, + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + ) -> pl.LazyFrame | None: + """ + Retrieve records by entry IDs as a single Polars LazyFrame. + + Args: + source_name: Name of the data source + source_id: ID of the specific dataset within the source + entry_ids: Entry IDs to retrieve. Can be: + - list[str]: List of entry ID strings + - pl.Series: Polars Series containing entry IDs + - pa.Array: PyArrow Array containing entry IDs + add_entry_id_column: Control entry ID column inclusion: + - False: Don't include entry ID column (default) + - True: Include entry ID column as "__entry_id" + - str: Include entry ID column with custom name + preserve_input_order: If True, return results in the same order as input entry_ids, + with null rows for missing entries. If False, return in storage order. 
+ + Returns: + Polars LazyFrame containing all found records, or None if no records found + """ + # Get Arrow result and convert to Polars LazyFrame + arrow_result = self.get_records_by_ids( + source_name, source_id, entry_ids, add_entry_id_column, preserve_input_order + ) + + if arrow_result is None: + return None + + # Convert to Polars LazyFrame + return pl.from_arrow(arrow_result).lazy() + + def entry_exists(self, source_name: str, source_id: str, entry_id: str) -> bool: + """Check if a specific entry exists.""" + source_key = self._get_source_key(source_name, source_id) + + if source_key not in self._in_memory_store: + return False + + df = self._in_memory_store[source_key] + return df.filter(pl.col("__entry_id") == entry_id).shape[0] > 0 + + def list_entries(self, source_name: str, source_id: str) -> set[str]: + """List all entry IDs for a specific source.""" + source_key = self._get_source_key(source_name, source_id) + + if source_key not in self._in_memory_store: + return set() + + df = self._in_memory_store[source_key] + return set(df["__entry_id"].to_list()) + + def list_sources(self) -> set[tuple[str, str]]: + """List all (source_name, source_id) combinations.""" + sources = set() + for source_key in self._in_memory_store.keys(): + if ":" in source_key: + source_name, source_id = source_key.split(":", 1) + sources.add((source_name, source_id)) + return sources + + def clear_source(self, source_name: str, source_id: str) -> None: + """Clear all records for a specific source.""" + source_key = self._get_source_key(source_name, source_id) + if source_key in self._in_memory_store: + del self._in_memory_store[source_key] + logger.debug(f"Cleared source {source_key}") + + def clear_all(self) -> None: + """Clear all records from the store.""" + self._in_memory_store.clear() + logger.info("Cleared all records from store") + + def get_stats(self) -> dict[str, Any]: + """Get comprehensive statistics about the data store.""" + total_records = 0 + total_memory_mb = 0 + source_stats = [] + + for source_key, df in self._in_memory_store.items(): + record_count = df.shape[0] + total_records += record_count + + # Estimate memory usage (rough approximation) + memory_bytes = df.estimated_size() + memory_mb = memory_bytes / (1024 * 1024) + total_memory_mb += memory_mb + + source_stats.append( + { + "source_key": source_key, + "record_count": record_count, + "column_count": df.shape[1] - 1, # Exclude __entry_id + "memory_mb": round(memory_mb, 2), + "columns": [col for col in df.columns if col != "__entry_id"], + } + ) + + return { + "total_records": total_records, + "total_sources": len(self._in_memory_store), + "total_memory_mb": round(total_memory_mb, 2), + "duplicate_entry_behavior": self.duplicate_entry_behavior, + "source_details": source_stats, + } + + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) demo_single_row_constraint() diff --git a/src/orcapod/store/core.py b/src/orcapod/store/dict_data_stores.py similarity index 100% rename from src/orcapod/store/core.py rename to src/orcapod/store/dict_data_stores.py diff --git a/src/orcapod/store/file.py b/src/orcapod/store/file.py deleted file mode 100644 index 0de8aff..0000000 --- a/src/orcapod/store/file.py +++ /dev/null @@ -1,159 +0,0 @@ -import builtins -import contextlib -import inspect -import os -from pathlib import Path -from typing import Callable, Collection, Dict, Optional, Tuple, Union - -from orcapod.types import Packet, PathSet - - -@contextlib.contextmanager -def redirect_open( - mapping: Union[Dict[str, str], 
Callable[[str], Optional[str]]], -): - """ - Context manager to intercept file opening operations. - - Args: - mapping: Either a dictionary mapping original paths to their replacements, - or a function that takes a path string and returns a replacement path - (or None to indicate the file should not be opened). - - Raises: - FileNotFoundError: If using a dictionary and the path is not found in it. - """ - # Track all places that might store an open() function - places_to_patch = [] - - # 1. Standard builtins.open - original_builtin_open = builtins.open - places_to_patch.append((builtins, "open", original_builtin_open)) - - # 2. __builtins__ (could be different in some contexts, especially IPython) - if isinstance(__builtins__, dict) and "open" in __builtins__: - places_to_patch.append((__builtins__, "open", __builtins__["open"])) - - # 3. Current module's globals (for the calling namespace) - current_frame = inspect.currentframe() - if current_frame is not None: - caller_globals = current_frame.f_back.f_globals if current_frame.f_back else {} - if "open" in caller_globals: - places_to_patch.append((caller_globals, "open", caller_globals["open"])) - - # 4. Check for IPython user namespace - try: - import IPython - - ip = IPython.get_ipython() # type: ignore - if ip and "open" in ip.user_ns: - places_to_patch.append((ip.user_ns, "open", ip.user_ns["open"])) - except (ImportError, AttributeError): - pass - - def patched_open(file, *args, **kwargs): - # Convert PathLike objects to string if needed - if hasattr(file, "__fspath__"): - file_path = os.fspath(file) - else: - file_path = str(file) - - if isinstance(mapping, dict): - if file_path in mapping: - redirected_path = mapping[file_path] - print(f"Redirecting '{file_path}' to '{redirected_path}'") - return original_builtin_open(redirected_path, *args, **kwargs) - else: - raise FileNotFoundError( - f"Path '{file_path}' not found in redirection mapping" - ) - else: # mapping is a function - redirected_path = mapping(file_path) - if redirected_path is not None: - print(f"Redirecting '{file_path}' to '{redirected_path}'") - return original_builtin_open(redirected_path, *args, **kwargs) - else: - raise FileNotFoundError(f"Path '{file_path}' could not be redirected") - - # Apply the patch to all places - for obj, attr, _ in places_to_patch: - if isinstance(obj, dict): - obj[attr] = patched_open - else: - setattr(obj, attr, patched_open) - - try: - yield - finally: - # Restore all original functions - for obj, attr, original in places_to_patch: - if isinstance(obj, dict): - obj[attr] = original - else: - setattr(obj, attr, original) - - -def virtual_mount( - packet: Packet, -) -> Tuple[Packet, Dict[str, str], Dict[str, str]]: - """ - Visit all pathset within the packet, and convert them to alternative path - representation. By default, full path is mapped to the file name. If two or - more paths have the same file name, the second one is suffixed with "_1", the - third one with "_2", etc. This is useful for creating a virtual mount point - for a set of files, where the original paths are not important, but the file - names can be used to identify the files. 
- """ - forward_lut = {} # mapping from original path to new path - reverse_lut = {} # mapping from new path to original path - new_packet = {} - - for key, value in packet.items(): - new_packet[key] = convert_pathset(value, forward_lut, reverse_lut) - - return new_packet, forward_lut, reverse_lut - - -# TODO: re-assess the structure of PathSet and consider making it recursive -def convert_pathset(pathset: PathSet, forward_lut, reverse_lut) -> PathSet: - """ - Convert a pathset to a new pathset. forward_lut and reverse_lut are updated - with the new paths. The new paths are created by replacing the original paths - with the new paths in the forward_lut. The reverse_lut is updated with the - original paths. If name already exists, a suffix is added to the new name to avoid - collisions. - """ - if isinstance(pathset, (str, bytes)): - new_name = Path(pathset).name - if new_name in reverse_lut: - # if the name already exists, add a suffix - i = 1 - while f"{new_name}_{i}" in reverse_lut: - i += 1 - new_name = f"{new_name}_{i}" - forward_lut[pathset] = new_name - reverse_lut[new_name] = pathset - return new_name - elif isinstance(pathset, Collection): - return [convert_pathset(p, forward_lut, reverse_lut) for p in pathset] # type: ignore - else: - raise ValueError( - f"Unsupported pathset type: {type(pathset)}. Expected str, bytes, or Collection." - ) - - -class WrappedPath: - def __init__(self, path, name=None): - self.path = Path(path) - if name is None: - name = self.path.name - self.name = name - - def __fspath__(self) -> Union[str, bytes]: - return self.path.__fspath__() - - def __str__(self) -> str: - return self.name - - def __repr__(self) -> str: - return f"WrappedPath({self.path}): {self.name}" diff --git a/src/orcapod/store/file_ops.py b/src/orcapod/store/file_ops.py index 0e34213..4fa6202 100644 --- a/src/orcapod/store/file_ops.py +++ b/src/orcapod/store/file_ops.py @@ -1,10 +1,15 @@ # file_ops.py - Atomic file operations module +import builtins +import contextlib +import inspect import logging import os from pathlib import Path -from orcapod.types import PathLike +from orcapod.types import PathLike, PathSet, Packet +from collections.abc import Collection, Callable + logger = logging.getLogger(__name__) @@ -276,3 +281,154 @@ def is_file_locked(file_path: PathLike) -> bool: except Exception: # Any other exception - assume not locked return False + + +@contextlib.contextmanager +def redirect_open( + mapping: dict[str, str] | Callable[[str], str | None], +): + """ + Context manager to intercept file opening operations. + + Args: + mapping: Either a dictionary mapping original paths to their replacements, + or a function that takes a path string and returns a replacement path + (or None to indicate the file should not be opened). + + Raises: + FileNotFoundError: If using a dictionary and the path is not found in it. + """ + # Track all places that might store an open() function + places_to_patch = [] + + # 1. Standard builtins.open + original_builtin_open = builtins.open + places_to_patch.append((builtins, "open", original_builtin_open)) + + # 2. __builtins__ (could be different in some contexts, especially IPython) + if isinstance(__builtins__, dict) and "open" in __builtins__: + places_to_patch.append((__builtins__, "open", __builtins__["open"])) + + # 3. 
Current module's globals (for the calling namespace) + current_frame = inspect.currentframe() + if current_frame is not None: + caller_globals = current_frame.f_back.f_globals if current_frame.f_back else {} + if "open" in caller_globals: + places_to_patch.append((caller_globals, "open", caller_globals["open"])) + + # 4. Check for IPython user namespace + try: + import IPython + + ip = IPython.get_ipython() # type: ignore + if ip and "open" in ip.user_ns: + places_to_patch.append((ip.user_ns, "open", ip.user_ns["open"])) + except (ImportError, AttributeError): + pass + + def patched_open(file, *args, **kwargs): + # Convert PathLike objects to string if needed + if hasattr(file, "__fspath__"): + file_path = os.fspath(file) + else: + file_path = str(file) + + if isinstance(mapping, dict): + if file_path in mapping: + redirected_path = mapping[file_path] + print(f"Redirecting '{file_path}' to '{redirected_path}'") + return original_builtin_open(redirected_path, *args, **kwargs) + else: + raise FileNotFoundError( + f"Path '{file_path}' not found in redirection mapping" + ) + else: # mapping is a function + redirected_path = mapping(file_path) + if redirected_path is not None: + print(f"Redirecting '{file_path}' to '{redirected_path}'") + return original_builtin_open(redirected_path, *args, **kwargs) + else: + raise FileNotFoundError(f"Path '{file_path}' could not be redirected") + + # Apply the patch to all places + for obj, attr, _ in places_to_patch: + if isinstance(obj, dict): + obj[attr] = patched_open + else: + setattr(obj, attr, patched_open) + + try: + yield + finally: + # Restore all original functions + for obj, attr, original in places_to_patch: + if isinstance(obj, dict): + obj[attr] = original + else: + setattr(obj, attr, original) + + +def virtual_mount( + packet: Packet, +) -> tuple[Packet, dict[str, str], dict[str, str]]: + """ + Visit all pathset within the packet, and convert them to alternative path + representation. By default, full path is mapped to the file name. If two or + more paths have the same file name, the second one is suffixed with "_1", the + third one with "_2", etc. This is useful for creating a virtual mount point + for a set of files, where the original paths are not important, but the file + names can be used to identify the files. + """ + forward_lut = {} # mapping from original path to new path + reverse_lut = {} # mapping from new path to original path + new_packet = {} + + for key, value in packet.items(): + new_packet[key] = convert_pathset(value, forward_lut, reverse_lut) + + return new_packet, forward_lut, reverse_lut + + +# TODO: re-assess the structure of PathSet and consider making it recursive +def convert_pathset(pathset: PathSet, forward_lut, reverse_lut) -> PathSet: + """ + Convert a pathset to a new pathset. forward_lut and reverse_lut are updated + with the new paths. The new paths are created by replacing the original paths + with the new paths in the forward_lut. The reverse_lut is updated with the + original paths. If name already exists, a suffix is added to the new name to avoid + collisions. 
+ """ + if isinstance(pathset, (str, bytes)): + new_name = Path(pathset).name + if new_name in reverse_lut: + # if the name already exists, add a suffix + i = 1 + while f"{new_name}_{i}" in reverse_lut: + i += 1 + new_name = f"{new_name}_{i}" + forward_lut[pathset] = new_name + reverse_lut[new_name] = pathset + return new_name + elif isinstance(pathset, Collection): + return [convert_pathset(p, forward_lut, reverse_lut) for p in pathset] # type: ignore + else: + raise ValueError( + f"Unsupported pathset type: {type(pathset)}. Expected str, bytes, or Collection." + ) + + +class WrappedPath: + def __init__(self, path, name=None): + self.path = Path(path) + if name is None: + name = self.path.name + self.name = name + + def __fspath__(self) -> str | bytes: + return self.path.__fspath__() + + def __str__(self) -> str: + return self.name + + def __repr__(self) -> str: + return f"WrappedPath({self.path}): {self.name}" diff --git a/src/orcapod/store/optimized_memory_store.py b/src/orcapod/store/optimized_memory_store.py new file mode 100644 index 0000000..ff962e9 --- /dev/null +++ b/src/orcapod/store/optimized_memory_store.py @@ -0,0 +1,433 @@ +import polars as pl +import pyarrow as pa +import logging +from typing import Any, Dict, List, Tuple, cast +from collections import defaultdict + +# Module-level logger +logger = logging.getLogger(__name__) + + +class ArrowBatchedPolarsDataStore: + """ + Arrow-batched Polars data store that minimizes Arrow<->Polars conversions. + + Key optimizations: + 1. Keep data in Arrow format during batching + 2. Only convert to Polars when consolidating or querying + 3. Batch Arrow tables and concatenate before conversion + 4. Maintain Arrow-based indexing for fast lookups + 5. Lazy Polars conversion only when needed + """ + + def __init__(self, duplicate_entry_behavior: str = "error", batch_size: int = 100): + """ + Initialize the ArrowBatchedPolarsDataStore. 
+ + Args: + duplicate_entry_behavior: How to handle duplicate entry_ids: + - 'error': Raise ValueError when entry_id already exists + - 'overwrite': Replace existing entry with new data + batch_size: Number of records to batch before consolidating + """ + if duplicate_entry_behavior not in ["error", "overwrite"]: + raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") + + self.duplicate_entry_behavior = duplicate_entry_behavior + self.batch_size = batch_size + + # Arrow batch buffer: {source_key: [(entry_id, arrow_table), ...]} + self._arrow_batches: Dict[str, List[Tuple[str, pa.Table]]] = defaultdict(list) + + # Consolidated Polars store: {source_key: polars_dataframe} + self._polars_store: Dict[str, pl.DataFrame] = {} + + # Entry ID index for fast lookups: {source_key: set[entry_ids]} + self._entry_index: Dict[str, set] = defaultdict(set) + + # Schema cache + self._schema_cache: Dict[str, pa.Schema] = {} + + logger.info( + f"Initialized ArrowBatchedPolarsDataStore with " + f"duplicate_entry_behavior='{duplicate_entry_behavior}', batch_size={batch_size}" + ) + + def _get_source_key(self, source_name: str, source_id: str) -> str: + """Generate key for source storage.""" + return f"{source_name}:{source_id}" + + def _add_entry_id_to_arrow_table(self, table: pa.Table, entry_id: str) -> pa.Table: + """Add entry_id column to Arrow table efficiently.""" + # Create entry_id array with the same length as the table + entry_id_array = pa.array([entry_id] * len(table), type=pa.string()) + + # Add column at the beginning for consistent ordering + return table.add_column(0, "__entry_id", entry_id_array) + + def _consolidate_arrow_batch(self, source_key: str) -> None: + """Consolidate Arrow batch into Polars DataFrame.""" + if source_key not in self._arrow_batches or not self._arrow_batches[source_key]: + return + + logger.debug(f"Consolidating {len(self._arrow_batches[source_key])} Arrow tables for {source_key}") + + # Prepare all Arrow tables with entry_id columns + arrow_tables_with_id = [] + + for entry_id, arrow_table in self._arrow_batches[source_key]: + table_with_id = self._add_entry_id_to_arrow_table(arrow_table, entry_id) + arrow_tables_with_id.append(table_with_id) + + # Concatenate all Arrow tables at once (very fast) + if len(arrow_tables_with_id) == 1: + consolidated_arrow = arrow_tables_with_id[0] + else: + consolidated_arrow = pa.concat_tables(arrow_tables_with_id) + + # Single conversion to Polars + new_polars_df = cast(pl.DataFrame, pl.from_arrow(consolidated_arrow)) + + # Combine with existing Polars DataFrame if it exists + if source_key in self._polars_store: + existing_df = self._polars_store[source_key] + self._polars_store[source_key] = pl.concat([existing_df, new_polars_df]) + else: + self._polars_store[source_key] = new_polars_df + + # Clear the Arrow batch + self._arrow_batches[source_key].clear() + + logger.debug(f"Consolidated to Polars DataFrame with {len(self._polars_store[source_key])} total rows") + + def _force_consolidation(self, source_key: str) -> None: + """Force consolidation of Arrow batches.""" + if source_key in self._arrow_batches and self._arrow_batches[source_key]: + self._consolidate_arrow_batch(source_key) + + def _get_consolidated_dataframe(self, source_key: str) -> pl.DataFrame | None: + """Get consolidated Polars DataFrame, forcing consolidation if needed.""" + self._force_consolidation(source_key) + return self._polars_store.get(source_key) + + def add_record( + self, + source_name: str, + source_id: str, + entry_id: str, + 
arrow_data: pa.Table, + ) -> pa.Table: + """ + Add a record to the store using Arrow batching. + + This is the fastest path - no conversions, just Arrow table storage. + """ + source_key = self._get_source_key(source_name, source_id) + + # Check for duplicate entry + if entry_id in self._entry_index[source_key]: + if self.duplicate_entry_behavior == "error": + raise ValueError( + f"Entry '{entry_id}' already exists in {source_name}/{source_id}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + else: + # Handle overwrite: remove from both Arrow batch and Polars store + # Remove from Arrow batch + self._arrow_batches[source_key] = [ + (eid, table) for eid, table in self._arrow_batches[source_key] + if eid != entry_id + ] + + # Remove from Polars store if it exists + if source_key in self._polars_store: + self._polars_store[source_key] = self._polars_store[source_key].filter( + pl.col("__entry_id") != entry_id + ) + + # Schema validation (cached) + if source_key in self._schema_cache: + if not self._schema_cache[source_key].equals(arrow_data.schema): + raise ValueError( + f"Schema mismatch for {source_key}. " + f"Expected: {self._schema_cache[source_key]}, " + f"Got: {arrow_data.schema}" + ) + else: + self._schema_cache[source_key] = arrow_data.schema + + # Add to Arrow batch (no conversion yet!) + self._arrow_batches[source_key].append((entry_id, arrow_data)) + self._entry_index[source_key].add(entry_id) + + # Consolidate if batch is full + if len(self._arrow_batches[source_key]) >= self.batch_size: + self._consolidate_arrow_batch(source_key) + + logger.debug(f"Added entry {entry_id} to Arrow batch for {source_key}") + return arrow_data + + def get_record( + self, source_name: str, source_id: str, entry_id: str + ) -> pa.Table | None: + """Get a specific record with optimized lookup.""" + source_key = self._get_source_key(source_name, source_id) + + # Quick existence check + if entry_id not in self._entry_index[source_key]: + return None + + # Check Arrow batch first (most recent data) + for batch_entry_id, arrow_table in self._arrow_batches[source_key]: + if batch_entry_id == entry_id: + return arrow_table + + # Check consolidated Polars store + df = self._get_consolidated_dataframe(source_key) + if df is None: + return None + + # Filter and convert back to Arrow + filtered_df = df.filter(pl.col("__entry_id") == entry_id).drop("__entry_id") + + if filtered_df.height == 0: + return None + + return filtered_df.to_arrow() + + def get_all_records( + self, source_name: str, source_id: str, add_entry_id_column: bool | str = False + ) -> pa.Table | None: + """Retrieve all records as a single Arrow table.""" + source_key = self._get_source_key(source_name, source_id) + + # Force consolidation to include all data + df = self._get_consolidated_dataframe(source_key) + if df is None or df.height == 0: + return None + + # Handle entry_id column + if add_entry_id_column is False: + result_df = df.drop("__entry_id") + elif add_entry_id_column is True: + result_df = df + elif isinstance(add_entry_id_column, str): + result_df = df.rename({"__entry_id": add_entry_id_column}) + else: + result_df = df.drop("__entry_id") + + return result_df.to_arrow() + + def get_all_records_as_polars( + self, source_name: str, source_id: str + ) -> pl.LazyFrame | None: + """Retrieve all records as a Polars LazyFrame.""" + source_key = self._get_source_key(source_name, source_id) + + df = self._get_consolidated_dataframe(source_key) + if df is None or df.height == 0: + return None + + return 
df.drop("__entry_id").lazy() + + def get_records_by_ids( + self, + source_name: str, + source_id: str, + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + ) -> pa.Table | None: + """Retrieve records by entry IDs efficiently.""" + # Convert input to list for processing + if isinstance(entry_ids, list): + if not entry_ids: + return None + entry_ids_list = entry_ids + elif isinstance(entry_ids, pl.Series): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_list() + elif isinstance(entry_ids, pa.Array): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_pylist() + else: + raise TypeError(f"entry_ids must be list[str], pl.Series, or pa.Array") + + source_key = self._get_source_key(source_name, source_id) + + # Quick filter using index + existing_entries = [ + entry_id for entry_id in entry_ids_list + if entry_id in self._entry_index[source_key] + ] + + if not existing_entries and not preserve_input_order: + return None + + # Collect from Arrow batch first + batch_tables = [] + found_in_batch = set() + + for entry_id, arrow_table in self._arrow_batches[source_key]: + if entry_id in entry_ids_list: + table_with_id = self._add_entry_id_to_arrow_table(arrow_table, entry_id) + batch_tables.append(table_with_id) + found_in_batch.add(entry_id) + + # Get remaining from consolidated store + remaining_ids = [eid for eid in existing_entries if eid not in found_in_batch] + + consolidated_tables = [] + if remaining_ids: + df = self._get_consolidated_dataframe(source_key) + if df is not None: + if preserve_input_order: + ordered_df = pl.DataFrame({"__entry_id": entry_ids_list}) + result_df = ordered_df.join(df, on="__entry_id", how="left") + else: + result_df = df.filter(pl.col("__entry_id").is_in(remaining_ids)) + + if result_df.height > 0: + consolidated_tables.append(result_df.to_arrow()) + + # Combine all results + all_tables = batch_tables + consolidated_tables + + if not all_tables: + return None + + # Concatenate Arrow tables + if len(all_tables) == 1: + result_table = all_tables[0] + else: + result_table = pa.concat_tables(all_tables) + + # Handle entry_id column + if add_entry_id_column is False: + # Remove __entry_id column + column_names = result_table.column_names + if "__entry_id" in column_names: + indices = [i for i, name in enumerate(column_names) if name != "__entry_id"] + result_table = result_table.select(indices) + elif isinstance(add_entry_id_column, str): + # Rename __entry_id column + schema = result_table.schema + new_names = [ + add_entry_id_column if name == "__entry_id" else name + for name in schema.names + ] + result_table = result_table.rename_columns(new_names) + + return result_table + + def get_records_by_ids_as_polars( + self, + source_name: str, + source_id: str, + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + ) -> pl.LazyFrame | None: + """Retrieve records by entry IDs as Polars LazyFrame.""" + arrow_result = self.get_records_by_ids( + source_name, source_id, entry_ids, add_entry_id_column, preserve_input_order + ) + + if arrow_result is None: + return None + + pl_result = cast(pl.DataFrame, pl.from_arrow(arrow_result)) + + return pl_result.lazy() + + def entry_exists(self, source_name: str, source_id: str, entry_id: str) -> bool: + """Check if entry exists using the index.""" + source_key = self._get_source_key(source_name, source_id) + return entry_id in 
self._entry_index[source_key] + + def list_entries(self, source_name: str, source_id: str) -> set[str]: + """List all entry IDs using the index.""" + source_key = self._get_source_key(source_name, source_id) + return self._entry_index[source_key].copy() + + def list_sources(self) -> set[tuple[str, str]]: + """List all source combinations.""" + sources = set() + for source_key in self._entry_index.keys(): + if ":" in source_key: + source_name, source_id = source_key.split(":", 1) + sources.add((source_name, source_id)) + return sources + + def force_consolidation(self) -> None: + """Force consolidation of all Arrow batches.""" + for source_key in list(self._arrow_batches.keys()): + self._force_consolidation(source_key) + logger.info("Forced consolidation of all Arrow batches") + + def clear_source(self, source_name: str, source_id: str) -> None: + """Clear all data for a source.""" + source_key = self._get_source_key(source_name, source_id) + + if source_key in self._arrow_batches: + del self._arrow_batches[source_key] + if source_key in self._polars_store: + del self._polars_store[source_key] + if source_key in self._entry_index: + del self._entry_index[source_key] + if source_key in self._schema_cache: + del self._schema_cache[source_key] + + logger.debug(f"Cleared source {source_key}") + + def clear_all(self) -> None: + """Clear all data.""" + self._arrow_batches.clear() + self._polars_store.clear() + self._entry_index.clear() + self._schema_cache.clear() + logger.info("Cleared all data") + + def get_stats(self) -> dict[str, Any]: + """Get comprehensive statistics.""" + total_records = sum(len(entries) for entries in self._entry_index.values()) + total_batched = sum(len(batch) for batch in self._arrow_batches.values()) + total_consolidated = sum( + len(df) for df in self._polars_store.values() + ) if self._polars_store else 0 + + source_stats = [] + for source_key in self._entry_index.keys(): + record_count = len(self._entry_index[source_key]) + batched_count = len(self._arrow_batches.get(source_key, [])) + consolidated_count = 0 + + if source_key in self._polars_store: + consolidated_count = len(self._polars_store[source_key]) + + source_stats.append({ + "source_key": source_key, + "total_records": record_count, + "batched_records": batched_count, + "consolidated_records": consolidated_count, + }) + + return { + "total_records": total_records, + "total_sources": len(self._entry_index), + "total_batched": total_batched, + "total_consolidated": total_consolidated, + "batch_size": self.batch_size, + "duplicate_entry_behavior": self.duplicate_entry_behavior, + "source_details": source_stats, + } + + def optimize_for_reads(self) -> None: + """Optimize for read operations by consolidating all batches.""" + logger.info("Optimizing for reads - consolidating all Arrow batches...") + self.force_consolidation() + # Clear Arrow batches to save memory + self._arrow_batches.clear() + logger.info("Optimization complete") \ No newline at end of file diff --git a/src/orcapod/store/transfer.py b/src/orcapod/store/transfer_data_store.py similarity index 100% rename from src/orcapod/store/transfer.py rename to src/orcapod/store/transfer_data_store.py diff --git a/src/orcapod/store/types.py b/src/orcapod/store/types.py index 6c1b5af..49d9a70 100644 --- a/src/orcapod/store/types.py +++ b/src/orcapod/store/types.py @@ -45,6 +45,7 @@ def add_record( source_id: str, entry_id: str, arrow_data: pa.Table, + ignore_duplicate: bool = False, ) -> pa.Table: ... 
def get_record( diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index f372259..cbcfffc 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,52 +1,29 @@ # src/orcabridge/types.py -import os -from collections.abc import Collection, Mapping -from pathlib import Path -from typing import Any, Protocol -from typing_extensions import TypeAlias -from .core import TypeSpec, TypeHandler - - -SUPPORTED_PYTHON_TYPES = (str, int, float, bool, bytes) - -# Convenience alias for anything pathlike -PathLike = str | os.PathLike - -# an (optional) string or a collection of (optional) string values -# Note that TagValue can be nested, allowing for an arbitrary depth of nested lists -TagValue: TypeAlias = str | None | Collection["TagValue"] - -# the top level tag is a mapping from string keys to values that can be a string or -# an arbitrary depth of nested list of strings or None -Tag: TypeAlias = Mapping[str, TagValue] - -# a pathset is a path or an arbitrary depth of nested list of paths -PathSet: TypeAlias = PathLike | Collection[PathLike | None] - -# Simple data types that we support (with clear Polars correspondence) -SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes - -ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike - -# Extended data values that can be stored in packets -# Either the original PathSet or one of our supported simple data types -DataValue: TypeAlias = PathSet | SupportedNativePythonData | Collection["DataValue"] - - -# a packet is a mapping from string keys to data values -Packet: TypeAlias = Mapping[str, DataValue] - -# a batch is a tuple of a tag and a list of packets -Batch: TypeAlias = tuple[Tag, Collection[Packet]] - - -class PodFunction(Protocol): - """ - A function suitable to be used in a FunctionPod. - It takes one or more named arguments, each corresponding to either: - - A path to a file or directory (PathSet) - for backward compatibility - - A simple data value (str, int, float, bool, bytes, Path) - and returns either None, a single value, or a list of values - """ - - def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... +from .core import Tag, Packet, TypeSpec, PathLike, PathSet, PodFunction +from .registry import TypeRegistry +from .handlers import PathHandler, UUIDHandler, DateTimeHandler +from . import handlers +from . 
import typespec + + +# Create default registry and register handlers +default_registry = TypeRegistry() + +# Register with semantic names - registry extracts supported types automatically +default_registry.register("path", PathHandler()) +default_registry.register("uuid", UUIDHandler()) +default_registry.register( + "datetime", DateTimeHandler() +) # Registers for datetime, date, time + +__all__ = [ + "default_registry", + "Tag", + "Packet", + "TypeSpec", + "PathLike", + "PathSet", + "PodFunction", + "handlers", + "typespec", +] diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 5822f87..097750e 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -1,6 +1,8 @@ -from typing import Protocol, Any, TypeAlias, Mapping +from typing import Protocol, Any, TypeAlias import pyarrow as pa from dataclasses import dataclass +import os +from collections.abc import Collection, Mapping # TODO: reconsider the need for this dataclass as its information is superfluous @@ -20,6 +22,51 @@ class TypeInfo: ] # Mapping of parameter names to their types +SUPPORTED_PYTHON_TYPES = (str, int, float, bool, bytes) + +# Convenience alias for anything pathlike +PathLike = str | os.PathLike + +# an (optional) string or a collection of (optional) string values +# Note that TagValue can be nested, allowing for an arbitrary depth of nested lists +TagValue: TypeAlias = str | None | Collection["TagValue"] + +# the top level tag is a mapping from string keys to values that can be a string or +# an arbitrary depth of nested list of strings or None +Tag: TypeAlias = Mapping[str, TagValue] + +# a pathset is a path or an arbitrary depth of nested list of paths +PathSet: TypeAlias = PathLike | Collection[PathLike | None] + +# Simple data types that we support (with clear Polars correspondence) +SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes + +ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike + +# Extended data values that can be stored in packets +# Either the original PathSet or one of our supported simple data types +DataValue: TypeAlias = PathSet | SupportedNativePythonData | Collection["DataValue"] + + +# a packet is a mapping from string keys to data values +Packet: TypeAlias = Mapping[str, DataValue] + +# a batch is a tuple of a tag and a list of packets +Batch: TypeAlias = tuple[Tag, Collection[Packet]] + + +class PodFunction(Protocol): + """ + A function suitable to be used in a FunctionPod. + It takes one or more named arguments, each corresponding to either: + - A path to a file or directory (PathSet) - for backward compatibility + - A simple data value (str, int, float, bool, bytes, Path) + and returns either None, a single value, or a list of values + """ + + def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... + + class TypeHandler(Protocol): """Protocol for handling conversion between Python types and underlying Arrow data types used for storage. 
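To make the PodFunction protocol above concrete, here is a minimal, hypothetical sketch of a function and packet that satisfy it. Only Tag, Packet, and the protocol shape come from orcapod.types; the function name and the values are invented for illustration.

    from orcapod.types import Packet, Tag

    # Hypothetical pod function: each named argument is filled from a packet entry,
    # and the return value is a single DataValue (here, a str).
    def make_greeting(name: str, repeat: int = 1) -> str:
        return " ".join([f"hello {name}"] * repeat)

    tag: Tag = {"session": "demo"}                  # tags map str keys to (nested) string values
    packet: Packet = {"name": "orca", "repeat": 2}  # packets map str keys to DataValue entries
    result = make_greeting(**packet)                # "hello orca hello orca"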
diff --git a/src/orcapod/types/default.py b/src/orcapod/types/default.py deleted file mode 100644 index d41e577..0000000 --- a/src/orcapod/types/default.py +++ /dev/null @@ -1,18 +0,0 @@ -from .registry import TypeRegistry -from .handlers import ( - PathHandler, - UUIDHandler, - SimpleMappingHandler, - DateTimeHandler, -) -import pyarrow as pa - -# Create default registry and register handlers -default_registry = TypeRegistry() - -# Register with semantic names - registry extracts supported types automatically -default_registry.register("path", PathHandler()) -default_registry.register("uuid", UUIDHandler()) -default_registry.register( - "datetime", DateTimeHandler() -) # Registers for datetime, date, time diff --git a/src/orcapod/types/registry.py b/src/orcapod/types/registry.py index 0dafda5..6b56183 100644 --- a/src/orcapod/types/registry.py +++ b/src/orcapod/types/registry.py @@ -1,4 +1,4 @@ -from collections.abc import Callable, Collection, Sequence +from collections.abc import Callable, Collection, Sequence, Mapping import logging from optparse import Values from typing import Any @@ -156,9 +156,7 @@ def _to_storage_packet(self, packet: Packet) -> dict[str, Any]: self._check_key_consistency(packet_keys) # Convert each value - storage_packet: dict[str, Any] = ( - packet.copy() - ) # Start with a copy of the packet + storage_packet: dict[str, Any] = dict(packet) # Start with a copy of the packet for key, handler in self.keys_with_handlers: try: @@ -168,7 +166,7 @@ def _to_storage_packet(self, packet: Packet) -> dict[str, Any]: return storage_packet - def _from_storage_packet(self, storage_packet: dict[str, Any]) -> Packet: + def _from_storage_packet(self, storage_packet: Mapping[str, Any]) -> Packet: """Convert storage packet back to Python packet. Args: @@ -188,7 +186,7 @@ def _from_storage_packet(self, storage_packet: dict[str, Any]) -> Packet: self._check_key_consistency(storage_keys) # Convert each value back to Python type - packet: Packet = storage_packet.copy() + packet: Packet = dict(storage_packet) for key, handler in self.keys_with_handlers: try: diff --git a/src/orcapod/types/inference.py b/src/orcapod/types/typespec.py similarity index 98% rename from src/orcapod/types/inference.py rename to src/orcapod/types/typespec.py index 2f18f39..eb5be89 100644 --- a/src/orcapod/types/inference.py +++ b/src/orcapod/types/typespec.py @@ -1,20 +1,20 @@ -# Library of functions for inferring types for FunctionPod input and output parameters. 
+# Library of functions for working with TypeSpecs and for extracting TypeSpecs from a function's signature from collections.abc import Callable, Collection, Sequence -from typing import get_origin, get_args, TypeAlias +from typing import get_origin, get_args from .core import TypeSpec import inspect import logging - logger = logging.getLogger(__name__) def verify_against_typespec(packet: dict, typespec: TypeSpec) -> bool: """Verify that the dictionary's types match the expected types in the typespec.""" from beartype.door import is_bearable + # verify that packet contains no keys not in typespec if set(packet.keys()) - set(typespec.keys()): logger.warning( @@ -40,6 +40,7 @@ def check_typespec_compatibility( incoming_types: TypeSpec, receiving_types: TypeSpec ) -> bool: from beartype.door import is_subhint + for key, type_info in incoming_types.items(): if key not in receiving_types: logger.warning(f"Key '{key}' not found in parameter types.") @@ -52,7 +53,7 @@ def check_typespec_compatibility( return True -def extract_function_data_types( +def extract_function_typespecs( func: Callable, output_keys: Collection[str], input_types: TypeSpec | None = None, diff --git a/uv.lock b/uv.lock index 589ebc2..ba522ac 100644 --- a/uv.lock +++ b/uv.lock @@ -1230,7 +1230,7 @@ requires-dist = [ { name = "matplotlib", specifier = ">=3.10.3" }, { name = "networkx" }, { name = "pandas", specifier = ">=2.2.3" }, - { name = "polars", specifier = ">=1.30.0" }, + { name = "polars", specifier = ">=1.31.0" }, { name = "pyarrow", specifier = ">=20.0.0" }, { name = "pyyaml", specifier = ">=6.0.2" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" }, @@ -1436,16 +1436,16 @@ wheels = [ [[package]] name = "polars" -version = "1.30.0" +version = "1.31.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/82/b6/8dbdf626c0705a57f052708c9fc0860ffc2aa97955930d5faaf6a66fcfd3/polars-1.30.0.tar.gz", hash = "sha256:dfe94ae84a5efd9ba74e616e3e125b24ca155494a931890a8f17480737c4db45", size = 4668318, upload-time = "2025-05-21T13:33:24.175Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/f5/de1b5ecd7d0bd0dd87aa392937f759f9cc3997c5866a9a7f94eabf37cd48/polars-1.31.0.tar.gz", hash = "sha256:59a88054a5fc0135386268ceefdbb6a6cc012d21b5b44fed4f1d3faabbdcbf32", size = 4681224, upload-time = "2025-06-18T12:00:46.24Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/40/48/e9b2cb379abcc9f7aff2e701098fcdb9fe6d85dc4ad4cec7b35d39c70951/polars-1.30.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:4c33bc97c29b7112f0e689a2f8a33143973a3ff466c70b25c7fd1880225de6dd", size = 35704342, upload-time = "2025-05-21T13:32:22.996Z" }, - { url = "https://files.pythonhosted.org/packages/36/ca/f545f61282f75eea4dfde4db2944963dcd59abd50c20e33a1c894da44dad/polars-1.30.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:e3d05914c364b8e39a5b10dcf97e84d76e516b3b1693880bf189a93aab3ca00d", size = 32459857, upload-time = "2025-05-21T13:32:27.728Z" }, - { url = "https://files.pythonhosted.org/packages/76/20/e018cd87d7cb6f8684355f31f4e193222455a6e8f7b942f4a2934f5969c7/polars-1.30.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a52af3862082b868c1febeae650af8ae8a2105d2cb28f0449179a7b44f54ccf", size = 36267243, upload-time = "2025-05-21T13:32:31.796Z" }, - { url = "https://files.pythonhosted.org/packages/cb/e7/b88b973021be07b13d91b9301cc14392c994225ef5107a32a8ffd3fd6424/polars-1.30.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = 
"sha256:ffb3ef133454275d4254442257c5f71dd6e393ce365c97997dadeb6fa9d6d4b5", size = 33416871, upload-time = "2025-05-21T13:32:35.077Z" }, - { url = "https://files.pythonhosted.org/packages/dd/7c/d46d4381adeac537b8520b653dc30cb8b7edbf59883d71fbb989e9005de1/polars-1.30.0-cp39-abi3-win_amd64.whl", hash = "sha256:c26b633a9bd530c5fc09d317fca3bb3e16c772bd7df7549a9d8ec1934773cc5d", size = 36363630, upload-time = "2025-05-21T13:32:38.286Z" }, - { url = "https://files.pythonhosted.org/packages/fb/b5/5056d0c12aadb57390d0627492bef8b1abf3549474abb9ae0fd4e2bfa885/polars-1.30.0-cp39-abi3-win_arm64.whl", hash = "sha256:476f1bde65bc7b4d9f80af370645c2981b5798d67c151055e58534e89e96f2a8", size = 32643590, upload-time = "2025-05-21T13:32:42.107Z" }, + { url = "https://files.pythonhosted.org/packages/3d/6e/bdd0937653c1e7a564a09ae3bc7757ce83fedbf19da600c8b35d62c0182a/polars-1.31.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ccc68cd6877deecd46b13cbd2663ca89ab2a2cb1fe49d5cfc66a9cef166566d9", size = 34511354, upload-time = "2025-06-18T11:59:40.048Z" }, + { url = "https://files.pythonhosted.org/packages/77/fe/81aaca3540c1a5530b4bc4fd7f1b6f77100243d7bb9b7ad3478b770d8b3e/polars-1.31.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:a94c5550df397ad3c2d6adc212e59fd93d9b044ec974dd3653e121e6487a7d21", size = 31377712, upload-time = "2025-06-18T11:59:45.104Z" }, + { url = "https://files.pythonhosted.org/packages/b8/d9/5e2753784ea30d84b3e769a56f5e50ac5a89c129e87baa16ac0773eb4ef7/polars-1.31.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ada7940ed92bea65d5500ae7ac1f599798149df8faa5a6db150327c9ddbee4f1", size = 35050729, upload-time = "2025-06-18T11:59:48.538Z" }, + { url = "https://files.pythonhosted.org/packages/20/e8/a6bdfe7b687c1fe84bceb1f854c43415eaf0d2fdf3c679a9dc9c4776e462/polars-1.31.0-cp39-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:b324e6e3e8c6cc6593f9d72fe625f06af65e8d9d47c8686583585533a5e731e1", size = 32260836, upload-time = "2025-06-18T11:59:52.543Z" }, + { url = "https://files.pythonhosted.org/packages/6e/f6/9d9ad9dc4480d66502497e90ce29efc063373e1598f4bd9b6a38af3e08e7/polars-1.31.0-cp39-abi3-win_amd64.whl", hash = "sha256:3fd874d3432fc932863e8cceff2cff8a12a51976b053f2eb6326a0672134a632", size = 35156211, upload-time = "2025-06-18T11:59:55.805Z" }, + { url = "https://files.pythonhosted.org/packages/40/4b/0673a68ac4d6527fac951970e929c3b4440c654f994f0c957bd5556deb38/polars-1.31.0-cp39-abi3-win_arm64.whl", hash = "sha256:62ef23bb9d10dca4c2b945979f9a50812ac4ace4ed9e158a6b5d32a7322e6f75", size = 31469078, upload-time = "2025-06-18T11:59:59.242Z" }, ] [[package]] From 09eb9473cd35af5169b2c267f0ab8421353ecf29 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 24 Jun 2025 22:26:10 +0000 Subject: [PATCH 004/224] refactor: implement ContentHashableBase --- src/orcapod/core/base.py | 21 ++++----- src/orcapod/core/pod.py | 5 ++- src/orcapod/hashing/__init__.py | 2 + src/orcapod/hashing/content_hashable.py | 57 +++++++++++++++++++++++++ src/orcapod/hashing/core.py | 16 ++++--- src/orcapod/hashing/types.py | 2 + 6 files changed, 82 insertions(+), 21 deletions(-) create mode 100644 src/orcapod/hashing/content_hashable.py diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 664352d..0dd45ad 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -5,7 +5,10 @@ from typing import Any -from orcapod.hashing import HashableMixin +from orcapod.hashing import HashableMixin, ObjectHasher +from orcapod.hashing import get_default_object_hasher + +from orcapod.hashing import ContentHashableBase from orcapod.types import Packet, Tag, TypeSpec from orcapod.utils.stream_utils import get_typespec @@ -15,7 +18,7 @@ logger = logging.getLogger(__name__) -class Kernel(ABC, HashableMixin): +class Kernel(ABC, ContentHashableBase): """ Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. It is the base class for all computations and transformations that can be performed on a collection of streams @@ -27,7 +30,7 @@ class Kernel(ABC, HashableMixin): for computational graph tracking. """ - def __init__(self, label: str | None = None, skip_tracking: bool = False, **kwargs) -> None: + def __init__(self, label: str | None = None, skip_tracking: bool = False,**kwargs) -> None: super().__init__(**kwargs) self._label = label self._skip_tracking = skip_tracking @@ -227,7 +230,7 @@ def record(self, invocation: "Invocation") -> None: ... # This is NOT an abstract class, but rather a concrete class that # represents an invocation of a kernel on a collection of streams. -class Invocation(HashableMixin): +class Invocation(ContentHashableBase): """ This class represents an invocation of a kernel on a collection of streams. It contains the kernel and the streams that were used in the invocation. @@ -244,20 +247,12 @@ def __init__( self.kernel = kernel self.streams = streams - def __hash__(self) -> int: - return super().__hash__() - def __repr__(self) -> str: return f"Invocation(kernel={self.kernel}, streams={self.streams})" def __str__(self) -> str: return f"Invocation[ID:{self.__hash__()}]({self.kernel}, {self.streams})" - def __eq__(self, other: Any) -> bool: - if not isinstance(other, Invocation): - return False - return hash(self) == hash(other) - def __lt__(self, other: Any) -> bool: if not isinstance(other, Invocation): return NotImplemented @@ -294,7 +289,7 @@ def identity_structure(self) -> int: return self.kernel.identity_structure(*self.streams) -class Stream(ABC, HashableMixin): +class Stream(ABC, ContentHashableBase): """ A stream is a collection of tagged-packets that are generated by an operation. The stream is iterable and can be used to access the packets in the stream. 
diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index 582fa85..77d1610 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -228,8 +228,9 @@ def __repr__(self) -> str: return f"FunctionPod:{self.function!r}" def __str__(self) -> str: - func_sig = get_function_signature(self.function) - return f"FunctionPod:{func_sig} ⇒ {self.output_keys}" + include_module = self.function.__module__ != "__main__" + func_sig = get_function_signature(self.function, name_override=self.function_name, include_module=include_module) + return f"FunctionPod:{func_sig}" def call(self, tag, packet) -> tuple[Tag, Packet | None]: if not self.is_active(): diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index 98a15da..f95b0f7 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -24,6 +24,7 @@ FunctionInfoExtractor, CompositeFileHasher, ) +from .content_hashable import ContentHashableBase __all__ = [ "FileHasher", @@ -46,4 +47,5 @@ "get_default_composite_file_hasher", "get_default_object_hasher", "get_default_arrow_hasher", + "ContentHashableBase", ] diff --git a/src/orcapod/hashing/content_hashable.py b/src/orcapod/hashing/content_hashable.py new file mode 100644 index 0000000..f3fc929 --- /dev/null +++ b/src/orcapod/hashing/content_hashable.py @@ -0,0 +1,57 @@ + +from .types import ObjectHasher +from .defaults import get_default_object_hasher +from typing import Any + +class ContentHashableBase: + def __init__(self, object_hasher: ObjectHasher | None = None) -> None: + """ + Initialize the ContentHashable with an optional ObjectHasher. + + Args: + object_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. + """ + self.object_hasher = object_hasher or get_default_object_hasher() + + + def identity_structure(self) -> Any: + """ + Return a structure that represents the identity of this object. + + Override this method in your subclass to provide a stable representation + of your object's content. The structure should contain all fields that + determine the object's identity. + + Returns: + Any: A structure representing this object's content, or None to use default hash + """ + return None + + + def __hash__(self) -> int: + """ + Hash implementation that uses the identity structure if provided, + otherwise falls back to the superclass's hash method. + + Returns: + int: A hash value based on either content or identity + """ + # Get the identity structure + structure = self.identity_structure() + + return self.object_hasher.hash_to_int(structure) + + def __eq__(self, other: object) -> bool: + """ + Equality check that compares the identity structures of two objects. + + Args: + other (object): The object to compare against. + + Returns: + bool: True if both objects have the same identity structure, False otherwise. + """ + if not isinstance(other, ContentHashableBase): + return NotImplemented + + return self.identity_structure() == other.identity_structure() \ No newline at end of file diff --git a/src/orcapod/hashing/core.py b/src/orcapod/hashing/core.py index 66b4e4d..6e40bde 100644 --- a/src/orcapod/hashing/core.py +++ b/src/orcapod/hashing/core.py @@ -832,6 +832,7 @@ def get_function_signature( name_override: str | None = None, include_defaults: bool = True, include_module: bool = True, + output_names: Collection[str] | None = None ) -> str: """ Get a stable string representation of a function's signature. 
@@ -847,14 +848,14 @@ def get_function_signature( sig = inspect.signature(func) # Build the signature string - parts = [] + parts = {} # Add module if requested if include_module and hasattr(func, "__module__"): - parts.append(f"module:{func.__module__}") + parts["module"] = func.__module__ # Add function name - parts.append(f"name:{name_override or func.__name__}") + parts["name"] = name_override or func.__name__ # Add parameters param_strs = [] @@ -864,13 +865,16 @@ def get_function_signature( param_str = param_str.split("=")[0].strip() param_strs.append(param_str) - parts.append(f"params:({', '.join(param_strs)})") + parts["params"] = f"({', '.join(param_strs)})" # Add return annotation if present if sig.return_annotation is not inspect.Signature.empty: - parts.append(f"returns:{sig.return_annotation}") + parts["returns"] = sig.return_annotation - return " ".join(parts) + fn_string = f"{parts["module"] + "." if "module" in parts else ""}{parts["name"]}{parts["params"]}" + if "returns" in parts: + fn_string = fn_string + f"-> {str(parts["returns"])}" + return fn_string def _is_in_string(line, pos): diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index 36155bb..abae409 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -140,3 +140,5 @@ def extract_function_info( input_typespec: TypeSpec | None = None, output_typespec: TypeSpec | None = None, ) -> dict[str, Any]: ... + + From bd3c7a871641fc128ce2d3e37bede5b14f62b301 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 25 Jun 2025 03:10:29 +0000 Subject: [PATCH 005/224] refactor: significantly clean up label logic --- src/orcapod/core/base.py | 64 +++++++++---------------- src/orcapod/core/operators.py | 14 +++++- src/orcapod/core/streams.py | 35 ++++++++++++-- src/orcapod/core/tracker.py | 47 ++++++++++++++---- src/orcapod/hashing/content_hashable.py | 31 +++++++++++- src/orcapod/hashing/core.py | 6 +++ src/orcapod/pipeline/pipeline.py | 52 +++++++++++++++----- src/orcapod/pipeline/wrappers.py | 14 +++--- 8 files changed, 184 insertions(+), 79 deletions(-) diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 0dd45ad..9aa8b4d 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -30,24 +30,12 @@ class Kernel(ABC, ContentHashableBase): for computational graph tracking. """ - def __init__(self, label: str | None = None, skip_tracking: bool = False,**kwargs) -> None: + def __init__(self, label: str | None = None, skip_tracking: bool = False, **kwargs) -> None: super().__init__(**kwargs) self._label = label self._skip_tracking = skip_tracking - @property - def label(self) -> str: - """ - Returns a human-readable label for this kernel. - Default implementation returns the provided label or class name if no label was provided. 
- """ - if self._label: - return self._label - return self.__class__.__name__ - - @label.setter - def label(self, label: str) -> None: - self._label = label + def pre_forward_hook( self, *streams: "SyncStream", **kwargs @@ -68,7 +56,9 @@ def post_forward_hook(self, output_stream: "SyncStream", **kwargs) -> "SyncStrea return output_stream - def __call__(self, *streams: "SyncStream", **kwargs) -> "SyncStream": + def __call__(self, *streams: "SyncStream", label:str|None = None, **kwargs) -> "SyncStream": + if label is not None: + self.label = label # Special handling of Source: trigger call on source if passed as stream normalized_streams = [ stream() if isinstance(stream, Source) else stream for stream in streams @@ -243,10 +233,19 @@ def __init__( kernel: Kernel, # TODO: technically this should be Stream to stay consistent with Stream interface. Update to Stream when AsyncStream is implemented streams: Collection["SyncStream"], + **kwargs, ) -> None: + super().__init__(**kwargs) self.kernel = kernel self.streams = streams + def computed_label(self) -> str | None: + """ + Returns the computed label for this invocation. + This is used to provide a default label if no label is set. + """ + return self.kernel.label + def __repr__(self) -> str: return f"Invocation(kernel={self.kernel}, streams={self.streams})" @@ -298,35 +297,16 @@ class Stream(ABC, ContentHashableBase): This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). """ - def __init__(self, label: str | None = None, **kwargs) -> None: + def __init__(self, **kwargs) -> None: super().__init__(**kwargs) self._invocation: Invocation | None = None - self._label = label - - @property - def label(self) -> str: - """ - Returns a human-readable label for this stream. - If no label is provided and the stream is generated by an operation, - the label of the operation is used. - Otherwise, the class name is used as the label. - """ - if self._label is None: - if self.invocation is not None: - # use the invocation operation label - return self.invocation.kernel.label - else: - return self.__class__.__name__ - return self._label - @label.setter - def label(self, label: str) -> None: - """ - Sets a human-readable label for this stream. - """ - if not isinstance(label, str): - raise TypeError("label must be a string") - self._label = label + def computed_label(self) -> str | None: + if self.invocation is not None: + # use the invocation operation label + return self.invocation.kernel.label + return None + @property def invocation(self) -> Invocation | None: @@ -347,7 +327,7 @@ def flow(self) -> Collection[tuple[Tag, Packet]]: Flow everything through the stream, returning the entire collection of (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. 
""" - return list(self) + return [e for e in self] # --------------------- Recursive methods --------------------------- # These methods form a step in the multi-class recursive invocation that follows the pattern of diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index 84a31f3..654d9d2 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -438,7 +438,12 @@ def keys( stream = streams[0] tag_keys, packet_keys = stream.keys(trigger_run=trigger_run) if tag_keys is None or packet_keys is None: - return super().keys(trigger_run=trigger_run) + super_tag_keys, super_packet_keys = super().keys(trigger_run=trigger_run) + tag_keys = tag_keys or super_tag_keys + packet_keys = packet_keys or super_packet_keys + + if packet_keys is None: + return tag_keys, packet_keys if self.drop_unmapped: # If drop_unmapped is True, we only keep the keys that are in the mapping @@ -464,7 +469,12 @@ def types( stream = streams[0] tag_types, packet_types = stream.types(trigger_run=trigger_run) if tag_types is None or packet_types is None: - return super().types(trigger_run=trigger_run) + super_tag_types, super_packet_types = super().types(trigger_run=trigger_run) + tag_types = tag_types or super_tag_types + packet_types = packet_types or super_packet_types + + if packet_types is None: + return tag_types, packet_types if self.drop_unmapped: # If drop_unmapped is True, we only keep the keys that are in the mapping diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index 77cdbe3..f5c5e60 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -1,7 +1,8 @@ from collections.abc import Callable, Collection, Iterator from orcapod.core.base import SyncStream -from orcapod.types import Packet, Tag +from orcapod.types import Packet, Tag, TypeSpec +from copy import copy class SyncStreamFromLists(SyncStream): @@ -12,12 +13,21 @@ def __init__( paired: Collection[tuple[Tag, Packet]] | None = None, tag_keys: list[str] | None = None, packet_keys: list[str] | None = None, + tag_typespec: TypeSpec | None = None, + packet_typespec: TypeSpec | None = None, strict: bool = True, **kwargs, ) -> None: super().__init__(**kwargs) + self.tag_typespec = tag_typespec + self.packet_typespec = packet_typespec + if tag_keys is None and tag_typespec is not None: + tag_keys = list(tag_typespec.keys()) + if packet_keys is None and packet_typespec is not None: + packet_keys = list(packet_typespec.keys()) self.tag_keys = tag_keys self.packet_keys = packet_keys + if tags is not None and packets is not None: if strict and len(tags) != len(packets): raise ValueError( @@ -34,14 +44,31 @@ def __init__( def keys( self, *, trigger_run: bool = False ) -> tuple[Collection[str] | None, Collection[str] | None]: - if self.tag_keys is None or self.packet_keys is None: - return super().keys(trigger_run=trigger_run) + tag_keys, packet_keys = copy(self.tag_keys), copy(self.packet_keys) + if tag_keys is None or packet_keys is None: + super_tag_keys, super_packet_keys = super().keys(trigger_run=trigger_run) + tag_keys = tag_keys or super_tag_keys + packet_keys = packet_keys or super_packet_keys + # If the keys are already set, return them - return self.tag_keys.copy(), self.packet_keys.copy() + return tag_keys, packet_keys + + def types( + self, *, trigger_run: bool = False + ) -> tuple[TypeSpec | None, TypeSpec | None]: + tag_typespec, packet_typespec = copy(self.tag_typespec), copy(self.packet_typespec) + if tag_typespec is None or packet_typespec is None: + 
super_tag_typespec, super_packet_typespec = super().types(trigger_run=trigger_run) + tag_typespec = tag_typespec or super_tag_typespec + packet_typespec = packet_typespec or super_packet_typespec + + # If the types are already set, return them + return tag_typespec, packet_typespec def __iter__(self) -> Iterator[tuple[Tag, Packet]]: yield from self.paired + class SyncStreamFromGenerator(SyncStream): """ diff --git a/src/orcapod/core/tracker.py b/src/orcapod/core/tracker.py index 2532582..337c027 100644 --- a/src/orcapod/core/tracker.py +++ b/src/orcapod/core/tracker.py @@ -1,8 +1,37 @@ -from orcapod.core.base import Invocation, Kernel, Tracker, SyncStream, TypeSpec -from collections.abc import Collection +from orcapod.core.base import Invocation, Kernel, Tracker, SyncStream, Source +from orcapod.types import Tag, Packet, TypeSpec +from collections.abc import Collection, Iterator from typing import Any -class StubKernel(Kernel): +class StreamWrapper(SyncStream): + """ + A wrapper for a SyncStream that allows it to be used as a Source. + This is useful for cases where you want to treat a stream as a source + without modifying the original stream. + """ + + def __init__(self, stream: SyncStream, **kwargs): + super().__init__(**kwargs) + self.stream = stream + + def keys(self, *streams: SyncStream, **kwargs) -> tuple[Collection[str]|None, Collection[str]|None]: + return self.stream.keys(*streams, **kwargs) + + def types(self, *streams: SyncStream, **kwargs) -> tuple[TypeSpec|None, TypeSpec|None]: + return self.stream.types(*streams, **kwargs) + + def computed_label(self) -> str | None: + return self.stream.label + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + """ + Iterate over the stream, yielding tuples of (tags, packets). + """ + yield from self.stream + + + +class StreamSource(Source): def __init__(self, stream: SyncStream, **kwargs): super().__init__(skip_tracking=True, **kwargs) self.stream = stream @@ -10,15 +39,15 @@ def __init__(self, stream: SyncStream, **kwargs): def forward(self, *streams: SyncStream) -> SyncStream: if len(streams) != 0: raise ValueError( - "StubKernel does not support forwarding streams. " + "StreamSource does not support forwarding streams. " "It generates its own stream from the file system." ) - return self.stream + return StreamWrapper(self.stream) def identity_structure(self, *streams) -> Any: if len(streams) != 0: raise ValueError( - "StubKernel does not support forwarding streams. " + "StreamSource does not support forwarding streams. " "It generates its own stream from the file system." 
) @@ -29,11 +58,11 @@ def types(self, *streams: SyncStream, **kwargs) -> tuple[TypeSpec|None, TypeSpec def keys(self, *streams: SyncStream, **kwargs) -> tuple[Collection[str]|None, Collection[str]|None]: return self.stream.keys() - + def computed_label(self) -> str | None: + return self.stream.label - class GraphTracker(Tracker): """ @@ -89,7 +118,7 @@ def generate_graph(self): upstream_invocation = upstream.invocation if upstream_invocation is None: # If upstream is None, create a stub kernel - upstream_invocation = Invocation(StubKernel(upstream, label="StubInput"), []) + upstream_invocation = Invocation(StreamSource(upstream), []) if upstream_invocation not in G: G.add_node(upstream_invocation) G.add_edge(upstream_invocation, invocation, stream=upstream) diff --git a/src/orcapod/hashing/content_hashable.py b/src/orcapod/hashing/content_hashable.py index f3fc929..33fd1bb 100644 --- a/src/orcapod/hashing/content_hashable.py +++ b/src/orcapod/hashing/content_hashable.py @@ -3,8 +3,9 @@ from .defaults import get_default_object_hasher from typing import Any + class ContentHashableBase: - def __init__(self, object_hasher: ObjectHasher | None = None) -> None: + def __init__(self, object_hasher: ObjectHasher | None = None, label: str | None = None) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. @@ -12,6 +13,31 @@ def __init__(self, object_hasher: ObjectHasher | None = None) -> None: object_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. """ self.object_hasher = object_hasher or get_default_object_hasher() + self._label = label + + @property + def label(self) -> str : + """ + Get the label of this object. + + Returns: + str | None: The label of the object, or None if not set. + """ + return self._label or self.computed_label() or self.__class__.__name__ + + @label.setter + def label(self, label: str | None) -> None: + """ + Set the label of this object. + + Args: + label (str | None): The label to set for this object. 
+ """ + self._label = label + + def computed_label(self) -> str|None: + return None + def identity_structure(self) -> Any: @@ -38,6 +64,9 @@ def __hash__(self) -> int: """ # Get the identity structure structure = self.identity_structure() + if structure is None: + # If no identity structure is provided, use the default hash + return super().__hash__() return self.object_hasher.hash_to_int(structure) diff --git a/src/orcapod/hashing/core.py b/src/orcapod/hashing/core.py index 6e40bde..d37ade3 100644 --- a/src/orcapod/hashing/core.py +++ b/src/orcapod/hashing/core.py @@ -29,6 +29,7 @@ ) from uuid import UUID + import xxhash from orcapod.types import Packet, PathSet @@ -435,6 +436,11 @@ def process_structure( if isinstance(obj, HashableMixin): logger.debug(f"Processing HashableMixin instance of type {type(obj).__name__}") return obj.content_hash() + + from .content_hashable import ContentHashableBase + if isinstance(obj, ContentHashableBase): + logger.debug(f"Processing ContentHashableBase instance of type {type(obj).__name__}") + return process_structure(obj.identity_structure(), visited, function_info_extractor) # Handle basic types if isinstance(obj, (str, int, float, bool)): diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index 5df050f..7edd03e 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -1,15 +1,12 @@ from collections import defaultdict -from collections.abc import Collection, Iterator -import json +from collections.abc import Collection import logging import pickle import sys import time -from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, Protocol, runtime_checkable +from typing import Any -import pandas as pd from orcapod.core import Invocation, Kernel, SyncStream from orcapod.core.pod import FunctionPod @@ -17,13 +14,7 @@ from orcapod.hashing import hash_to_hex from orcapod.core.tracker import GraphTracker -from orcapod.hashing import ObjectHasher, ArrowHasher -from orcapod.types import TypeSpec, Tag, Packet -from orcapod.core.streams import SyncStreamFromGenerator from orcapod.store import ArrowDataStore -from orcapod.types.registry import PacketConverter, TypeRegistry -from orcapod.types import default_registry -from orcapod.utils.stream_utils import merge_typespecs, get_typespec logger = logging.getLogger(__name__) @@ -40,12 +31,15 @@ class Pipeline(GraphTracker): Replaces the old Tracker with better persistence and view capabilities. """ - def __init__(self, name: str, results_store: ArrowDataStore, pipeline_store: ArrowDataStore) -> None: + def __init__(self, name: str, results_store: ArrowDataStore, pipeline_store: ArrowDataStore, auto_compile:bool=True) -> None: super().__init__() self.name = name or f"pipeline_{id(self)}" self.results_store = results_store self.pipeline_store = pipeline_store self.labels_to_nodes = {} + self.auto_compile = auto_compile + self._dirty = False + self._ordered_nodes = [] # Track order of invocations # Core Pipeline Operations def save(self, path: Path | str) -> None: @@ -77,6 +71,14 @@ def save(self, path: Path | str) -> None: temp_path.unlink() raise + def record(self, invocation: Invocation) -> None: + """ + Record an invocation in the pipeline. + This method is called automatically by the Kernel when an operation is invoked. 
+ """ + super().record(invocation) + self._dirty = True + def wrap_invocation( self, kernel: Kernel, input_nodes: Collection[Node] ) -> Node: @@ -93,6 +95,7 @@ def compile(self): proposed_labels = defaultdict(list) node_lut = {} edge_lut : dict[SyncStream, Node]= {} + ordered_nodes = [] for invocation in nx.topological_sort(G): # map streams to the new streams based on Nodes input_nodes = [edge_lut[stream] for stream in invocation.streams] @@ -100,11 +103,14 @@ def compile(self): # register the new node against the original invocation node_lut[invocation] = new_node + ordered_nodes.append(new_node) # register the new node in the proposed labels -- if duplicates occur, will resolve later proposed_labels[new_node.label].append(new_node) for edge in G.out_edges(invocation): edge_lut[G.edges[edge]["stream"]] = new_node + + self._ordered_nodes = ordered_nodes # resolve duplicates in proposed_labels labels_to_nodes = {} @@ -120,8 +126,15 @@ def compile(self): labels_to_nodes[label] = nodes[0] self.labels_to_nodes = labels_to_nodes + self._dirty = False return node_lut, edge_lut, proposed_labels, labels_to_nodes + def __exit__(self, exc_type, exc_val, ext_tb): + super().__exit__(exc_type, exc_val, ext_tb) + if self.auto_compile: + self.compile() + + def __getattr__(self, item: str) -> Any: """Allow direct access to pipeline attributes""" if item in self.labels_to_nodes: @@ -131,8 +144,21 @@ def __getattr__(self, item: str) -> Any: def __dir__(self): # Include both regular attributes and dynamic ones return list(super().__dir__()) + list(self.labels_to_nodes.keys()) - + def run(self, full_sync:bool=False) -> None: + """ + Run the pipeline, compiling it if necessary. + This method is a no-op if auto_compile is False. + """ + if self.auto_compile and self._dirty: + self.compile() + + # Run in topological order + for node in self._ordered_nodes: + if full_sync: + node.reset_cache() + node.flow() + @classmethod def load(cls, path: Path | str) -> "Pipeline": """Load complete pipeline state""" diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py index 55207c2..b6d5f18 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/wrappers.py @@ -126,13 +126,11 @@ def __repr__(self): def __str__(self): return f"{self.__class__.__name__}<{self.kernel}>" - @property - def label(self) -> str: - return self._label or self.kernel.label - - @label.setter - def label(self, label: str) -> None: - self._label = label + def computed_label(self) -> str | None: + """ + Return the label of the wrapped kernel. + """ + return self.kernel.label def resolve_input_streams(self, *input_streams) -> Collection[SyncStream]: if input_streams: @@ -207,7 +205,7 @@ def __init__( if _registry is None: _registry = default_registry self.registry = _registry - self.source_info = self.label, self.object_hasher.hash_to_hex(self.kernel) + self.source_info = self.label, str(hash(self.kernel)) self._cache_computed = False From 90b9dada7fac5b76d842bf3a9a449db957341425 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 25 Jun 2025 03:17:42 +0000 Subject: [PATCH 006/224] optim: avoid len call by using list comprehension --- src/orcapod/core/operators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index 654d9d2..b5c4512 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -261,8 +261,9 @@ def forward(self, *streams: SyncStream) -> SyncStream: left_stream, right_stream = streams def generator() -> Iterator[tuple[Tag, Packet]]: - left_stream_buffered = list(left_stream) - right_stream_buffered = list(right_stream) + # using list comprehension rather than list() to avoid call to __len__ which is expensive + left_stream_buffered = [e for e in left_stream] + right_stream_buffered = [e for e in right_stream] for left_tag, left_packet in left_stream_buffered: for right_tag, right_packet in right_stream_buffered: if (joined_tag := join_tags(left_tag, right_tag)) is not None: From 1e61259f890c34aa3a15ee1704dd618d7a7ecd00 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 25 Jun 2025 07:17:37 +0000 Subject: [PATCH 007/224] refactor: place Operator back in base --- src/orcapod/core/operators.py | 85 +++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index b5c4512..53ecacc 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -5,24 +5,21 @@ from orcapod.types import Packet, Tag, TypeSpec from orcapod.hashing import function_content_hash, hash_function -from orcapod.core.base import Kernel, SyncStream +from orcapod.core.base import Kernel, SyncStream, Operator from orcapod.core.streams import SyncStreamFromGenerator from orcapod.utils.stream_utils import ( batch_packet, batch_tags, check_packet_compatibility, + intersection_typespecs, join_tags, - fill_missing, - merge_typespecs, + semijoin_tags, + union_typespecs, + intersection_typespecs, + fill_missing ) -class Operator(Kernel): - """ - A Mapper is an operation that does NOT generate new file content. - It is used to control the flow of data in the pipeline without modifying or creating data content. 
- """ - class Repeat(Operator): """ @@ -245,8 +242,8 @@ def types( right_tag_types, right_packet_types = right_stream.types(trigger_run=False) # TODO: do error handling when merge fails - joined_tag_types = merge_typespecs(left_tag_types, right_tag_types) - joined_packet_types = merge_typespecs(left_packet_types, right_packet_types) + joined_tag_types = union_typespecs(left_tag_types, right_tag_types) + joined_packet_types = union_typespecs(left_packet_types, right_packet_types) return joined_tag_types, joined_packet_types @@ -377,8 +374,8 @@ def types( ): return super().types(*streams, trigger_run=trigger_run) - joined_tag_types = merge_typespecs(left_tag_types, right_tag_types) - joined_packet_types = merge_typespecs(left_packet_types, right_packet_types) + joined_tag_types = union_typespecs(left_tag_types, right_tag_types) + joined_packet_types = union_typespecs(left_packet_types, right_packet_types) return joined_tag_types, joined_packet_types @@ -599,6 +596,68 @@ def keys( return mapped_tag_keys, packet_keys +class SemiJoin(Operator): + """ + Perform semi-join on the left stream tags with the tags of the right stream + """ + def identity_structure(self, *streams): + # Restrict DOES depend on the order of the streams -- maintain as a tuple + return (self.__class__.__name__,) + streams + + def keys( + self, *streams: SyncStream, trigger_run=False + ) -> tuple[Collection[str] | None, Collection[str] | None]: + """ + For semijoin, output keys and types are identical to left stream + """ + if len(streams) != 2: + raise ValueError("Join operation requires exactly two streams") + + return streams[0].keys(trigger_run=trigger_run) + + def types( + self, *streams: SyncStream, trigger_run=False + ) -> tuple[TypeSpec | None, TypeSpec | None]: + """ + For semijoin, output keys and types are identical to left stream + """ + if len(streams) != 2: + raise ValueError("Join operation requires exactly two streams") + + return streams[0].types(trigger_run=trigger_run) + + def forward(self, *streams: SyncStream) -> SyncStream: + """ + Joins two streams together based on their tags. + The resulting stream will contain all the tags from both streams. + """ + if len(streams) != 2: + raise ValueError("Join operation requires exactly two streams") + + left_stream, right_stream = streams + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + + common_tag_typespec = intersection_typespecs(left_tag_typespec, right_tag_typespec) + common_tag_keys = None + if common_tag_typespec is not None: + common_tag_keys = list(common_tag_typespec.keys()) + + def generator() -> Iterator[tuple[Tag, Packet]]: + # using list comprehension rather than list() to avoid call to __len__ which is expensive + left_stream_buffered = [e for e in left_stream] + right_stream_buffered = [e for e in right_stream] + for left_tag, left_packet in left_stream_buffered: + for right_tag, _ in right_stream_buffered: + if semijoin_tags(left_tag, right_tag, common_tag_keys) is not None: + yield left_tag, left_packet + # move onto next entry + break + + return SyncStreamFromGenerator(generator) + + def __repr__(self) -> str: + return "SemiJoin()" class Filter(Operator): """ From df581342eb82a2f7a44ae716a0da5882c50d265c Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 25 Jun 2025 07:18:29 +0000 Subject: [PATCH 008/224] refactor: place operator in base and add additional operator methods to sync stream --- src/orcapod/core/base.py | 77 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 7 deletions(-) diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 9aa8b4d..0a99a8a 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -8,7 +8,7 @@ from orcapod.hashing import HashableMixin, ObjectHasher from orcapod.hashing import get_default_object_hasher -from orcapod.hashing import ContentHashableBase +from orcapod.hashing import ContentIdentifiableBase from orcapod.types import Packet, Tag, TypeSpec from orcapod.utils.stream_utils import get_typespec @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) -class Kernel(ABC, ContentHashableBase): +class Kernel(ABC, ContentIdentifiableBase): """ Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. It is the base class for all computations and transformations that can be performed on a collection of streams @@ -36,7 +36,6 @@ def __init__(self, label: str | None = None, skip_tracking: bool = False, **kwar self._skip_tracking = skip_tracking - def pre_forward_hook( self, *streams: "SyncStream", **kwargs ) -> tuple["SyncStream", ...]: @@ -220,7 +219,7 @@ def record(self, invocation: "Invocation") -> None: ... # This is NOT an abstract class, but rather a concrete class that # represents an invocation of a kernel on a collection of streams. -class Invocation(ContentHashableBase): +class Invocation(ContentIdentifiableBase): """ This class represents an invocation of a kernel on a collection of streams. It contains the kernel and the streams that were used in the invocation. @@ -288,7 +287,7 @@ def identity_structure(self) -> int: return self.kernel.identity_structure(*self.streams) -class Stream(ABC, ContentHashableBase): +class Stream(ABC, ContentIdentifiableBase): """ A stream is a collection of tagged-packets that are generated by an operation. The stream is iterable and can be used to access the packets in the stream. @@ -434,6 +433,64 @@ def __len__(self) -> int: """ return sum(1 for _ in self) + def join(self, other: "SyncStream", label:str|None=None) -> "SyncStream": + """ + Returns a new stream that is the result of joining with the other stream. + The join is performed on the tags of the packets in the streams. + """ + from .operators import Join + + if not isinstance(other, SyncStream): + raise TypeError("other must be a SyncStream") + return Join(label=label)(self, other) + + def semijoin(self, other: "SyncStream", label: str | None = None) -> "SyncStream": + """ + Returns a new stream that is the result of semijoining with the other stream. + The semijoin is performed on the tags of the packets in the streams. + """ + from .operators import SemiJoin + + if not isinstance(other, SyncStream): + raise TypeError("other must be a SyncStream") + return SemiJoin(label=label)(self, other) + + def map(self, packet_map: dict | None = None, tag_map: dict | None = None, drop_unmapped:bool=True) -> "SyncStream": + """ + Returns a new stream that is the result of mapping the packets and tags in the stream. + The mapping is applied to each packet in the stream and the resulting packets + are returned in a new stream. + If packet_map is None, no mapping is applied to the packets. + If tag_map is None, no mapping is applied to the tags. 
+ """ + from .operators import MapTags, MapPackets + output = self + if packet_map is not None: + output = MapPackets(packet_map, drop_unmapped=drop_unmapped)(output) + if tag_map is not None: + output = MapTags(tag_map, drop_unmapped=drop_unmapped)(output) + + return output + + def apply(self, transformer: 'dict | Operator') -> "SyncStream": + """ + Returns a new stream that is the result of applying the mapping to the stream. + The mapping is applied to each packet in the stream and the resulting packets + are returned in a new stream. + """ + from .operators import MapPackets + + if isinstance(transformer, dict): + return MapPackets(transformer)(self) + elif isinstance(transformer, Operator): + # If the transformer is an Operator, we can apply it directly + return transformer(self) + + # Otherwise, do not know how to handle the transformer + raise TypeError( + "transformer must be a dictionary or an operator" + ) + def __rshift__( self, transformer: dict | Callable[["SyncStream"], "SyncStream"] ) -> "SyncStream": @@ -442,7 +499,6 @@ def __rshift__( The mapping is applied to each packet in the stream and the resulting packets are returned in a new stream. """ - # TODO: remove just in time import from .operators import MapPackets if isinstance(transformer, dict): @@ -459,13 +515,13 @@ def __mul__(self, other: "SyncStream") -> "SyncStream": """ Returns a new stream that is the result joining with the other stream """ - # TODO: remove just in time import from .operators import Join if not isinstance(other, SyncStream): raise TypeError("other must be a SyncStream") return Join()(self, other) + def claims_unique_tags(self, *, trigger_run=False) -> bool | None: """ For synchronous streams, if the stream is generated by an operation, the invocation @@ -490,6 +546,13 @@ def claims_unique_tags(self, *, trigger_run=False) -> bool | None: return True +class Operator(Kernel): + """ + A Mapper is an operation that does NOT generate new file content. + It is used to control the flow of data in the pipeline without modifying or creating data content. + """ + + class Source(Kernel, SyncStream): """ A base class for all sources in the system. A source can be seen as a special From 6e4d4bd876ceb38dc931644e53e4d98e5090e888 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 25 Jun 2025 07:19:12 +0000 Subject: [PATCH 009/224] wip: change to content identifable base --- src/orcapod/core/streams.py | 1 + src/orcapod/hashing/__init__.py | 4 +- src/orcapod/hashing/content_hashable.py | 17 ++-- src/orcapod/hashing/core.py | 4 +- src/orcapod/pipeline/wrappers.py | 112 +++++++++++++++++------- src/orcapod/utils/stream_utils.py | 35 +++++++- 6 files changed, 130 insertions(+), 43 deletions(-) diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index f5c5e60..a1e9620 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -5,6 +5,7 @@ from copy import copy + class SyncStreamFromLists(SyncStream): def __init__( self, diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index f95b0f7..d3d83e9 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -24,7 +24,7 @@ FunctionInfoExtractor, CompositeFileHasher, ) -from .content_hashable import ContentHashableBase +from .content_hashable import ContentIdentifiableBase __all__ = [ "FileHasher", @@ -47,5 +47,5 @@ "get_default_composite_file_hasher", "get_default_object_hasher", "get_default_arrow_hasher", - "ContentHashableBase", + "ContentIdentifiableBase", ] diff --git a/src/orcapod/hashing/content_hashable.py b/src/orcapod/hashing/content_hashable.py index 33fd1bb..61eb0e5 100644 --- a/src/orcapod/hashing/content_hashable.py +++ b/src/orcapod/hashing/content_hashable.py @@ -4,15 +4,15 @@ from typing import Any -class ContentHashableBase: - def __init__(self, object_hasher: ObjectHasher | None = None, label: str | None = None) -> None: +class ContentIdentifiableBase: + def __init__(self, identity_structure_hasher: ObjectHasher | None = None, label: str | None = None) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. Args: - object_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. + identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. """ - self.object_hasher = object_hasher or get_default_object_hasher() + self.identity_structure_hasher = identity_structure_hasher or get_default_object_hasher() self._label = label @property @@ -36,10 +36,13 @@ def label(self, label: str | None) -> None: self._label = label def computed_label(self) -> str|None: + """ + Compute a label for this object based on its content. If label is not explicitly set for this object + and computed_label returns a valid value, it will be used as label of this object. + """ return None - def identity_structure(self) -> Any: """ Return a structure that represents the identity of this object. @@ -68,7 +71,7 @@ def __hash__(self) -> int: # If no identity structure is provided, use the default hash return super().__hash__() - return self.object_hasher.hash_to_int(structure) + return self.identity_structure_hasher.hash_to_int(structure) def __eq__(self, other: object) -> bool: """ @@ -80,7 +83,7 @@ def __eq__(self, other: object) -> bool: Returns: bool: True if both objects have the same identity structure, False otherwise. 
""" - if not isinstance(other, ContentHashableBase): + if not isinstance(other, ContentIdentifiableBase): return NotImplemented return self.identity_structure() == other.identity_structure() \ No newline at end of file diff --git a/src/orcapod/hashing/core.py b/src/orcapod/hashing/core.py index d37ade3..08fd812 100644 --- a/src/orcapod/hashing/core.py +++ b/src/orcapod/hashing/core.py @@ -437,8 +437,8 @@ def process_structure( logger.debug(f"Processing HashableMixin instance of type {type(obj).__name__}") return obj.content_hash() - from .content_hashable import ContentHashableBase - if isinstance(obj, ContentHashableBase): + from .content_hashable import ContentIdentifiableBase + if isinstance(obj, ContentIdentifiableBase): logger.debug(f"Processing ContentHashableBase instance of type {type(obj).__name__}") return process_structure(obj.identity_structure(), visited, function_info_extractor) diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py index b6d5f18..a77d67a 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/wrappers.py @@ -11,7 +11,7 @@ import pyarrow as pa import polars as pl from orcapod.core.streams import SyncStreamFromGenerator -from orcapod.utils.stream_utils import get_typespec, merge_typespecs +from orcapod.utils.stream_utils import get_typespec, union_typespecs import logging logger = logging.getLogger(__name__) @@ -184,31 +184,45 @@ def __init__( kernel: Kernel, input_streams: Collection[SyncStream], output_store: ArrowDataStore, - _object_hasher: ObjectHasher | None = None, - _arrow_hasher: ArrowHasher | None = None, - _registry: TypeRegistry | None = None, + kernel_hasher: ObjectHasher | None = None, + arrow_packet_hasher: ArrowHasher | None = None, + packet_type_registry: TypeRegistry | None = None, **kwargs, ) -> None: super().__init__(kernel, input_streams,**kwargs) self.output_store = output_store - self.tag_keys, self.packet_keys = self.keys(trigger_run=False) - self.output_converter = None # These are configurable but are not expected to be modified except for special circumstances - if _object_hasher is None: - _object_hasher = get_default_object_hasher() - self.object_hasher = _object_hasher - if _arrow_hasher is None: - _arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = _arrow_hasher - if _registry is None: - _registry = default_registry - self.registry = _registry - self.source_info = self.label, str(hash(self.kernel)) + if kernel_hasher is None: + kernel_hasher = get_default_object_hasher() + self._kernel_hasher = kernel_hasher + if arrow_packet_hasher is None: + arrow_packet_hasher = get_default_arrow_hasher() + self._arrow_packet_hasher = arrow_packet_hasher + if packet_type_registry is None: + packet_type_registry = default_registry + self._packet_type_registry = packet_type_registry + + + self.source_info = self.label, self.kernel_hasher.hash_to_hex(self.kernel) + self.tag_keys, self.packet_keys = self.keys(trigger_run=False) + self.output_converter = None self._cache_computed = False + @property + def kernel_hasher(self) -> ObjectHasher: + return self._kernel_hasher + + @kernel_hasher.setter + def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): + if kernel_hasher is None: + kernel_hasher = get_default_object_hasher() + self._kernel_hasher = kernel_hasher + # hasher changed -- trigger recomputation of properties that depend on kernel hasher + self.update_cached_values() + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: @@ -224,7 
+238,7 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: tag_type, packet_type = output_stream.types(trigger_run=False) if tag_type is not None and packet_type is not None: - joined_type = merge_typespecs(tag_type, packet_type) + joined_type = union_typespecs(tag_type, packet_type) assert joined_type is not None, "Joined typespec should not be None" self.output_converter = PacketConverter(joined_type, registry=self.registry) @@ -324,9 +338,9 @@ def __init__( skip_memoization: bool = False, skip_tag_record: bool = False, error_handling: Literal["raise", "ignore", "warn"] = "raise", - _object_hasher: ObjectHasher | None = None, - _arrow_hasher: ArrowHasher | None = None, - _registry: TypeRegistry | None = None, + object_hasher: ObjectHasher | None = None, + arrow_hasher: ArrowHasher | None = None, + registry: TypeRegistry | None = None, **kwargs, ) -> None: super().__init__( @@ -344,28 +358,64 @@ def __init__( self.skip_tag_record = skip_tag_record # These are configurable but are not expected to be modified except for special circumstances + # Here I'm assigning to the hidden properties directly to avoid triggering setters if _object_hasher is None: _object_hasher = get_default_object_hasher() - self.object_hasher = _object_hasher + self._object_hasher = _object_hasher if _arrow_hasher is None: _arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = _arrow_hasher + self._arrow_hasher = _arrow_hasher if _registry is None: _registry = default_registry - self.registry = _registry + self._registry = _registry - # TODO: consider making this dynamic - self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod) - self.tag_keys, self.output_keys = self.keys(trigger_run=False) + # compute and cache properties and converters for efficiency + self.update_cached_values() + self._cache_computed = False - # prepare packet converters - input_typespec, output_typespec = self.function_pod.get_function_typespecs() + @property + def object_hasher(self) -> ObjectHasher: + return self._object_hasher - self.input_converter = PacketConverter(input_typespec, self.registry) - self.output_converter = PacketConverter(output_typespec, self.registry) + @object_hasher.setter + def object_hasher(self, object_hasher:ObjectHasher | None = None): + if object_hasher is None: + object_hasher = get_default_object_hasher() + self._object_hasher = object_hasher + # hasher changed -- trigger recomputation of properties that depend on object hasher + self.update_cached_values() - self._cache_computed = False + @property + def arrow_hasher(self) -> ArrowHasher: + return self._arrow_hasher + + @arrow_hasher.setter + def arrow_hasher(self, arrow_hasher:ArrowHasher | None = None): + if arrow_hasher is None: + arrow_hasher = get_default_arrow_hasher() + self._arrow_hasher = arrow_hasher + # hasher changed -- trigger recomputation of properties that depend on arrow hasher + self.update_cached_values() + + @property + def registry(self) -> TypeRegistry: + return self._registry + + @registry.setter + def registry(self, registry: TypeRegistry | None = None): + if registry is None: + registry = default_registry + self._registry = registry + # registry changed -- trigger recomputation of properties that depend on registry + self.update_cached_values() + + def update_cached_values(self) -> None: + self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod) + self.tag_keys, self.output_keys = self.keys(trigger_run=False) + self.input_typespec, self.output_typespec = 
self.function_pod.get_function_typespecs() + self.input_converter = PacketConverter(self.input_typespec, self.registry) + self.output_converter = PacketConverter(self.output_typespec, self.registry) def reset_cache(self): self._cache_computed = False diff --git a/src/orcapod/utils/stream_utils.py b/src/orcapod/utils/stream_utils.py index 51d46c1..95703c8 100644 --- a/src/orcapod/utils/stream_utils.py +++ b/src/orcapod/utils/stream_utils.py @@ -43,7 +43,7 @@ def merge_dicts(left: dict[K, V], right: dict[K, V]) -> dict[K, V]: return merged -def merge_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: +def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: if left is None: return right if right is None: @@ -58,6 +58,25 @@ def merge_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | ) return merged +def intersection_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: + """ + Returns the intersection of two TypeSpecs, only returning keys that are present in both. + If a key is present in both TypeSpecs, the type must be the same. + """ + if left is None or right is None: + return None + # Find common keys and ensure types match + common_keys = set(left.keys()).intersection(set(right.keys())) + intersection = {} + for key in common_keys: + try: + intersection[key] = get_compatible_type(left[key], right[key]) + except TypeError: + # If types are not compatible, raise an error + raise TypeError(f"Type conflict for key '{key}': {left[key]} vs {right[key]}") + + return intersection + def common_elements(*values) -> Collection[str]: """ @@ -88,6 +107,20 @@ def join_tags(tag1: Mapping[K, V], tag2: Mapping[K, V]) -> dict[K, V] | None: joined_tag[k] = v return joined_tag +def semijoin_tags(tag1: Mapping[K, V], tag2: Mapping[K, V], target_keys: Collection[K]|None = None) -> dict[K, V] | None: + """ + Semijoin two tags. If the tags have the same key, the value must be the same or None will be returned. If all shared + key's value match, tag1 would be returned + """ + if target_keys is None: + target_keys = set(tag1.keys()).intersection(set(tag2.keys())) + if not target_keys: + return dict(tag1) + + for key in target_keys: + if tag1[key] != tag2[key]: + return None + return dict(tag1) def check_packet_compatibility(packet1: Packet, packet2: Packet) -> bool: """ From 5fb2435453bd66fadfdcd8324c1581421ceef86d Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 26 Jun 2025 18:53:22 +0000 Subject: [PATCH 010/224] style: apply ruff formatting --- src/orcapod/core/streams.py | 19 ++++-- src/orcapod/pipeline/wrappers.py | 114 ++++++++++++++++++------------- 2 files changed, 80 insertions(+), 53 deletions(-) diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index a1e9620..c70b009 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -5,7 +5,6 @@ from copy import copy - class SyncStreamFromLists(SyncStream): def __init__( self, @@ -50,16 +49,21 @@ def keys( super_tag_keys, super_packet_keys = super().keys(trigger_run=trigger_run) tag_keys = tag_keys or super_tag_keys packet_keys = packet_keys or super_packet_keys - + # If the keys are already set, return them return tag_keys, packet_keys - + def types( self, *, trigger_run: bool = False ) -> tuple[TypeSpec | None, TypeSpec | None]: - tag_typespec, packet_typespec = copy(self.tag_typespec), copy(self.packet_typespec) + tag_typespec, packet_typespec = ( + copy(self.tag_typespec), + copy(self.packet_typespec), + ) if tag_typespec is None or packet_typespec is None: - super_tag_typespec, super_packet_typespec = super().types(trigger_run=trigger_run) + super_tag_typespec, super_packet_typespec = super().types( + trigger_run=trigger_run + ) tag_typespec = tag_typespec or super_tag_typespec packet_typespec = packet_typespec or super_packet_typespec @@ -69,7 +73,6 @@ def types( def __iter__(self) -> Iterator[tuple[Tag, Packet]]: yield from self.paired - class SyncStreamFromGenerator(SyncStream): """ @@ -87,9 +90,11 @@ def __init__( self.tag_keys = tag_keys self.packet_keys = packet_keys self.generator_factory = generator_factory + self.check_consistency = False def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - yield from self.generator_factory() + if not self.check_consistency: + yield from self.generator_factory() def keys( self, *, trigger_run: bool = False diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py index a77d67a..e953f1f 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/wrappers.py @@ -14,8 +14,10 @@ from orcapod.utils.stream_utils import get_typespec, union_typespecs import logging + logger = logging.getLogger(__name__) + def tag_to_arrow_table_with_metadata(tag, metadata: dict | None = None): """ Convert a tag dictionary to PyArrow table with metadata on each column. @@ -49,18 +51,23 @@ def tag_to_arrow_table_with_metadata(tag, metadata: dict | None = None): return table -def get_columns_with_metadata(df: pl.DataFrame, key: str, value: str|None = None) -> list[str]: + +def get_columns_with_metadata( + df: pl.DataFrame, key: str, value: str | None = None +) -> list[str]: """Get column names with specific metadata using list comprehension. If value is given, only - columns matching that specific value for the desginated metadata key will be returned. + columns matching that specific value for the desginated metadata key will be returned. 
Otherwise, all columns that contains the key as metadata will be returned regardless of the value""" return [ - col_name for col_name, dtype in df.schema.items() - if hasattr(dtype, "metadata") and (value is None or getattr(dtype, "metadata") == value) + col_name + for col_name, dtype in df.schema.items() + if hasattr(dtype, "metadata") + and (value is None or getattr(dtype, "metadata") == value) ] class PolarsSource(Source): - def __init__(self, df: pl.DataFrame, tag_keys: Collection[str]|None = None): + def __init__(self, df: pl.DataFrame, tag_keys: Collection[str] | None = None): self.df = df self.tag_keys = tag_keys @@ -74,7 +81,7 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: class PolarsStream(SyncStream): - def __init__(self, df: pl.DataFrame, tag_keys: Collection[str]|None = None): + def __init__(self, df: pl.DataFrame, tag_keys: Collection[str] | None = None): self.df = df if tag_keys is None: # extract tag_keys by picking columns with metadata source=tag @@ -87,8 +94,15 @@ def __iter__(self) -> Iterator[tuple[Tag, Packet]]: packet = {key: val for key, val in row.items() if key not in self.tag_keys} yield tag, packet + class EmptyStream(SyncStream): - def __init__(self, tag_keys: Collection[str]|None = None, packet_keys: Collection[str]|None = None, tag_typespec: TypeSpec | None = None, packet_typespec:TypeSpec|None = None): + def __init__( + self, + tag_keys: Collection[str] | None = None, + packet_keys: Collection[str] | None = None, + tag_typespec: TypeSpec | None = None, + packet_typespec: TypeSpec | None = None, + ): if tag_keys is None and tag_typespec is not None: tag_keys = tag_typespec.keys() self.tag_keys = list(tag_keys) if tag_keys else [] @@ -100,10 +114,14 @@ def __init__(self, tag_keys: Collection[str]|None = None, packet_keys: Collectio self.tag_typespec = tag_typespec self.packet_typespec = packet_typespec - def keys(self, *streams: SyncStream, trigger_run: bool = False) -> tuple[Collection[str] | None, Collection[str] | None]: + def keys( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[Collection[str] | None, Collection[str] | None]: return self.tag_keys, self.packet_keys - def types(self, *streams: SyncStream, trigger_run: bool = False) -> tuple[TypeSpec | None, TypeSpec | None]: + def types( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[TypeSpec | None, TypeSpec | None]: return self.tag_typespec, self.packet_typespec def __iter__(self) -> Iterator[tuple[Tag, Packet]]: @@ -111,15 +129,14 @@ def __iter__(self) -> Iterator[tuple[Tag, Packet]]: return iter([]) - - class KernelInvocationWrapper(Kernel): - def __init__(self, kernel: Kernel, input_streams: Collection[SyncStream], **kwargs) -> None: + def __init__( + self, kernel: Kernel, input_streams: Collection[SyncStream], **kwargs + ) -> None: super().__init__(**kwargs) self.kernel = kernel self.input_streams = list(input_streams) - def __repr__(self): return f"{self.__class__.__name__}<{self.kernel!r}>" @@ -163,7 +180,7 @@ def claims_unique_tags( ) -> bool | None: resolved_streams = self.resolve_input_streams(*streams) return self.kernel.claims_unique_tags( - *resolved_streams, trigger_run=trigger_run + *resolved_streams, trigger_run=trigger_run ) @@ -189,7 +206,7 @@ def __init__( packet_type_registry: TypeRegistry | None = None, **kwargs, ) -> None: - super().__init__(kernel, input_streams,**kwargs) + super().__init__(kernel, input_streams, **kwargs) self.output_store = output_store @@ -204,15 +221,24 @@ def __init__( packet_type_registry = 
default_registry self._packet_type_registry = packet_type_registry - self.source_info = self.label, self.kernel_hasher.hash_to_hex(self.kernel) self.tag_keys, self.packet_keys = self.keys(trigger_run=False) self.output_converter = None self._cache_computed = False + @property + def arrow_hasher(self): + return self._arrow_packet_hasher + + @property + def registry(self): + return self._packet_type_registry + @property def kernel_hasher(self) -> ObjectHasher: + if self._kernel_hasher is None: + return get_default_object_hasher() return self._kernel_hasher @kernel_hasher.setter @@ -223,6 +249,10 @@ def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): # hasher changed -- trigger recomputation of properties that depend on kernel hasher self.update_cached_values() + def update_cached_values(self): + self.source_info = self.label, self.kernel_hasher.hash_to_hex(self.kernel) + self.tag_keys, self.packet_keys = self.keys(trigger_run=False) + self.output_converter = None def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: @@ -233,7 +263,7 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.packet_keys) resolved_streams = self.resolve_input_streams(*streams) - + output_stream = self.kernel.forward(*resolved_streams, **kwargs) tag_type, packet_type = output_stream.types(trigger_run=False) @@ -279,26 +309,25 @@ def df(self) -> pl.DataFrame | None: return None return lazy_df.collect() - def reset_cache(self): self._cache_computed = False - class FunctionPodInvocationWrapper(KernelInvocationWrapper, Pod): """ Convenience class to wrap a function pod, providing default pass-through implementations """ - def __init__(self, function_pod: FunctionPod, input_streams: Collection[SyncStream], **kwargs): + def __init__( + self, function_pod: FunctionPod, input_streams: Collection[SyncStream], **kwargs + ): # note that this would be an alias to the self.kernel but here explicitly taken as function_pod # for better type hints # MRO will be KernelInvocationWrapper -> Pod -> Kernel super().__init__(function_pod, input_streams, **kwargs) self.function_pod = function_pod - def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: resolved_streams = self.resolve_input_streams(*streams) return super().forward(*resolved_streams, **kwargs) @@ -306,7 +335,6 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: return self.function_pod.call(tag, packet) - # =============pass through methods/properties to the underlying function pod============= def set_active(self, active=True): @@ -322,10 +350,6 @@ def is_active(self) -> bool: return self.function_pod.is_active() - - - - class CachedFunctionPodWrapper(FunctionPodInvocationWrapper, Source): def __init__( self, @@ -359,16 +383,15 @@ def __init__( # These are configurable but are not expected to be modified except for special circumstances # Here I'm assigning to the hidden properties directly to avoid triggering setters - if _object_hasher is None: - _object_hasher = get_default_object_hasher() - self._object_hasher = _object_hasher - if _arrow_hasher is None: - _arrow_hasher = get_default_arrow_hasher() - self._arrow_hasher = _arrow_hasher - if _registry is None: - _registry = default_registry - self._registry = _registry - + if object_hasher is None: + object_hasher = get_default_object_hasher() + self._object_hasher = object_hasher + if arrow_hasher is 
None: + arrow_hasher = get_default_arrow_hasher() + self._arrow_hasher = arrow_hasher + if registry is None: + registry = default_registry + self._registry = registry # compute and cache properties and converters for efficiency self.update_cached_values() @@ -379,7 +402,7 @@ def object_hasher(self) -> ObjectHasher: return self._object_hasher @object_hasher.setter - def object_hasher(self, object_hasher:ObjectHasher | None = None): + def object_hasher(self, object_hasher: ObjectHasher | None = None): if object_hasher is None: object_hasher = get_default_object_hasher() self._object_hasher = object_hasher @@ -391,7 +414,7 @@ def arrow_hasher(self) -> ArrowHasher: return self._arrow_hasher @arrow_hasher.setter - def arrow_hasher(self, arrow_hasher:ArrowHasher | None = None): + def arrow_hasher(self, arrow_hasher: ArrowHasher | None = None): if arrow_hasher is None: arrow_hasher = get_default_arrow_hasher() self._arrow_hasher = arrow_hasher @@ -413,7 +436,9 @@ def registry(self, registry: TypeRegistry | None = None): def update_cached_values(self) -> None: self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod) self.tag_keys, self.output_keys = self.keys(trigger_run=False) - self.input_typespec, self.output_typespec = self.function_pod.get_function_typespecs() + self.input_typespec, self.output_typespec = ( + self.function_pod.get_function_typespecs() + ) self.input_converter = PacketConverter(self.input_typespec, self.registry) self.output_converter = PacketConverter(self.output_typespec, self.registry) @@ -435,14 +460,11 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.output_keys) logger.info(f"Computing and caching outputs for {self}") - return super().forward(*streams, **kwargs) - + return super().forward(*streams, **kwargs) def get_packet_key(self, packet: Packet) -> str: # TODO: reconsider the logic around input/output converter -- who should own this? - return self.arrow_hasher.hash_table( - self.input_converter.to_arrow_table(packet) - ) + return self.arrow_hasher.hash_table(self.input_converter.to_arrow_table(packet)) @property def source_info(self): @@ -701,15 +723,15 @@ def __init__(self, kernel: Kernel, input_nodes: Collection["Node"], **kwargs): def reset_cache(self) -> None: ... - class KernelNode(Node, CachedKernelWrapper): """ A node that wraps a Kernel and provides a Node interface. This is useful for creating nodes in a pipeline that can be executed. """ + class FunctionPodNode(Node, CachedFunctionPodWrapper): """ A node that wraps a FunctionPod and provides a Node interface. This is useful for creating nodes in a pipeline that can be executed. - """ \ No newline at end of file + """ From 09f59cbcfcabb5e24e69bc64e5bd97a6340bd582 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 26 Jun 2025 19:07:10 +0000 Subject: [PATCH 011/224] refactor: clean up test of name orcabridge --- .devcontainer/Dockerfile | 2 +- src/orcapod/core/base.py | 30 +++++++++------- src/orcapod/pipeline/pipeline.py | 35 +++++++++++-------- tests/test_hashing/generate_file_hashes.py | 3 +- .../generate_pathset_packet_hashes.py | 3 +- .../test_basic_composite_hasher.py | 1 - tests/test_hashing/test_cached_file_hasher.py | 1 - tests/test_hashing/test_file_hashes.py | 3 +- tests/test_hashing/test_packet_hasher.py | 1 - tests/test_hashing/test_pathset_and_packet.py | 3 +- .../test_pathset_packet_hashes.py | 3 +- tests/test_store/conftest.py | 1 - tests/test_store/test_dir_data_store.py | 1 - tests/test_store/test_integration.py | 1 - tests/test_store/test_noop_data_store.py | 1 - tests/test_store/test_transfer_data_store.py | 1 - tests/test_types/__init__.py | 2 +- tests/test_types/test_inference/__init__.py | 2 +- 18 files changed, 46 insertions(+), 48 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index c3b180d..33e1e11 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -26,7 +26,7 @@ RUN \ USER vscode ENV PATH=/home/vscode/.local/bin:$PATH WORKDIR /home/vscode -COPY --chown=vscode:nogroup src/orcabridge/requirements.txt /tmp/requirements.txt +COPY --chown=vscode:nogroup src/orcapod/requirements.txt /tmp/requirements.txt RUN \ # python setup curl -LsSf https://astral.sh/uv/install.sh | sh && \ diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 0a99a8a..fc18b48 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -1,4 +1,4 @@ -# Collection of base classes for operations and streams in the orcabridge framework. +# Collection of base classes for operations and streams in the orcapod framework. import threading from abc import ABC, abstractmethod from collections.abc import Callable, Collection, Iterator @@ -30,12 +30,13 @@ class Kernel(ABC, ContentIdentifiableBase): for computational graph tracking. """ - def __init__(self, label: str | None = None, skip_tracking: bool = False, **kwargs) -> None: + def __init__( + self, label: str | None = None, skip_tracking: bool = False, **kwargs + ) -> None: super().__init__(**kwargs) self._label = label self._skip_tracking = skip_tracking - def pre_forward_hook( self, *streams: "SyncStream", **kwargs ) -> tuple["SyncStream", ...]: @@ -54,8 +55,9 @@ def post_forward_hook(self, output_stream: "SyncStream", **kwargs) -> "SyncStrea """ return output_stream - - def __call__(self, *streams: "SyncStream", label:str|None = None, **kwargs) -> "SyncStream": + def __call__( + self, *streams: "SyncStream", label: str | None = None, **kwargs + ) -> "SyncStream": if label is not None: self.label = label # Special handling of Source: trigger call on source if passed as stream @@ -305,7 +307,6 @@ def computed_label(self) -> str | None: # use the invocation operation label return self.invocation.kernel.label return None - @property def invocation(self) -> Invocation | None: @@ -433,7 +434,7 @@ def __len__(self) -> int: """ return sum(1 for _ in self) - def join(self, other: "SyncStream", label:str|None=None) -> "SyncStream": + def join(self, other: "SyncStream", label: str | None = None) -> "SyncStream": """ Returns a new stream that is the result of joining with the other stream. The join is performed on the tags of the packets in the streams. 
@@ -455,7 +456,12 @@ def semijoin(self, other: "SyncStream", label: str | None = None) -> "SyncStream raise TypeError("other must be a SyncStream") return SemiJoin(label=label)(self, other) - def map(self, packet_map: dict | None = None, tag_map: dict | None = None, drop_unmapped:bool=True) -> "SyncStream": + def map( + self, + packet_map: dict | None = None, + tag_map: dict | None = None, + drop_unmapped: bool = True, + ) -> "SyncStream": """ Returns a new stream that is the result of mapping the packets and tags in the stream. The mapping is applied to each packet in the stream and the resulting packets @@ -464,6 +470,7 @@ def map(self, packet_map: dict | None = None, tag_map: dict | None = None, drop_ If tag_map is None, no mapping is applied to the tags. """ from .operators import MapTags, MapPackets + output = self if packet_map is not None: output = MapPackets(packet_map, drop_unmapped=drop_unmapped)(output) @@ -472,7 +479,7 @@ def map(self, packet_map: dict | None = None, tag_map: dict | None = None, drop_ return output - def apply(self, transformer: 'dict | Operator') -> "SyncStream": + def apply(self, transformer: "dict | Operator") -> "SyncStream": """ Returns a new stream that is the result of applying the mapping to the stream. The mapping is applied to each packet in the stream and the resulting packets @@ -487,9 +494,7 @@ def apply(self, transformer: 'dict | Operator') -> "SyncStream": return transformer(self) # Otherwise, do not know how to handle the transformer - raise TypeError( - "transformer must be a dictionary or an operator" - ) + raise TypeError("transformer must be a dictionary or an operator") def __rshift__( self, transformer: dict | Callable[["SyncStream"], "SyncStream"] @@ -521,7 +526,6 @@ def __mul__(self, other: "SyncStream") -> "SyncStream": raise TypeError("other must be a SyncStream") return Join()(self, other) - def claims_unique_tags(self, *, trigger_run=False) -> bool | None: """ For synchronous streams, if the stream is generated by an operation, the invocation diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index 7edd03e..74eb998 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -31,7 +31,13 @@ class Pipeline(GraphTracker): Replaces the old Tracker with better persistence and view capabilities. 
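A hedged sketch of the intended workflow for this class, assuming two ArrowDataStore instances and a couple of FunctionPods defined elsewhere (`extract_features`, `summarize`, `raw_data`, and the store variables are hypothetical):

    pipeline = Pipeline(
        "my_analysis",
        results_store=results_store,    # ArrowDataStore for pod outputs
        pipeline_store=pipeline_store,  # ArrowDataStore for pipeline records
    )

    with pipeline:                       # invocations inside the block are tracked
        features = extract_features(raw_data)
        summary = summarize(features)
    # on exit, auto_compile=True compiles the tracked invocations into nodes

    pipeline.run()                       # flows every node in topological order
    pipeline.save("my_analysis.pipeline")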
""" - def __init__(self, name: str, results_store: ArrowDataStore, pipeline_store: ArrowDataStore, auto_compile:bool=True) -> None: + def __init__( + self, + name: str, + results_store: ArrowDataStore, + pipeline_store: ArrowDataStore, + auto_compile: bool = True, + ) -> None: super().__init__() self.name = name or f"pipeline_{id(self)}" self.results_store = results_store @@ -55,7 +61,7 @@ def save(self, path: Path | str) -> None: "metadata": { "created_at": time.time(), "python_version": sys.version_info[:2], - "orcabridge_version": "0.1.0", # You can make this dynamic + "orcapod_version": "0.1.0", # TODO: make this dynamic }, } @@ -79,22 +85,26 @@ def record(self, invocation: Invocation) -> None: super().record(invocation) self._dirty = True - def wrap_invocation( - self, kernel: Kernel, input_nodes: Collection[Node] - ) -> Node: + def wrap_invocation(self, kernel: Kernel, input_nodes: Collection[Node]) -> Node: if isinstance(kernel, FunctionPod): - return FunctionPodNode(kernel, input_nodes, output_store=self.results_store, tag_store=self.pipeline_store) + return FunctionPodNode( + kernel, + input_nodes, + output_store=self.results_store, + tag_store=self.pipeline_store, + ) return KernelNode(kernel, input_nodes, output_store=self.pipeline_store) def compile(self): import networkx as nx + G = self.generate_graph() # Proposed labels for each Kernel in the graph # If name collides, unique name is generated by appending an index proposed_labels = defaultdict(list) node_lut = {} - edge_lut : dict[SyncStream, Node]= {} + edge_lut: dict[SyncStream, Node] = {} ordered_nodes = [] for invocation in nx.topological_sort(G): # map streams to the new streams based on Nodes @@ -109,7 +119,7 @@ def compile(self): for edge in G.out_edges(invocation): edge_lut[G.edges[edge]["stream"]] = new_node - + self._ordered_nodes = ordered_nodes # resolve duplicates in proposed_labels @@ -134,18 +144,17 @@ def __exit__(self, exc_type, exc_val, ext_tb): if self.auto_compile: self.compile() - def __getattr__(self, item: str) -> Any: """Allow direct access to pipeline attributes""" if item in self.labels_to_nodes: return self.labels_to_nodes[item] raise AttributeError(f"Pipeline has no attribute '{item}'") - + def __dir__(self): # Include both regular attributes and dynamic ones return list(super().__dir__()) + list(self.labels_to_nodes.keys()) - def run(self, full_sync:bool=False) -> None: + def run(self, full_sync: bool = False) -> None: """ Run the pipeline, compiling it if necessary. This method is a no-op if auto_compile is False. @@ -158,7 +167,7 @@ def run(self, full_sync:bool=False) -> None: if full_sync: node.reset_cache() node.flow() - + @classmethod def load(cls, path: Path | str) -> "Pipeline": """Load complete pipeline state""" @@ -196,5 +205,3 @@ def _validate_serializable(self) -> None: + "\n".join(f" - {issue}" for issue in issues) + "\n\nOnly named functions are supported for serialization." ) - - diff --git a/tests/test_hashing/generate_file_hashes.py b/tests/test_hashing/generate_file_hashes.py index 1002b7f..0beb66c 100644 --- a/tests/test_hashing/generate_file_hashes.py +++ b/tests/test_hashing/generate_file_hashes.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/generate_file_hashes.py """ Generate sample files with random content and record their hashes. 
@@ -14,7 +13,7 @@ from datetime import datetime from pathlib import Path -# Add the parent directory to the path to import orcabridge +# Add the parent directory to the path to import orcapod sys.path.append(str(Path(__file__).parent.parent.parent)) from orcapod.hashing import hash_file diff --git a/tests/test_hashing/generate_pathset_packet_hashes.py b/tests/test_hashing/generate_pathset_packet_hashes.py index 61a36eb..edd804d 100644 --- a/tests/test_hashing/generate_pathset_packet_hashes.py +++ b/tests/test_hashing/generate_pathset_packet_hashes.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/generate_pathset_packet_hashes.py """ Generate sample pathsets and packets and record their hashes. @@ -11,7 +10,7 @@ import sys from pathlib import Path -# Add the parent directory to the path to import orcabridge +# Add the parent directory to the path to import orcapod sys.path.append(str(Path(__file__).parent.parent.parent)) from orcapod.hashing import hash_packet, hash_pathset diff --git a/tests/test_hashing/test_basic_composite_hasher.py b/tests/test_hashing/test_basic_composite_hasher.py index d2c5361..2ef9cf6 100644 --- a/tests/test_hashing/test_basic_composite_hasher.py +++ b/tests/test_hashing/test_basic_composite_hasher.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_default_file_hasher.py """ Test DefaultFileHasher functionality. diff --git a/tests/test_hashing/test_cached_file_hasher.py b/tests/test_hashing/test_cached_file_hasher.py index 3307628..42c9380 100644 --- a/tests/test_hashing/test_cached_file_hasher.py +++ b/tests/test_hashing/test_cached_file_hasher.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_cached_file_hasher.py """Tests for CachedFileHasher implementation.""" import json diff --git a/tests/test_hashing/test_file_hashes.py b/tests/test_hashing/test_file_hashes.py index 66ed987..1de0716 100644 --- a/tests/test_hashing/test_file_hashes.py +++ b/tests/test_hashing/test_file_hashes.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_file_hashes.py """ Test file hash consistency. @@ -12,7 +11,7 @@ import pytest -# Add the parent directory to the path to import orcabridge +# Add the parent directory to the path to import orcapod from orcapod.hashing import hash_file diff --git a/tests/test_hashing/test_packet_hasher.py b/tests/test_hashing/test_packet_hasher.py index f9d519d..69b89d0 100644 --- a/tests/test_hashing/test_packet_hasher.py +++ b/tests/test_hashing/test_packet_hasher.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_packet_hasher.py """Tests for the PacketHasher protocol implementation.""" import pytest diff --git a/tests/test_hashing/test_pathset_and_packet.py b/tests/test_hashing/test_pathset_and_packet.py index 6b7eb6f..fc00b29 100644 --- a/tests/test_hashing/test_pathset_and_packet.py +++ b/tests/test_hashing/test_pathset_and_packet.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_pathset_and_packet.py """ -Test the hash_pathset and hash_packet functions from orcabridge.hashing. +Test the hash_pathset and hash_packet functions from orcapod.hashing. 
This module contains tests to verify the correct behavior of hash_pathset and hash_packet functions with various input types and configurations. diff --git a/tests/test_hashing/test_pathset_packet_hashes.py b/tests/test_hashing/test_pathset_packet_hashes.py index 49e2d0c..7745881 100644 --- a/tests/test_hashing/test_pathset_packet_hashes.py +++ b/tests/test_hashing/test_pathset_packet_hashes.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_pathset_packet_hashes.py """ Test pathset and packet hash consistency. @@ -12,7 +11,7 @@ import pytest -# Add the parent directory to the path to import orcabridge +# Add the parent directory to the path to import orcapod from orcapod.hashing import hash_packet, hash_pathset diff --git a/tests/test_store/conftest.py b/tests/test_store/conftest.py index 77ca9f9..6b8aa6f 100644 --- a/tests/test_store/conftest.py +++ b/tests/test_store/conftest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_store/conftest.py """Common test fixtures for store tests.""" import shutil diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index c07f141..16d82d0 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_store/test_dir_data_store.py """Tests for DirDataStore.""" import json diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index 023e6e6..fc26022 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_store/test_integration.py """Integration tests for the store module.""" import os diff --git a/tests/test_store/test_noop_data_store.py b/tests/test_store/test_noop_data_store.py index 0da82c7..42606b8 100644 --- a/tests/test_store/test_noop_data_store.py +++ b/tests/test_store/test_noop_data_store.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_store/test_noop_data_store.py """Tests for NoOpDataStore.""" import pytest diff --git a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index 85d0a87..1e8d178 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_store/test_transfer_data_store.py """Tests for TransferDataStore.""" import json diff --git a/tests/test_types/__init__.py b/tests/test_types/__init__.py index aa691b1..2be2a50 100644 --- a/tests/test_types/__init__.py +++ b/tests/test_types/__init__.py @@ -1 +1 @@ -# Test package for orcabridge types module +# Test package for orcapod types module diff --git a/tests/test_types/test_inference/__init__.py b/tests/test_types/test_inference/__init__.py index 45e6baf..ae4cff0 100644 --- a/tests/test_types/test_inference/__init__.py +++ b/tests/test_types/test_inference/__init__.py @@ -1 +1 @@ -# Test package for orcabridge types inference module +# Test package for orcapod types inference module From c5fcb3d33a580262f30e56ec4671c70c00c6961e Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 27 Jun 2025 05:19:19 +0000 Subject: [PATCH 012/224] test: remove filepath specification --- tests/test_hashing/test_hasher_parity.py | 1 - tests/test_store/test_dir_data_store.py | 2 +- tests/test_store/test_integration.py | 2 +- tests/test_store/test_noop_data_store.py | 4 +- tests/test_store/test_transfer_data_store.py | 4 +- .../test_extract_function_data_types.py | 60 +++++++++---------- 6 files changed, 36 insertions(+), 37 deletions(-) diff --git a/tests/test_hashing/test_hasher_parity.py b/tests/test_hashing/test_hasher_parity.py index fb83afb..64a6004 100644 --- a/tests/test_hashing/test_hasher_parity.py +++ b/tests/test_hashing/test_hasher_parity.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# filepath: /home/eywalker/workspace/orcabridge/tests/test_hashing/test_hasher_parity.py """ Test parity between DefaultFileHasher and core hashing functions. diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index 16d82d0..32d8618 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -13,7 +13,7 @@ PacketHasher, PathSetHasher, ) -from orcapod.store.core import DirDataStore +from orcapod.store.dict_data_stores import DirDataStore class MockFileHasher(FileHasher): diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index fc26022..48e0703 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -12,7 +12,7 @@ DefaultCompositeFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.store.core import DirDataStore, NoOpDataStore +from orcapod.store.dict_data_stores import DirDataStore, NoOpDataStore def test_integration_with_cached_file_hasher(temp_dir, sample_files): diff --git a/tests/test_store/test_noop_data_store.py b/tests/test_store/test_noop_data_store.py index 42606b8..ab0eecd 100644 --- a/tests/test_store/test_noop_data_store.py +++ b/tests/test_store/test_noop_data_store.py @@ -3,7 +3,7 @@ import pytest -from orcapod.store.core import NoOpDataStore +from orcapod.store.dict_data_stores import NoOpDataStore def test_noop_data_store_memoize(): @@ -43,7 +43,7 @@ def test_noop_data_store_retrieve_memoized(): def test_noop_data_store_is_data_store_subclass(): """Test that NoOpDataStore is a subclass of DataStore.""" - from orcapod.store.core import DataStore + from orcapod.store import DataStore store = NoOpDataStore() assert isinstance(store, DataStore) diff --git a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index 1e8d178..6fd2add 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -7,8 +7,8 @@ import pytest from orcapod.hashing.types import PacketHasher -from orcapod.store.core import DirDataStore, NoOpDataStore -from orcapod.store.transfer import TransferDataStore +from orcapod.store.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.store.transfer_data_store import TransferDataStore class MockPacketHasher(PacketHasher): diff --git a/tests/test_types/test_inference/test_extract_function_data_types.py b/tests/test_types/test_inference/test_extract_function_data_types.py index a357bb0..e96fd9c 100644 --- a/tests/test_types/test_inference/test_extract_function_data_types.py +++ b/tests/test_types/test_inference/test_extract_function_data_types.py @@ -1,5 +1,5 @@ """ -Unit tests for the extract_function_data_types function. 
+Unit tests for the extract_function_typespecs function. This module tests the function type extraction functionality, covering: - Type inference from function annotations @@ -11,11 +11,11 @@ import pytest from collections.abc import Collection -from orcapod.types.inference import extract_function_data_types +from orcapod.types.typespec import extract_function_typespecs class TestExtractFunctionDataTypes: - """Test cases for extract_function_data_types function.""" + """Test cases for extract_function_typespecs function.""" def test_simple_annotated_function(self): """Test function with simple type annotations.""" @@ -23,7 +23,7 @@ def test_simple_annotated_function(self): def add(x: int, y: int) -> int: return x + y - input_types, output_types = extract_function_data_types(add, ["result"]) + input_types, output_types = extract_function_typespecs(add, ["result"]) assert input_types == {"x": int, "y": int} assert output_types == {"result": int} @@ -34,7 +34,7 @@ def test_multiple_return_values_tuple(self): def process(data: str) -> tuple[int, str]: return len(data), data.upper() - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( process, ["length", "upper_data"] ) @@ -54,7 +54,7 @@ def split_data(data: str) -> tuple[str, str]: # Note: This tests the case where we have multiple output keys # but the return type is list[str] (homogeneous) - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( split_data, ["first_word", "second_word"] ) @@ -71,7 +71,7 @@ def mystery_func(x: int): ValueError, match="Type for return item 'number' is not specified in output_types", ): - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( mystery_func, ["number", "text"], ) @@ -82,7 +82,7 @@ def test_input_types_override(self): def legacy_func(x, y) -> int: # No annotations return x + y - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( legacy_func, ["sum"], input_types={"x": int, "y": int} ) @@ -95,7 +95,7 @@ def test_partial_input_types_override(self): def mixed_func(x: int, y) -> int: # One annotated, one not return x + y - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( mixed_func, ["sum"], input_types={"y": float} ) @@ -108,7 +108,7 @@ def test_output_types_dict_override(self): def mystery_func(x: int) -> str: return str(x) - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( mystery_func, ["result"], output_types={"result": float} ) @@ -121,7 +121,7 @@ def test_output_types_sequence_override(self): def multi_return(data: list) -> tuple[int, float, str]: return len(data), sum(data), str(data) - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( multi_return, ["count", "total", "repr"], output_types=[int, float, str] ) @@ -134,7 +134,7 @@ def test_complex_types(self): def complex_func(x: str | None, y: int | float) -> tuple[bool, list[str]]: return bool(x), [x] if x else [] - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( complex_func, ["is_valid", "items"] ) @@ -147,7 +147,7 @@ def test_none_return_annotation(self): def side_effect_func(x: int) -> None: print(x) - input_types, output_types = 
extract_function_data_types(side_effect_func, []) + input_types, output_types = extract_function_typespecs(side_effect_func, []) assert input_types == {"x": int} assert output_types == {} @@ -158,7 +158,7 @@ def test_empty_parameters(self): def get_constant() -> int: return 42 - input_types, output_types = extract_function_data_types(get_constant, ["value"]) + input_types, output_types = extract_function_typespecs(get_constant, ["value"]) assert input_types == {} assert output_types == {"value": int} @@ -172,7 +172,7 @@ def bad_func(x, y: int): return x + y with pytest.raises(ValueError, match="Parameter 'x' has no type annotation"): - extract_function_data_types(bad_func, ["result"]) + extract_function_typespecs(bad_func, ["result"]) def test_return_annotation_but_no_output_keys_error(self): """Test error when function has return annotation but no output keys.""" @@ -184,7 +184,7 @@ def func_with_return(x: int) -> str: ValueError, match="Function has a return type annotation, but no return keys were specified", ): - extract_function_data_types(func_with_return, []) + extract_function_typespecs(func_with_return, []) def test_none_return_with_output_keys_error(self): """Test error when function returns None but output keys provided.""" @@ -196,7 +196,7 @@ def side_effect_func(x: int) -> None: ValueError, match="Function provides explicit return type annotation as None", ): - extract_function_data_types(side_effect_func, ["result"]) + extract_function_typespecs(side_effect_func, ["result"]) def test_single_return_multiple_keys_error(self): """Test error when single return type but multiple output keys.""" @@ -208,7 +208,7 @@ def single_return(x: int) -> str: ValueError, match="Multiple return keys were specified but return type annotation .* is not a sequence type", ): - extract_function_data_types(single_return, ["first", "second"]) + extract_function_typespecs(single_return, ["first", "second"]) def test_unparameterized_sequence_type_error(self): """Test error when return type is sequence but not parameterized.""" @@ -219,7 +219,7 @@ def bad_return(x: int) -> tuple: # tuple without types with pytest.raises( ValueError, match="is a Sequence type but does not specify item types" ): - extract_function_data_types(bad_return, ["number", "text"]) + extract_function_typespecs(bad_return, ["number", "text"]) def test_mismatched_return_types_count_error(self): """Test error when return type count doesn't match output keys count.""" @@ -230,7 +230,7 @@ def three_returns(x: int) -> tuple[int, str, float]: with pytest.raises( ValueError, match="has 3 items, but output_keys has 2 items" ): - extract_function_data_types(three_returns, ["first", "second"]) + extract_function_typespecs(three_returns, ["first", "second"]) def test_mismatched_output_types_sequence_length_error(self): """Test error when output_types sequence length doesn't match output_keys.""" @@ -242,7 +242,7 @@ def func(x: int) -> tuple[int, str]: ValueError, match="Output types collection length .* does not match return keys length", ): - extract_function_data_types( + extract_function_typespecs( func, ["first", "second"], output_types=[int, str, float], # Wrong length @@ -258,7 +258,7 @@ def no_return_annotation(x: int): ValueError, match="Type for return item 'first' is not specified in output_types", ): - extract_function_data_types(no_return_annotation, ["first", "second"]) + extract_function_typespecs(no_return_annotation, ["first", "second"]) # Edge cases @@ -268,7 +268,7 @@ def test_callable_with_args_kwargs(self): def 
flexible_func(x: int, *args: str, **kwargs: float) -> bool: return True - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( flexible_func, ["success"] ) @@ -284,7 +284,7 @@ def test_mixed_override_scenarios(self): def complex_func(a, b: str) -> tuple[int, str]: return len(b), b.upper() - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( complex_func, ["length", "upper"], input_types={"a": float}, @@ -300,7 +300,7 @@ def test_generic_types(self): def generic_func(data: list[int]) -> dict[str, int]: return {str(i): i for i in data} - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( generic_func, ["mapping"] ) @@ -316,7 +316,7 @@ def list_func( return str(x), x # This tests the sequence detection logic - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( list_func, ["text", "number"] ) @@ -330,7 +330,7 @@ def collection_func(x: int) -> Collection[str]: return [str(x)] # Single output key with Collection type - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( collection_func, ["result"] ) @@ -347,7 +347,7 @@ def test_empty_function(self): def empty_func(): pass - input_types, output_types = extract_function_data_types(empty_func, []) + input_types, output_types = extract_function_typespecs(empty_func, []) assert input_types == {} assert output_types == {} @@ -364,7 +364,7 @@ class Container(Generic[T]): def generic_container_func(x: Container[int]) -> Container[str]: return Container() - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( generic_container_func, ["result"] ) @@ -377,7 +377,7 @@ def test_output_types_dict_partial_override(self): def three_output_func() -> tuple[int, str, float]: return 1, "hello", 3.14 - input_types, output_types = extract_function_data_types( + input_types, output_types = extract_function_typespecs( three_output_func, ["num", "text", "decimal"], output_types={"text": bytes}, # Override only middle one From 22215ca49badc65ab894289fefad6ea7712455dc Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 27 Jun 2025 05:19:37 +0000 Subject: [PATCH 013/224] fix: remove orcabridge reference --- misc/demo_redis_mocking.py | 6 +++--- src/orcapod/types/__init__.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/misc/demo_redis_mocking.py b/misc/demo_redis_mocking.py index cc18dcb..2fd1f92 100644 --- a/misc/demo_redis_mocking.py +++ b/misc/demo_redis_mocking.py @@ -72,10 +72,10 @@ def demonstrate_redis_mocking(): # Patch the Redis availability and exceptions with ( - patch("orcabridge.hashing.string_cachers.REDIS_AVAILABLE", True), - patch("orcabridge.hashing.string_cachers.redis.RedisError", MockRedisError), + patch("orcapod.hashing.string_cachers.REDIS_AVAILABLE", True), + patch("orcapod.hashing.string_cachers.redis.RedisError", MockRedisError), patch( - "orcabridge.hashing.string_cachers.redis.ConnectionError", + "orcapod.hashing.string_cachers.redis.ConnectionError", MockConnectionError, ), ): diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index cbcfffc..e51a6f8 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,4 +1,3 @@ -# src/orcabridge/types.py from .core import Tag, Packet, TypeSpec, PathLike, PathSet, PodFunction from .registry import TypeRegistry from .handlers import PathHandler, UUIDHandler, DateTimeHandler From 56d559a3613d916bb8d56bc04509bb7d984e53ba Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:41:04 +0000 Subject: [PATCH 014/224] refactor: rename module to match class --- ...nt_hashable.py => content_identifiable.py} | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) rename src/orcapod/hashing/{content_hashable.py => content_identifiable.py} (88%) diff --git a/src/orcapod/hashing/content_hashable.py b/src/orcapod/hashing/content_identifiable.py similarity index 88% rename from src/orcapod/hashing/content_hashable.py rename to src/orcapod/hashing/content_identifiable.py index 61eb0e5..1581e62 100644 --- a/src/orcapod/hashing/content_hashable.py +++ b/src/orcapod/hashing/content_identifiable.py @@ -1,22 +1,27 @@ - from .types import ObjectHasher from .defaults import get_default_object_hasher from typing import Any class ContentIdentifiableBase: - def __init__(self, identity_structure_hasher: ObjectHasher | None = None, label: str | None = None) -> None: + def __init__( + self, + identity_structure_hasher: ObjectHasher | None = None, + label: str | None = None, + ) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. Args: identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. """ - self.identity_structure_hasher = identity_structure_hasher or get_default_object_hasher() + self.identity_structure_hasher = ( + identity_structure_hasher or get_default_object_hasher() + ) self._label = label @property - def label(self) -> str : + def label(self) -> str: """ Get the label of this object. @@ -35,13 +40,12 @@ def label(self, label: str | None) -> None: """ self._label = label - def computed_label(self) -> str|None: + def computed_label(self) -> str | None: """ Compute a label for this object based on its content. If label is not explicitly set for this object and computed_label returns a valid value, it will be used as label of this object. 
""" return None - def identity_structure(self) -> Any: """ @@ -56,7 +60,6 @@ def identity_structure(self) -> Any: """ return None - def __hash__(self) -> int: """ Hash implementation that uses the identity structure if provided, @@ -72,7 +75,7 @@ def __hash__(self) -> int: return super().__hash__() return self.identity_structure_hasher.hash_to_int(structure) - + def __eq__(self, other: object) -> bool: """ Equality check that compares the identity structures of two objects. @@ -86,4 +89,4 @@ def __eq__(self, other: object) -> bool: if not isinstance(other, ContentIdentifiableBase): return NotImplemented - return self.identity_structure() == other.identity_structure() \ No newline at end of file + return self.identity_structure() == other.identity_structure() From 59ad526334f977180820fa476a740972740f7e51 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:41:26 +0000 Subject: [PATCH 015/224] refactor: move core to legacy_core --- src/orcapod/hashing/__init__.py | 4 ++-- .../hashing/{core.py => legacy_core.py} | 22 ++++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) rename src/orcapod/hashing/{core.py => legacy_core.py} (98%) diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index d3d83e9..2354696 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -1,4 +1,4 @@ -from .core import ( +from .legacy_core import ( HashableMixin, function_content_hash, get_function_signature, @@ -24,7 +24,7 @@ FunctionInfoExtractor, CompositeFileHasher, ) -from .content_hashable import ContentIdentifiableBase +from .content_identifiable import ContentIdentifiableBase __all__ = [ "FileHasher", diff --git a/src/orcapod/hashing/core.py b/src/orcapod/hashing/legacy_core.py similarity index 98% rename from src/orcapod/hashing/core.py rename to src/orcapod/hashing/legacy_core.py index 08fd812..cfe9c56 100644 --- a/src/orcapod/hashing/core.py +++ b/src/orcapod/hashing/legacy_core.py @@ -5,7 +5,8 @@ A library for creating stable, content-based hashes that remain consistent across Python sessions, suitable for arbitrarily nested data structures and custom objects via HashableMixin. """ -WARN_NONE_IDENTITY=False + +WARN_NONE_IDENTITY = False import hashlib import inspect import json @@ -436,11 +437,16 @@ def process_structure( if isinstance(obj, HashableMixin): logger.debug(f"Processing HashableMixin instance of type {type(obj).__name__}") return obj.content_hash() - - from .content_hashable import ContentIdentifiableBase + + from .content_identifiable import ContentIdentifiableBase + if isinstance(obj, ContentIdentifiableBase): - logger.debug(f"Processing ContentHashableBase instance of type {type(obj).__name__}") - return process_structure(obj.identity_structure(), visited, function_info_extractor) + logger.debug( + f"Processing ContentHashableBase instance of type {type(obj).__name__}" + ) + return process_structure( + obj.identity_structure(), visited, function_info_extractor + ) # Handle basic types if isinstance(obj, (str, int, float, bool)): @@ -838,7 +844,7 @@ def get_function_signature( name_override: str | None = None, include_defaults: bool = True, include_module: bool = True, - output_names: Collection[str] | None = None + output_names: Collection[str] | None = None, ) -> str: """ Get a stable string representation of a function's signature. 
@@ -877,9 +883,9 @@ def get_function_signature( if sig.return_annotation is not inspect.Signature.empty: parts["returns"] = sig.return_annotation - fn_string = f"{parts["module"] + "." if "module" in parts else ""}{parts["name"]}{parts["params"]}" + fn_string = f"{parts['module'] + '.' if 'module' in parts else ''}{parts['name']}{parts['params']}" if "returns" in parts: - fn_string = fn_string + f"-> {str(parts["returns"])}" + fn_string = fn_string + f"-> {str(parts['returns'])}" return fn_string From 3e0cdf40ab9669067c50f015f1c0f789ae2603a4 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:41:57 +0000 Subject: [PATCH 016/224] fix: update reference to core --- src/orcapod/store/safe_dir_data_store.py | 2 +- tests/test_hashing/test_basic_hashing.py | 2 +- tests/test_hashing/test_composite_hasher.py | 2 +- tests/test_hashing/test_path_set_hasher.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/orcapod/store/safe_dir_data_store.py b/src/orcapod/store/safe_dir_data_store.py index 0f0ce6a..7e16f63 100644 --- a/src/orcapod/store/safe_dir_data_store.py +++ b/src/orcapod/store/safe_dir_data_store.py @@ -205,7 +205,7 @@ def __init__( def _get_output_dir(self, function_name, content_hash, packet): """Get the output directory for a specific packet""" - from orcapod.hashing.core import hash_dict + from orcapod.hashing.legacy_core import hash_dict packet_hash = hash_dict(packet) return self.store_dir / function_name / content_hash / str(packet_hash) diff --git a/tests/test_hashing/test_basic_hashing.py b/tests/test_hashing/test_basic_hashing.py index df90a1a..c67723a 100644 --- a/tests/test_hashing/test_basic_hashing.py +++ b/tests/test_hashing/test_basic_hashing.py @@ -1,4 +1,4 @@ -from orcapod.hashing.core import ( +from orcapod.hashing.legacy_core import ( HashableMixin, hash_to_hex, hash_to_int, diff --git a/tests/test_hashing/test_composite_hasher.py b/tests/test_hashing/test_composite_hasher.py index 1cbe386..f92cfea 100644 --- a/tests/test_hashing/test_composite_hasher.py +++ b/tests/test_hashing/test_composite_hasher.py @@ -5,7 +5,7 @@ import pytest -from orcapod.hashing.core import hash_to_hex +from orcapod.hashing.legacy_core import hash_to_hex from orcapod.hashing.file_hashers import BasicFileHasher, DefaultCompositeFileHasher from orcapod.hashing.types import FileHasher, PacketHasher, PathSetHasher diff --git a/tests/test_hashing/test_path_set_hasher.py b/tests/test_hashing/test_path_set_hasher.py index 999cc2a..65e626a 100644 --- a/tests/test_hashing/test_path_set_hasher.py +++ b/tests/test_hashing/test_path_set_hasher.py @@ -8,7 +8,7 @@ import pytest -import orcapod.hashing.core +import orcapod.hashing.legacy_core from orcapod.hashing.file_hashers import DefaultPathsetHasher from orcapod.hashing.types import FileHasher @@ -35,7 +35,7 @@ def create_temp_file(content="test content"): # Store original function for restoration -original_hash_pathset = orcapod.hashing.core.hash_pathset +original_hash_pathset = orcapod.hashing.legacy_core.hash_pathset # Custom implementation of hash_pathset for tests that doesn't check for file existence @@ -46,7 +46,7 @@ def mock_hash_pathset( from collections.abc import Collection from os import PathLike - from orcapod.hashing.core import hash_to_hex + from orcapod.hashing.legacy_core import hash_to_hex from orcapod.utils.name import find_noncolliding_name # If file_hasher is None, we'll need to handle it differently From 50e07722d07cb19b3c5a1ab555c7acf0783bbd61 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 27 Jun 2025 07:42:21 +0000 Subject: [PATCH 017/224] refactor: rename semantic arrow hasher module to generic arrow hashers --- ...antic_arrow_hasher.py => arrow_hashers.py} | 81 +------------------ 1 file changed, 2 insertions(+), 79 deletions(-) rename src/orcapod/hashing/{semantic_arrow_hasher.py => arrow_hashers.py} (70%) diff --git a/src/orcapod/hashing/semantic_arrow_hasher.py b/src/orcapod/hashing/arrow_hashers.py similarity index 70% rename from src/orcapod/hashing/semantic_arrow_hasher.py rename to src/orcapod/hashing/arrow_hashers.py index f3682ed..728b904 100644 --- a/src/orcapod/hashing/semantic_arrow_hasher.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -1,87 +1,10 @@ import hashlib -import os -from typing import Any, Protocol -from abc import ABC, abstractmethod +from typing import Any import pyarrow as pa import pyarrow.ipc as ipc from io import BytesIO import polars as pl - - -class SemanticTypeHasher(Protocol): - """Abstract base class for semantic type-specific hashers.""" - - @abstractmethod - def hash_column(self, column: pa.Array) -> bytes: - """Hash a column with this semantic type and return the hash bytes.""" - pass - - -class PathHasher(SemanticTypeHasher): - """Hasher for Path semantic type columns - hashes file contents.""" - - def __init__(self, chunk_size: int = 8192, handle_missing: str = "error"): - """ - Initialize PathHasher. - - Args: - chunk_size: Size of chunks to read files in bytes - handle_missing: How to handle missing files ('error', 'skip', 'null_hash') - """ - self.chunk_size = chunk_size - self.handle_missing = handle_missing - - def _hash_file_content(self, file_path: str) -> str: - """Hash the content of a single file and return hex string.""" - import os - - try: - if not os.path.exists(file_path): - if self.handle_missing == "error": - raise FileNotFoundError(f"File not found: {file_path}") - elif self.handle_missing == "skip": - return hashlib.sha256(b"").hexdigest() - elif self.handle_missing == "null_hash": - return hashlib.sha256(b"").hexdigest() - - hasher = hashlib.sha256() - - # Read file in chunks to handle large files efficiently - with open(file_path, "rb") as f: - while chunk := f.read(self.chunk_size): - hasher.update(chunk) - - return hasher.hexdigest() - - except (IOError, OSError, PermissionError) as e: - if self.handle_missing == "error": - raise IOError(f"Cannot read file {file_path}: {e}") - else: # skip or null_hash - error_msg = f"" - return hashlib.sha256(error_msg.encode("utf-8")).hexdigest() - - def hash_column(self, column: pa.Array) -> pa.Array: - """ - Replace path column with file content hashes. - Returns a new array where each path is replaced with its file content hash. - """ - - # Convert to python list for processing - paths = column.to_pylist() - - # Hash each file's content individually - content_hashes = [] - for path in paths: - if path is not None: - # Normalize path for consistency - normalized_path = os.path.normpath(str(path)) - file_content_hash = self._hash_file_content(normalized_path) - content_hashes.append(file_content_hash) - else: - content_hashes.append(None) # Preserve nulls - - # Return new array with content hashes instead of paths - return pa.array(content_hashes) +from .types import SemanticTypeHasher class SemanticArrowHasher: From 33103b8668602a1b66f94a005b9d0f0455dc1c08 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 27 Jun 2025 07:44:04 +0000 Subject: [PATCH 018/224] refactor: rename variables to typespec --- src/orcapod/hashing/function_info_extractors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/orcapod/hashing/function_info_extractors.py b/src/orcapod/hashing/function_info_extractors.py index 2c32f05..816208b 100644 --- a/src/orcapod/hashing/function_info_extractors.py +++ b/src/orcapod/hashing/function_info_extractors.py @@ -14,8 +14,8 @@ def extract_function_info( self, func: Callable[..., Any], function_name: str | None = None, - input_types: TypeSpec | None = None, - output_types: TypeSpec | None = None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | None = None, ) -> dict[str, Any]: if not callable(func): raise TypeError("Provided object is not callable") @@ -38,8 +38,8 @@ def extract_function_info( self, func: Callable[..., Any], function_name: str | None = None, - input_types: TypeSpec | None = None, - output_types: TypeSpec | None = None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | None = None, ) -> dict[str, Any]: if not callable(func): raise TypeError("Provided object is not callable") From e35b024ca0d4e9cea92d4a178671e2d12dd1ffda Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:44:29 +0000 Subject: [PATCH 019/224] feat: collect refined hashing functions --- src/orcapod/hashing/hash_utils.py | 304 ++++++++++++++++++++++++++++++ 1 file changed, 304 insertions(+) create mode 100644 src/orcapod/hashing/hash_utils.py diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py new file mode 100644 index 0000000..df2d435 --- /dev/null +++ b/src/orcapod/hashing/hash_utils.py @@ -0,0 +1,304 @@ +from typing import Any +from .function_info_extractors import FunctionInfoExtractor +import logging +import json +from uuid import UUID +from pathlib import Path +from collections.abc import Mapping, Collection +import hashlib +import xxhash +import zlib + +logger = logging.getLogger(__name__) + + +def serialize_through_json(processed_obj) -> bytes: + """ + Create a deterministic string representation of a processed object structure. + + Args: + processed_obj: The processed object to serialize + + Returns: + A bytes object ready for hashing + """ + # TODO: add type check of processed obj + return json.dumps(processed_obj, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + + +def process_structure( + obj: Any, + visited: set[int] | None = None, + function_info_extractor: FunctionInfoExtractor | None = None, + force_hash: bool = False, +) -> Any: + """ + Recursively process a structure to prepare it for hashing. 
+ + Args: + obj: The object or structure to process + visited: Set of object ids already visited (to handle circular references) + function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation + + Returns: + A processed version of the structure suitable for stable hashing + """ + # Initialize the visited set if this is the top-level call + if visited is None: + visited = set() + + # Check for circular references - use object's memory address + # NOTE: While id() is not stable across sessions, we only use it within a session + # to detect circular references, not as part of the final hash + obj_id = id(obj) + if obj_id in visited: + logger.debug( + f"Detected circular reference for object of type {type(obj).__name__}" + ) + return "CircularRef" # Don't include the actual id in hash output + + # For objects that could contain circular references, add to visited + if isinstance(obj, (dict, list, tuple, set)) or not isinstance( + obj, (str, int, float, bool, type(None)) + ): + visited.add(obj_id) + + # Handle None + if obj is None: + return None + + from .content_identifiable import ContentIdentifiableBase + + if isinstance(obj, ContentIdentifiableBase): + logger.debug( + f"Processing ContentHashableBase instance of type {type(obj).__name__}" + ) + # replace the object with expanded identity structure and re-process + return process_structure( + obj.identity_structure(), visited, function_info_extractor + ) + + # Handle basic types + if isinstance(obj, (str, int, float, bool)): + return obj + + # Handle bytes and bytearray + if isinstance(obj, (bytes, bytearray)): + logger.debug( + f"Converting bytes/bytearray of length {len(obj)} to hex representation" + ) + return obj.hex() + + # Handle Path objects + if isinstance(obj, Path): + logger.debug(f"Converting Path object to string: {obj}") + return str(obj) + + # Handle UUID objects + if isinstance(obj, UUID): + logger.debug(f"Converting UUID to string: {obj}") + return str(obj) + + # Handle named tuples (which are subclasses of tuple) + if hasattr(obj, "_fields") and isinstance(obj, tuple): + logger.debug(f"Processing named tuple of type {type(obj).__name__}") + # For namedtuples, convert to dict and then process + d = {field: getattr(obj, field) for field in obj._fields} # type: ignore + return process_structure(d, visited, function_info_extractor) + + # Handle mappings (dict-like objects) + if isinstance(obj, Mapping): + # Process both keys and values + processed_items = [ + ( + process_structure(k, visited, function_info_extractor), + process_structure(v, visited, function_info_extractor), + ) + for k, v in obj.items() + ] + + # Sort by the processed keys for deterministic order + processed_items.sort(key=lambda x: str(x[0])) + + # Create a new dictionary with string keys based on processed keys + # TODO: consider checking for possibly problematic values in processed_k + # and issue a warning + return { + str(processed_k): processed_v + for processed_k, processed_v in processed_items + } + + # Handle sets and frozensets + if isinstance(obj, (set, frozenset)): + logger.debug( + f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" + ) + # Process each item first, then sort the processed results + processed_items = [ + process_structure(item, visited, function_info_extractor) for item in obj + ] + return sorted(processed_items, key=str) + + # Handle collections (list-like objects) + if isinstance(obj, Collection): + logger.debug( + f"Processing collection of type 
{type(obj).__name__} with {len(obj)} items" + ) + return [ + process_structure(item, visited, function_info_extractor) for item in obj + ] + + # For functions, use the function_content_hash + if callable(obj) and hasattr(obj, "__code__"): + logger.debug(f"Processing function: {getattr(obj, '__name__')}") + if function_info_extractor is not None: + # Use the extractor to get a stable representation + function_info = function_info_extractor.extract_function_info(obj) + logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") + + # simply return the function info as a stable representation + return function_info + else: + raise ValueError( + f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" + ) + + # For other objects, attempt to create deterministic representation only if force_hash=True + class_name = obj.__class__.__name__ + module_name = obj.__class__.__module__ + if force_hash: + try: + import re + + logger.debug( + f"Processing generic object of type {module_name}.{class_name}" + ) + + # Try to get a stable dict representation if possible + if hasattr(obj, "__dict__"): + # Sort attributes to ensure stable order + attrs = sorted( + (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") + ) + # Limit to first 10 attributes to avoid extremely long representations + if len(attrs) > 10: + logger.debug( + f"Object has {len(attrs)} attributes, limiting to first 10" + ) + attrs = attrs[:10] + attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] + obj_repr = f"{{{', '.join(attr_strs)}}}" + else: + # Get basic repr but remove memory addresses + logger.debug( + "Object has no __dict__, using repr() with memory address removal" + ) + obj_repr = repr(obj) + if len(obj_repr) > 1000: + logger.debug( + f"Object repr is {len(obj_repr)} chars, truncating to 1000" + ) + obj_repr = obj_repr[:1000] + "..." + # Remove memory addresses which look like '0x7f9a1c2b3d4e' + obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) + + return f"{module_name}.{class_name}-{obj_repr}" + except Exception as e: + # Last resort - use class name only + logger.warning(f"Failed to process object representation: {e}") + try: + return f"Object-{obj.__class__.__module__}.{obj.__class__.__name__}" + except AttributeError: + logger.error("Could not determine object class, using UnknownObject") + return "UnknownObject" + else: + raise ValueError( + f"Processing of {obj} of type {module_name}.{class_name} is not supported" + ) + + +def hash_object( + obj: Any, + function_info_extractor: FunctionInfoExtractor | None = None, +) -> bytes: + # Process the object to handle nested structures and HashableMixin instances + processed = process_structure(obj, function_info_extractor=function_info_extractor) + + # Serialize the processed structure + json_str = json.dumps(processed, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + logger.debug( + f"Successfully serialized {type(obj).__name__} using custom serializer" + ) + + # Create the hash + return hashlib.sha256(json_str).digest() + + +def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: + """ + Calculate the hash of a file using the specified algorithm. 
+ + Parameters: + file_path (str): Path to the file to hash + algorithm (str): Hash algorithm to use - options include: + 'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path' + buffer_size (int): Size of chunks to read from the file at a time + + Returns: + str: Hexadecimal digest of the hash + """ + # Verify the file exists + if not Path(file_path).is_file(): + raise FileNotFoundError(f"The file {file_path} does not exist") + + # Handle special case for 'hash_path' algorithm + if algorithm == "hash_path": + # Hash the name of the file instead of its content + # This is useful for cases where the file content is well known or + # not relevant + hasher = hashlib.sha256() + hasher.update(file_path.encode("utf-8")) + return hasher.digest() + + # Handle non-cryptographic hash functions + if algorithm == "xxh64": + hasher = xxhash.xxh64() + with open(file_path, "rb") as file: + while True: + data = file.read(buffer_size) + if not data: + break + hasher.update(data) + return hasher.digest() + + if algorithm == "crc32": + crc = 0 + with open(file_path, "rb") as file: + while True: + data = file.read(buffer_size) + if not data: + break + crc = zlib.crc32(data, crc) + return (crc & 0xFFFFFFFF).to_bytes(4, byteorder="big") + + # Handle cryptographic hash functions from hashlib + try: + hasher = hashlib.new(algorithm) + except ValueError: + valid_algorithms = ", ".join(sorted(hashlib.algorithms_available)) + raise ValueError( + f"Invalid algorithm: {algorithm}. Available algorithms: {valid_algorithms}, xxh64, crc32" + ) + + with open(file_path, "rb") as file: + while True: + data = file.read(buffer_size) + if not data: + break + hasher.update(data) + + return hasher.digest() From 02412d08b4834a2d2c183b538a985aa277a7318a Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:44:51 +0000 Subject: [PATCH 020/224] feat: collect semantic type hashsers into a module --- src/orcapod/hashing/semantic_type_hashers.py | 64 ++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 src/orcapod/hashing/semantic_type_hashers.py diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py new file mode 100644 index 0000000..36dfd53 --- /dev/null +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -0,0 +1,64 @@ +from .types import SemanticTypeHasher, FileHasher +import os +import hashlib +import pyarrow as pa + + +class PathHasher(SemanticTypeHasher): + """Hasher for Path semantic type columns - hashes file contents.""" + + def __init__(self, file_hasher: FileHasher, handle_missing: str = "error"): + """ + Initialize PathHasher. 
+ + Args: + chunk_size: Size of chunks to read files in bytes + handle_missing: How to handle missing files ('error', 'skip', 'null_hash') + """ + self.file_hasher = file_hasher + self.handle_missing = handle_missing + + def _hash_file_content(self, file_path: str) -> str: + """Hash the content of a single file and return hex string.""" + import os + + try: + if not os.path.exists(file_path): + if self.handle_missing == "error": + raise FileNotFoundError(f"File not found: {file_path}") + elif self.handle_missing == "skip": + return hashlib.sha256(b"").hexdigest() + elif self.handle_missing == "null_hash": + return hashlib.sha256(b"").hexdigest() + + return self.file_hasher.hash_file(file_path).hex() + + except (IOError, OSError, PermissionError) as e: + if self.handle_missing == "error": + raise IOError(f"Cannot read file {file_path}: {e}") + else: # skip or null_hash + error_msg = f"" + return hashlib.sha256(error_msg.encode("utf-8")).hexdigest() + + def hash_column(self, column: pa.Array) -> pa.Array: + """ + Replace path column with file content hashes. + Returns a new array where each path is replaced with its file content hash. + """ + + # Convert to python list for processing + paths = column.to_pylist() + + # Hash each file's content individually + content_hashes = [] + for path in paths: + if path is not None: + # Normalize path for consistency + normalized_path = os.path.normpath(str(path)) + file_content_hash = self._hash_file_content(normalized_path) + content_hashes.append(file_content_hash) + else: + content_hashes.append(None) # Preserve nulls + + # Return new array with content hashes instead of paths + return pa.array(content_hashes) From 1e9067968173812f4eaf6e0e71be83e282ea375f Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:45:19 +0000 Subject: [PATCH 021/224] refactor: make file hasher return bytes --- src/orcapod/hashing/file_hashers.py | 75 ++++++++++++++++++++--------- src/orcapod/hashing/types.py | 17 ++++--- 2 files changed, 61 insertions(+), 31 deletions(-) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index 77833ee..58076ac 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,4 +1,5 @@ -from orcapod.hashing.core import hash_file, hash_pathset, hash_packet +from orcapod.hashing import legacy_core +from orcapod.hashing.hash_utils import hash_file from orcapod.hashing.types import ( FileHasher, PathSetHasher, @@ -8,8 +9,6 @@ from orcapod.types import Packet, PathLike, PathSet -# Completely unnecessary to inherit from FileHasher, but this -# allows for type checking based on isinstance class BasicFileHasher: """Basic implementation for file hashing.""" @@ -21,7 +20,7 @@ def __init__( self.algorithm = algorithm self.buffer_size = buffer_size - def hash_file(self, file_path: PathLike) -> str: + def hash_file(self, file_path: PathLike) -> bytes: return hash_file( file_path, algorithm=self.algorithm, buffer_size=self.buffer_size ) @@ -38,18 +37,38 @@ def __init__( self.file_hasher = file_hasher self.string_cacher = string_cacher - def hash_file(self, file_path: PathLike) -> str: + def hash_file(self, file_path: PathLike) -> bytes: cache_key = f"file:{file_path}" cached_value = self.string_cacher.get_cached(cache_key) if cached_value is not None: - return cached_value + return bytes.fromhex(cached_value) value = self.file_hasher.hash_file(file_path) - self.string_cacher.set_cached(cache_key, value) + self.string_cacher.set_cached(cache_key, value.hex()) return 
value -class DefaultPathsetHasher: +# ----------------Legacy implementations for backward compatibility----------------- + + +class LegacyFileHasher: + def __init__( + self, + algorithm: str = "sha256", + buffer_size: int = 65536, + ): + self.algorithm = algorithm + self.buffer_size = buffer_size + + def hash_file(self, file_path: PathLike) -> bytes: + return bytes.fromhex( + legacy_core.hash_file( + file_path, algorithm=self.algorithm, buffer_size=self.buffer_size + ), + ) + + +class LegacyPathsetHasher: """Default pathset hasher that composes file hashing.""" def __init__( @@ -60,16 +79,21 @@ def __init__( self.file_hasher = file_hasher self.char_count = char_count - def hash_pathset(self, pathset: PathSet) -> str: + def _hash_file_to_hex(self, file_path: PathLike) -> str: + return self.file_hasher.hash_file(file_path).hex() + + def hash_pathset(self, pathset: PathSet) -> bytes: """Hash a pathset using the injected file hasher.""" - return hash_pathset( - pathset, - char_count=self.char_count, - file_hasher=self.file_hasher.hash_file, # Inject the method + return bytes.fromhex( + legacy_core.hash_pathset( + pathset, + char_count=self.char_count, + file_hasher=self._hash_file_to_hex, # Inject the method + ) ) -class DefaultPacketHasher: +class LegacyPacketHasher: """Default packet hasher that composes pathset hashing.""" def __init__( @@ -82,19 +106,22 @@ def __init__( self.char_count = char_count self.prefix = prefix + def _hash_pathset_to_hex(self, pathset: PathSet): + return self.pathset_hasher.hash_pathset(pathset).hex() + def hash_packet(self, packet: Packet) -> str: """Hash a packet using the injected pathset hasher.""" - hash_str = hash_packet( + hash_str = legacy_core.hash_packet( packet, char_count=self.char_count, prefix_algorithm=False, # Will apply prefix on our own - pathset_hasher=self.pathset_hasher.hash_pathset, # Inject the method + pathset_hasher=self._hash_pathset_to_hex, # Inject the method ) return f"{self.prefix}-{hash_str}" if self.prefix else hash_str # Convenience composite implementation -class DefaultCompositeFileHasher: +class LegacyCompositeFileHasher: """Composite hasher that implements all interfaces.""" def __init__( @@ -104,15 +131,15 @@ def __init__( packet_prefix: str = "", ): self.file_hasher = file_hasher - self.pathset_hasher = DefaultPathsetHasher(self.file_hasher, char_count) - self.packet_hasher = DefaultPacketHasher( + self.pathset_hasher = LegacyPathsetHasher(self.file_hasher, char_count) + self.packet_hasher = LegacyPacketHasher( self.pathset_hasher, char_count, packet_prefix ) - def hash_file(self, file_path: PathLike) -> str: + def hash_file(self, file_path: PathLike) -> bytes: return self.file_hasher.hash_file(file_path) - def hash_pathset(self, pathset: PathSet) -> str: + def hash_pathset(self, pathset: PathSet) -> bytes: return self.pathset_hasher.hash_pathset(pathset) def hash_packet(self, packet: Packet) -> str: @@ -120,7 +147,7 @@ def hash_packet(self, packet: Packet) -> str: # Factory for easy construction -class PathLikeHasherFactory: +class LegacyPathLikeHasherFactory: """Factory for creating various hasher combinations.""" @staticmethod @@ -132,7 +159,7 @@ def create_basic_composite( """Create a basic composite hasher.""" file_hasher = BasicFileHasher(algorithm, buffer_size) # use algorithm as the prefix for the packet hasher - return DefaultCompositeFileHasher( + return LegacyCompositeFileHasher( file_hasher, char_count, packet_prefix=algorithm ) @@ -146,7 +173,7 @@ def create_cached_composite( """Create a composite hasher with file 
caching.""" basic_file_hasher = BasicFileHasher(algorithm, buffer_size) cached_file_hasher = CachedFileHasher(basic_file_hasher, string_cacher) - return DefaultCompositeFileHasher( + return LegacyCompositeFileHasher( cached_file_hasher, char_count, packet_prefix=algorithm ) diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index abae409..310b5a2 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -29,6 +29,7 @@ def identity_structure(self) -> Any: class ObjectHasher(ABC): """Abstract class for general object hashing.""" + # TODO: consider more explicitly stating types of objects accepted @abstractmethod def hash(self, obj: Any) -> bytes: """ @@ -81,7 +82,7 @@ def hash_to_uuid( class FileHasher(Protocol): """Protocol for file-related hashing.""" - def hash_file(self, file_path: PathLike) -> str: ... + def hash_file(self, file_path: PathLike) -> bytes: ... # Higher-level operations that compose file hashing @@ -89,12 +90,7 @@ def hash_file(self, file_path: PathLike) -> str: ... class PathSetHasher(Protocol): """Protocol for hashing pathsets (files, directories, collections).""" - def hash_pathset(self, pathset: PathSet) -> str: ... - - -@runtime_checkable -class SemanticHasher(Protocol): - pass + def hash_pathset(self, pathset: PathSet) -> bytes: ... @runtime_checkable @@ -142,3 +138,10 @@ def extract_function_info( ) -> dict[str, Any]: ... +class SemanticTypeHasher(Protocol): + """Abstract base class for semantic type-specific hashers.""" + + @abstractmethod + def hash_column(self, column: pa.Array) -> list[bytes]: + """Hash a column with this semantic type and return the hash bytes.""" + pass From 78fdead1c5c84cd08a6f03789eb61276e014d1a9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:46:03 +0000 Subject: [PATCH 022/224] feat: add new defaut object hasher --- src/orcapod/hashing/defaults.py | 34 ++++++++++++++++++--------- src/orcapod/hashing/object_hashers.py | 30 ++++++++++++++++++++--- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 85e1405..1f3aca2 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -1,26 +1,27 @@ # A collection of utility function that provides a "default" implementation of hashers. # This is often used as the fallback hasher in the library code. 
-from orcapod.hashing.types import CompositeFileHasher, ArrowHasher -from orcapod.hashing.file_hashers import PathLikeHasherFactory +from orcapod.hashing.types import CompositeFileHasher, ArrowHasher, FileHasher +from orcapod.hashing.file_hashers import BasicFileHasher, LegacyPathLikeHasherFactory from orcapod.hashing.string_cachers import InMemoryCacher from orcapod.hashing.object_hashers import ObjectHasher -from orcapod.hashing.object_hashers import LegacyObjectHasher +from orcapod.hashing.object_hashers import DefaultObjectHasher, LegacyObjectHasher from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory -from orcapod.hashing.semantic_arrow_hasher import SemanticArrowHasher, PathHasher +from orcapod.hashing.arrow_hashers import SemanticArrowHasher +from orcapod.hashing.semantic_type_hashers import PathHasher def get_default_composite_file_hasher(with_cache=True) -> CompositeFileHasher: if with_cache: # use unlimited caching string_cacher = InMemoryCacher(max_size=None) - return PathLikeHasherFactory.create_cached_composite(string_cacher) - return PathLikeHasherFactory.create_basic_composite() + return LegacyPathLikeHasherFactory.create_cached_composite(string_cacher) + return LegacyPathLikeHasherFactory.create_basic_composite() def get_default_composite_file_hasher_with_cacher(cacher=None) -> CompositeFileHasher: if cacher is None: cacher = InMemoryCacher(max_size=None) - return PathLikeHasherFactory.create_cached_composite(cacher) + return LegacyPathLikeHasherFactory.create_cached_composite(cacher) def get_default_object_hasher() -> ObjectHasher: @@ -29,15 +30,26 @@ def get_default_object_hasher() -> ObjectHasher: strategy="signature" ) ) - return LegacyObjectHasher( - char_count=32, function_info_extractor=function_info_extractor + return DefaultObjectHasher(function_info_extractor=function_info_extractor) + + +def get_legacy_object_hasher() -> ObjectHasher: + function_info_extractor = ( + FunctionInfoExtractorFactory.create_function_info_extractor( + strategy="signature" + ) ) + return LegacyObjectHasher(function_info_extractor=function_info_extractor) def get_default_arrow_hasher( - chunk_size: int = 8192, handle_missing: str = "error" + chunk_size: int = 8192, + handle_missing: str = "error", + file_hasher: FileHasher | None = None, ) -> ArrowHasher: + if file_hasher is None: + file_hasher = BasicFileHasher() hasher = SemanticArrowHasher(chunk_size=chunk_size, handle_missing=handle_missing) # register semantic hasher for Path - hasher.register_semantic_hasher("Path", PathHasher()) + hasher.register_semantic_hasher("Path", PathHasher(file_hasher=file_hasher)) return hasher diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index a3f4b39..7e35ccb 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -1,5 +1,31 @@ +from polars import Object from .types import FunctionInfoExtractor, ObjectHasher -from .core import legacy_hash +from .legacy_core import legacy_hash +from .hash_utils import hash_object + + +class DefaultObjectHasher(ObjectHasher): + """ + Default object hasher used throughout the codebase. + """ + + def __init__( + self, + function_info_extractor: FunctionInfoExtractor | None = None, + ): + self.function_info_extractor = function_info_extractor + + def hash(self, obj: object) -> bytes: + """ + Hash an object to a byte representation. + + Args: + obj (object): The object to hash. + + Returns: + bytes: The byte representation of the hash. 
+ """ + return hash_object(obj, function_info_extractor=self.function_info_extractor) class LegacyObjectHasher(ObjectHasher): @@ -13,7 +39,6 @@ class LegacyObjectHasher(ObjectHasher): def __init__( self, - char_count: int | None = 32, function_info_extractor: FunctionInfoExtractor | None = None, ): """ @@ -22,7 +47,6 @@ def __init__( Args: function_info_extractor (FunctionInfoExtractor | None): Optional extractor for function information. This must be provided if an object containing function information is to be hashed. """ - self.char_count = char_count self.function_info_extractor = function_info_extractor def hash(self, obj: object) -> bytes: From 3dcaa0b377f67ab96166b532ea54fc7bbf06f9fe Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 07:46:13 +0000 Subject: [PATCH 023/224] test: update ref --- tests/test_hashing/test_process_structure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_hashing/test_process_structure.py b/tests/test_hashing/test_process_structure.py index 933e2dc..2967ed4 100644 --- a/tests/test_hashing/test_process_structure.py +++ b/tests/test_hashing/test_process_structure.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import Any -from orcapod.hashing.core import HashableMixin, hash_to_hex, process_structure +from orcapod.hashing.legacy_core import HashableMixin, hash_to_hex, process_structure # Define a simple HashableMixin class for testing From 89ddd76bf2cf0cf130a9f633f1523c495da8cc79 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 27 Jun 2025 16:31:08 +0000 Subject: [PATCH 024/224] fix: handle type vars in process_structure --- src/orcapod/hashing/hash_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index df2d435..7fee36b 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -165,6 +165,11 @@ def process_structure( f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" ) + # handle data types + if isinstance(obj, type): + logger.debug(f"Processing class/type: {obj.__name__}") + return f"type:{obj.__class__.__module__}.{obj.__class__.__name__}" + # For other objects, attempt to create deterministic representation only if force_hash=True class_name = obj.__class__.__name__ module_name = obj.__class__.__module__ @@ -204,12 +209,12 @@ def process_structure( # Remove memory addresses which look like '0x7f9a1c2b3d4e' obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) - return f"{module_name}.{class_name}-{obj_repr}" + return f"{module_name}.{class_name}:{obj_repr}" except Exception as e: # Last resort - use class name only logger.warning(f"Failed to process object representation: {e}") try: - return f"Object-{obj.__class__.__module__}.{obj.__class__.__name__}" + return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" except AttributeError: logger.error("Could not determine object class, using UnknownObject") return "UnknownObject" From 905f91524b34e7d1cacfbaf9dbf868e20a1ec685 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Mon, 30 Jun 2025 19:39:50 +0000 Subject: [PATCH 025/224] wip: use new schema system --- src/orcapod/core/base.py | 7 +- src/orcapod/core/operators.py | 18 +- src/orcapod/core/pod.py | 6 +- src/orcapod/core/sources.py | 2 +- src/orcapod/core/streams.py | 10 +- src/orcapod/hashing/__init__.py | 4 +- src/orcapod/hashing/arrow_hashers.py | 77 ++- src/orcapod/hashing/arrow_utils.py | 403 +++++++++++++++ src/orcapod/hashing/defaults.py | 47 +- src/orcapod/hashing/file_hashers.py | 18 +- src/orcapod/hashing/semantic_type_hashers.py | 35 +- src/orcapod/hashing/string_cachers.py | 26 +- src/orcapod/hashing/types.py | 16 +- src/orcapod/hashing/versioned_hashers.py | 71 +++ src/orcapod/pipeline/wrappers.py | 121 ++--- src/orcapod/types/__init__.py | 18 +- src/orcapod/types/core.py | 29 +- src/orcapod/types/packet_converter.py | 177 +++++++ src/orcapod/types/packets.py | 241 +++++++++ src/orcapod/types/registry.py | 437 ---------------- src/orcapod/types/schemas.py | 267 ++++++++++ ...{handlers.py => semantic_type_handlers.py} | 12 +- src/orcapod/types/semantic_type_registry.py | 468 ++++++++++++++++++ .../types/{typespec.py => typespec_utils.py} | 59 ++- src/orcapod/utils/stream_utils.py | 50 -- tests/test_hashing/test_composite_hasher.py | 156 ------ tests/test_store/test_transfer_data_store.py | 1 - .../test_extract_function_data_types.py | 2 +- 28 files changed, 1945 insertions(+), 833 deletions(-) create mode 100644 src/orcapod/hashing/arrow_utils.py create mode 100644 src/orcapod/hashing/versioned_hashers.py create mode 100644 src/orcapod/types/packet_converter.py create mode 100644 src/orcapod/types/packets.py delete mode 100644 src/orcapod/types/registry.py create mode 100644 src/orcapod/types/schemas.py rename src/orcapod/types/{handlers.py => semantic_type_handlers.py} (92%) create mode 100644 src/orcapod/types/semantic_type_registry.py rename src/orcapod/types/{typespec.py => typespec_utils.py} (83%) delete mode 100644 tests/test_hashing/test_composite_hasher.py diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index fc18b48..9a30873 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -10,8 +10,7 @@ from orcapod.hashing import ContentIdentifiableBase from orcapod.types import Packet, Tag, TypeSpec -from orcapod.utils.stream_utils import get_typespec - +from orcapod.types.typespec import get_typespec_from_dict import logging @@ -151,7 +150,7 @@ def types( return None, None tag, packet = next(iter(self(*streams))) - return get_typespec(tag), get_typespec(packet) + return get_typespec_from_dict(tag), get_typespec_from_dict(packet) def claims_unique_tags( self, *streams: "SyncStream", trigger_run: bool = False @@ -391,7 +390,7 @@ def types(self, *, trigger_run=False) -> tuple[TypeSpec | None, TypeSpec | None] # otherwise, use the keys from the first packet in the stream # note that this may be computationally expensive tag, packet = next(iter(self)) - return tag_types or get_typespec(tag), packet_types or get_typespec(packet) + return tag_types or get_typespec_from_dict(tag), packet_types or get_typespec_from_dict(packet) def claims_unique_tags(self, *, trigger_run=False) -> bool | None: """ diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index 53ecacc..c68f34f 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -4,6 +4,7 @@ from typing import Any from orcapod.types import Packet, Tag, TypeSpec +from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs from 
orcapod.hashing import function_content_hash, hash_function from orcapod.core.base import Kernel, SyncStream, Operator from orcapod.core.streams import SyncStreamFromGenerator @@ -11,11 +12,8 @@ batch_packet, batch_tags, check_packet_compatibility, - intersection_typespecs, join_tags, semijoin_tags, - union_typespecs, - intersection_typespecs, fill_missing ) @@ -268,7 +266,7 @@ def generator() -> Iterator[tuple[Tag, Packet]]: raise ValueError( f"Packets are not compatible: {left_packet} and {right_packet}" ) - yield joined_tag, {**left_packet, **right_packet} + yield joined_tag, Packet({**left_packet, **right_packet}) return SyncStreamFromGenerator(generator) @@ -307,7 +305,7 @@ def generator(): ) # match is found - remove the packet from the inner stream inner_stream.pop(idx) - yield joined_tag, {**outer_packet, **inner_packet} + yield joined_tag, Packet({**outer_packet, **inner_packet}) # if enough matches found, move onto the next outer stream packet break @@ -402,11 +400,11 @@ def forward(self, *streams: SyncStream) -> SyncStream: def generator(): for tag, packet in stream: if self.drop_unmapped: - packet = { + packet = Packet({ v: packet[k] for k, v in self.key_map.items() if k in packet - } + }) else: - packet = {self.key_map.get(k, k): v for k, v in packet.items()} + packet = Packet({self.key_map.get(k, k): v for k, v in packet.items()}) yield tag, packet return SyncStreamFromGenerator(generator) @@ -861,9 +859,9 @@ def generator() -> Iterator[tuple[Tag, Packet]]: if k not in new_tag: new_tag[k] = [t.get(k, None) for t, _ in packets] # combine all packets into a single packet - combined_packet: Packet = { + combined_packet: Packet = Packet({ k: [p.get(k, None) for _, p in packets] for k in packet_keys - } + }) yield new_tag, combined_packet return SyncStreamFromGenerator(generator) diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index 77d1610..eb880b4 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -8,8 +8,8 @@ ) from orcapod.types import Packet, Tag, TypeSpec, default_registry -from orcapod.types.typespec import extract_function_typespecs -from orcapod.types.registry import PacketConverter +from orcapod.types.typespec_utils import extract_function_typespecs +from orcapod.types.semantic_type_registry import PacketConverter from orcapod.hashing import ( FunctionInfoExtractor, @@ -258,7 +258,7 @@ def call(self, tag, packet) -> tuple[Tag, Packet | None]: f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" ) - output_packet: Packet = {k: v for k, v in zip(self.output_keys, output_values)} + output_packet: Packet = Packet({k: v for k, v in zip(self.output_keys, output_values)}) return tag, output_packet def identity_structure(self, *streams) -> Any: diff --git a/src/orcapod/core/sources.py b/src/orcapod/core/sources.py index 33df20d..21adae9 100644 --- a/src/orcapod/core/sources.py +++ b/src/orcapod/core/sources.py @@ -78,7 +78,7 @@ def forward(self, *streams: SyncStream) -> SyncStream: def generator() -> Iterator[tuple[Tag, Packet]]: for file in Path(self.file_path).glob(self.pattern): - yield self.tag_function(file), {self.name: str(file)} + yield self.tag_function(file), Packet({self.name: str(file)}) return SyncStreamFromGenerator(generator) diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index c70b009..33f6b78 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -1,7 +1,7 @@ from collections.abc import 
Callable, Collection, Iterator from orcapod.core.base import SyncStream -from orcapod.types import Packet, Tag, TypeSpec +from orcapod.types import Packet, PacketLike, Tag, TypeSpec from copy import copy @@ -9,8 +9,8 @@ class SyncStreamFromLists(SyncStream): def __init__( self, tags: Collection[Tag] | None = None, - packets: Collection[Packet] | None = None, - paired: Collection[tuple[Tag, Packet]] | None = None, + packets: Collection[PacketLike] | None = None, + paired: Collection[tuple[Tag, PacketLike]] | None = None, tag_keys: list[str] | None = None, packet_keys: list[str] | None = None, tag_typespec: TypeSpec | None = None, @@ -33,9 +33,9 @@ def __init__( raise ValueError( "tags and packets must have the same length if both are provided" ) - self.paired = list(zip(tags, packets)) + self.paired = list((t, Packet(v)) for t, v in zip(tags, packets)) elif paired is not None: - self.paired = list(paired) + self.paired = list((t, Packet(v)) for t, v in paired) else: raise ValueError( "Either tags and packets or paired must be provided to SyncStreamFromLists" diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index 2354696..2bdff2b 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -16,7 +16,7 @@ get_default_arrow_hasher, ) from .types import ( - FileHasher, + FileContentHasher, PacketHasher, ArrowHasher, ObjectHasher, @@ -27,7 +27,7 @@ from .content_identifiable import ContentIdentifiableBase __all__ = [ - "FileHasher", + "FileContentHasher", "PacketHasher", "ArrowHasher", "StringCacher", diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 728b904..c50ebfc 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -4,7 +4,33 @@ import pyarrow.ipc as ipc from io import BytesIO import polars as pl -from .types import SemanticTypeHasher +import json +from orcapod.hashing.types import SemanticTypeHasher, StringCacher + + +def serialize_pyarrow_table(table: pa.Table) -> str: + """ + Serialize a PyArrow table to a stable JSON string by converting to dictionary of lists. + + Args: + table: PyArrow table to serialize + + Returns: + JSON string representation with sorted keys and no whitespace + """ + # Convert table to dictionary of lists using to_pylist() + data_dict = {} + + for column_name in table.column_names: + # Convert Arrow column to Python list, which visits all elements + data_dict[column_name] = table.column(column_name).to_pylist() + + # Serialize to JSON with sorted keys and no whitespace + return json.dumps( + data_dict, + separators=(",", ":"), + sort_keys=True, + ) class SemanticArrowHasher: @@ -18,7 +44,14 @@ class SemanticArrowHasher: 4. Computes final hash of the processed packet """ - def __init__(self, chunk_size: int = 8192, handle_missing: str = "error"): + def __init__( + self, + hasher_id: str, + hash_algorithm: str = "sha256", + chunk_size: int = 8192, + handle_missing: str = "error", + semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, + ): """ Initialize SemanticArrowHasher. 
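A rough construction sketch for the reworked hasher, using the constructor and registration hooks shown in the surrounding hunks (the table contents and the "semantic-v0" id are made up; not part of the patch):

    import pyarrow as pa

    from orcapod.hashing.arrow_hashers import SemanticArrowHasher
    from orcapod.hashing.semantic_type_hashers import PathHasher
    from orcapod.hashing.file_hashers import BasicFileHasher

    hasher = SemanticArrowHasher(hasher_id="semantic-v0", hash_algorithm="sha256")
    # columns tagged with semantic_type "Path" are hashed by file content
    hasher.register_semantic_hasher("Path", PathHasher(file_hasher=BasicFileHasher()))

    table = pa.table({"x": [1, 2, 3]})
    print(hasher.hash_table(table))  # "semantic-v0:<hex digest>", given add_prefix=True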
@@ -26,9 +59,28 @@ def __init__(self, chunk_size: int = 8192, handle_missing: str = "error"): chunk_size: Size of chunks to read files in bytes handle_missing: How to handle missing files ('error', 'skip', 'null_hash') """ + self._hasher_id = hasher_id self.chunk_size = chunk_size self.handle_missing = handle_missing - self.semantic_type_hashers: dict[str, SemanticTypeHasher] = {} + self.semantic_type_hashers: dict[str, SemanticTypeHasher] = ( + semantic_type_hashers or {} + ) + self.hash_algorithm = hash_algorithm + + def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: + """ + Add a string cacher for caching hash values. + + This is a no-op for SemanticArrowHasher since it hashes column contents directly. + """ + # SemanticArrowHasher does not use string caching, so this is a no-op + if semantic_type in self.semantic_type_hashers: + self.semantic_type_hashers[semantic_type].set_cacher(cacher) + else: + raise KeyError(f"No hasher registered for semantic type '{semantic_type}'") + + def get_hasher_id(self) -> str: + return self._hasher_id def register_semantic_hasher(self, semantic_type: str, hasher: SemanticTypeHasher): """Register a custom hasher for a semantic type.""" @@ -117,6 +169,7 @@ def _sort_table_columns(self, table: pa.Table) -> pa.Table: return pa.table(sorted_columns, schema=sorted_schema) def _serialize_table_ipc(self, table: pa.Table) -> bytes: + # TODO: fix and use logical table hashing instead """Serialize table using Arrow IPC format for stable binary representation.""" buffer = BytesIO() @@ -126,13 +179,12 @@ def _serialize_table_ipc(self, table: pa.Table) -> bytes: return buffer.getvalue() - def hash_table(self, table: pa.Table, algorithm: str = "sha256") -> str: + def hash_table(self, table: pa.Table, add_prefix: bool = True) -> str: """ Compute stable hash of Arrow table. Args: table: Arrow table to hash - algorithm: Hash algorithm to use ('sha256', 'md5', etc.) Returns: Hex string of the computed hash @@ -152,14 +204,16 @@ def hash_table(self, table: pa.Table, algorithm: str = "sha256") -> str: serialized_bytes = self._serialize_table_ipc(sorted_table) # Step 4: Compute final hash - hasher = hashlib.new(algorithm) + hasher = hashlib.new(self.hash_algorithm) hasher.update(serialized_bytes) - return hasher.hexdigest() + hash_str = hasher.hexdigest() + if add_prefix: + hash_str = f"{self.get_hasher_id()}:{hash_str}" + + return hash_str - def hash_table_with_metadata( - self, table: pa.Table, algorithm: str = "sha256" - ) -> dict[str, Any]: + def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: """ Compute hash with additional metadata about the process. @@ -180,11 +234,10 @@ def hash_table_with_metadata( processed_columns.append(column_info) # Compute hash - table_hash = self.hash_table(table, algorithm) + table_hash = self.hash_table(table) return { "hash": table_hash, - "algorithm": algorithm, "num_rows": len(table), "num_columns": len(table.schema), "processed_columns": processed_columns, diff --git a/src/orcapod/hashing/arrow_utils.py b/src/orcapod/hashing/arrow_utils.py new file mode 100644 index 0000000..168c53f --- /dev/null +++ b/src/orcapod/hashing/arrow_utils.py @@ -0,0 +1,403 @@ +import pyarrow as pa +import json +import hashlib +from typing import Dict, List, Any +from decimal import Decimal +import base64 + + +def serialize_pyarrow_table_schema(table: pa.Table) -> str: + """ + Serialize PyArrow table schema to JSON with Python type names and filtered metadata. 
+ + Args: + table: PyArrow table + + Returns: + JSON string representation of schema + """ + schema_info = [] + + for field in table.schema: + field_info = { + "name": field.name, + "type": _arrow_type_to_python_type(field.type), + "metadata": _extract_semantic_metadata(field.metadata), + } + schema_info.append(field_info) + + return json.dumps(schema_info, separators=(",", ":"), sort_keys=True) + + +def serialize_pyarrow_table(table: pa.Table) -> str: + """ + Serialize a PyArrow table to a stable JSON string with both schema and data. + + Args: + table: PyArrow table to serialize + + Returns: + JSON string representation with schema and data sections + """ + # Convert table to dictionary of lists using to_pylist() + data_dict = {} + + for column_name in table.column_names: + column = table.column(column_name) + # Convert Arrow column to Python list, which visits all elements + column_values = column.to_pylist() + + # Handle special types that need encoding for JSON + data_dict[column_name] = [ + _serialize_value_for_json(val) for val in column_values + ] + + # Serialize schema + schema_info = [] + for field in table.schema: + field_info = { + "name": field.name, + "type": _arrow_type_to_python_type(field.type), + "metadata": _extract_semantic_metadata(field.metadata), + } + schema_info.append(field_info) + + # Combine schema and data + serialized_table = {"schema": schema_info, "data": data_dict} + + # Serialize to JSON with sorted keys and no whitespace + return json.dumps( + serialized_table, + separators=(",", ":"), + sort_keys=True, + default=_json_serializer, + ) + + +def get_pyarrow_table_hash(table: pa.Table) -> str: + """ + Get a stable SHA-256 hash of the table content. + + Args: + table: PyArrow table + + Returns: + SHA-256 hash of the serialized table + """ + serialized = serialize_pyarrow_table(table) + return hashlib.sha256(serialized.encode("utf-8")).hexdigest() + + +def deserialize_to_pyarrow_table(serialized_str: str) -> pa.Table: + """ + Deserialize JSON string back to a PyArrow table. + + Args: + serialized_str: JSON string from serialize_pyarrow_table + + Returns: + Reconstructed PyArrow table + """ + parsed_data = json.loads(serialized_str) + + # Handle both old format (dict of lists) and new format (schema + data) + if "data" in parsed_data and "schema" in parsed_data: + # New format with schema and data + data_dict = parsed_data["data"] + schema_info = parsed_data["schema"] + else: + # Old format - just data dict + data_dict = parsed_data + schema_info = None + + if not data_dict: + return pa.table([]) + + # Deserialize each column + arrays = [] + names = [] + + for column_name in sorted(data_dict.keys()): # Sort for consistency + column_values = [_deserialize_value(val) for val in data_dict[column_name]] + arrays.append(pa.array(column_values)) + names.append(column_name) + + return pa.table(arrays, names=names) + + +def _arrow_type_to_python_type(arrow_type: pa.DataType) -> str: + """ + Convert PyArrow data type to standard Python type name. 
+ + Args: + arrow_type: PyArrow data type + + Returns: + Python type name as string + """ + if pa.types.is_boolean(arrow_type): + return "bool" + elif pa.types.is_integer(arrow_type): + return "int" + elif pa.types.is_floating(arrow_type): + return "float" + elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): + return "str" + elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): + return "bytes" + elif pa.types.is_date(arrow_type): + return "date" + elif pa.types.is_timestamp(arrow_type): + return "datetime" + elif pa.types.is_time(arrow_type): + return "time" + elif pa.types.is_decimal(arrow_type): + return "decimal" + elif pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type): + element_type = _arrow_type_to_python_type(arrow_type.value_type) + return f"list[{element_type}]" + elif pa.types.is_struct(arrow_type): + return "dict" + elif pa.types.is_dictionary(arrow_type): + value_type = _arrow_type_to_python_type(arrow_type.value_type) + return value_type # Dictionary encoding is transparent + elif pa.types.is_null(arrow_type): + return "NoneType" + else: + # Fallback for other types + return str(arrow_type).lower() + + +def _extract_semantic_metadata(field_metadata) -> Dict[str, str]: + """ + Extract only 'semantic_type' metadata from field metadata. + + Args: + field_metadata: PyArrow field metadata (can be None) + + Returns: + Dictionary containing only semantic_type if present, empty dict otherwise + """ + if field_metadata is None: + return {} + + metadata_dict = dict(field_metadata) + + # Only keep semantic_type if it exists + if "semantic_type" in metadata_dict: + return { + "semantic_type": metadata_dict["semantic_type"].decode("utf-8") + if isinstance(metadata_dict["semantic_type"], bytes) + else metadata_dict["semantic_type"] + } + else: + return {} + + +def _serialize_value_for_json(value: Any) -> Any: + """ + Prepare a Python value for JSON serialization. + + Args: + value: Python value from to_pylist() + + Returns: + JSON-serializable value + """ + if value is None: + return None + elif isinstance(value, bytes): + return { + "__type__": "bytes", + "__value__": base64.b64encode(value).decode("ascii"), + } + elif isinstance(value, Decimal): + return {"__type__": "decimal", "__value__": str(value)} + elif hasattr(value, "date") and hasattr(value, "time"): # datetime objects + return {"__type__": "datetime", "__value__": value.isoformat()} + elif hasattr(value, "isoformat") and not hasattr( + value, "time" + ): # date objects (no time component) + return {"__type__": "date", "__value__": value.isoformat()} + elif isinstance(value, (list, tuple)): + return [_serialize_value_for_json(item) for item in value] + elif isinstance(value, dict): + return {k: _serialize_value_for_json(v) for k, v in sorted(value.items())} + else: + return value + + +def _deserialize_value(value: Any) -> Any: + """ + Deserialize a value from the JSON representation. 
+ + Args: + value: Value from JSON + + Returns: + Python value suitable for PyArrow + """ + if value is None: + return None + elif isinstance(value, dict) and "__type__" in value: + type_name = value["__type__"] + val = value["__value__"] + + if type_name == "bytes": + return base64.b64decode(val.encode("ascii")) + elif type_name == "decimal": + return Decimal(val) + elif type_name == "datetime": + from datetime import datetime + + return datetime.fromisoformat(val) + elif type_name == "date": + from datetime import date + + return date.fromisoformat(val) + else: + return val + elif isinstance(value, list): + return [_deserialize_value(item) for item in value] + elif isinstance(value, dict): + return {k: _deserialize_value(v) for k, v in value.items()} + else: + return value + + +def _json_serializer(obj): + """Custom JSON serializer for edge cases.""" + if hasattr(obj, "date") and hasattr(obj, "time"): # datetime objects + return {"__type__": "datetime", "__value__": obj.isoformat()} + elif hasattr(obj, "isoformat") and not hasattr(obj, "time"): # date objects + return {"__type__": "date", "__value__": obj.isoformat()} + elif isinstance(obj, bytes): + return {"__type__": "bytes", "__value__": base64.b64encode(obj).decode("ascii")} + elif isinstance(obj, Decimal): + return {"__type__": "decimal", "__value__": str(obj)} + else: + return str(obj) # Fallback to string representation + + +# Example usage and testing +if __name__ == "__main__": + import datetime + + # Create a sample PyArrow table with various types + data = { + "integers": [1, 2, 3, 4, 5], + "floats": [1.1, 2.2, 3.3, 4.4, 5.5], + "strings": ["a", "b", "c", "d", "e"], + "booleans": [True, False, True, False, True], + "nulls": [1, None, 3, None, 5], + "dates": [ + datetime.date(2023, 1, 1), + datetime.date(2023, 1, 2), + None, + datetime.date(2023, 1, 4), + datetime.date(2023, 1, 5), + ], + } + + table = pa.table(data) + print("Original table:") + print(table) + print() + + # Serialize the table + serialized = serialize_pyarrow_table(table) + print("Serialized JSON (first 200 chars):") + print(serialized[:200] + "..." 
if len(serialized) > 200 else serialized) + print() + + # Get hash + table_hash = get_pyarrow_table_hash(table) + print(f"Table hash: {table_hash}") + print() + + # Test stability + serialized2 = serialize_pyarrow_table(table) + hash2 = get_pyarrow_table_hash(table) + + print(f"Serialization is stable: {serialized == serialized2}") + print(f"Hash is stable: {table_hash == hash2}") + print() + + # Test with different column order + print("--- Testing column order stability ---") + data_reordered = { + "strings": ["a", "b", "c", "d", "e"], + "integers": [1, 2, 3, 4, 5], + "nulls": [1, None, 3, None, 5], + "floats": [1.1, 2.2, 3.3, 4.4, 5.5], + "booleans": [True, False, True, False, True], + "dates": [ + datetime.date(2023, 1, 1), + datetime.date(2023, 1, 2), + None, + datetime.date(2023, 1, 4), + datetime.date(2023, 1, 5), + ], + } + + table_reordered = pa.table(data_reordered) + serialized_reordered = serialize_pyarrow_table(table_reordered) + hash_reordered = get_pyarrow_table_hash(table_reordered) + + print( + f"Same content, different column order produces same serialization: {serialized == serialized_reordered}" + ) + print( + f"Same content, different column order produces same hash: {table_hash == hash_reordered}" + ) + print() + + # Test schema serialization + print("\n--- Testing schema serialization ---") + + # Create table with metadata + schema = pa.schema( + [ + pa.field( + "integers", + pa.int64(), + metadata={"semantic_type": "id", "other_meta": "ignored"}, + ), + pa.field("floats", pa.float64(), metadata={"semantic_type": "measurement"}), + pa.field("strings", pa.string()), # No metadata + pa.field( + "booleans", pa.bool_(), metadata={"other_meta": "ignored"} + ), # No semantic_type + pa.field("dates", pa.date32(), metadata={"semantic_type": "event_date"}), + ] + ) + + table_with_schema = pa.table(data, schema=schema) + schema_json = serialize_pyarrow_table_schema(table_with_schema) + print(f"Schema JSON: {schema_json}") + + # Parse and display nicely + import json as json_module + + schema_parsed = json_module.loads(schema_json) + print("\nParsed schema:") + for field in schema_parsed: + print(f" {field['name']}: {field['type']} (metadata: {field['metadata']})") + + # Test deserialization + reconstructed = deserialize_to_pyarrow_table(serialized) + print("Reconstructed table:") + print(reconstructed) + print() + + # Verify round-trip + reconstructed_hash = get_pyarrow_table_hash(reconstructed) + print(f"Round-trip hash matches: {table_hash == reconstructed_hash}") + + # Show actual JSON structure for small example + print("\n--- Small example JSON structure ---") + small_table = pa.table( + {"numbers": [1, 2, None], "text": ["hello", "world", "test"]} + ) + small_json = serialize_pyarrow_table(small_table) + print(f"Small table JSON: {small_json}") diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 1f3aca2..61539b5 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -1,6 +1,11 @@ # A collection of utility function that provides a "default" implementation of hashers. # This is often used as the fallback hasher in the library code. 
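# A usage sketch (illustrative, not part of this module) for the reworked
# get_default_arrow_hasher introduced in the hunk below, assuming only the
# names added in this patch; the table contents are made up.
import pyarrow as pa

from orcapod.hashing.defaults import get_default_arrow_hasher
from orcapod.hashing.string_cachers import InMemoryCacher

table = pa.table({"x": [1, 2, 3]})

# Default: file hashes for "path" semantic columns are cached in an
# unbounded in-memory cacher.
hasher = get_default_arrow_hasher()
digest = hasher.hash_table(table)

# A StringCacher instance can be passed in place of the boolean flag,
# e.g. to share a single cache across several hashers.
shared_cache = InMemoryCacher(max_size=None)
other_hasher = get_default_arrow_hasher(cache_file_hash=shared_cache)
# Both hashers are built from the same versioned configuration, so the
# digests are expected to agree.
assert other_hasher.hash_table(table) == digest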
-from orcapod.hashing.types import CompositeFileHasher, ArrowHasher, FileHasher +from orcapod.hashing.types import ( + CompositeFileHasher, + ArrowHasher, + FileContentHasher, + StringCacher, +) from orcapod.hashing.file_hashers import BasicFileHasher, LegacyPathLikeHasherFactory from orcapod.hashing.string_cachers import InMemoryCacher from orcapod.hashing.object_hashers import ObjectHasher @@ -8,20 +13,41 @@ from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory from orcapod.hashing.arrow_hashers import SemanticArrowHasher from orcapod.hashing.semantic_type_hashers import PathHasher +from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher + + +def get_default_arrow_hasher( + cache_file_hash: bool | StringCacher = True, +) -> ArrowHasher: + """ + Get the default Arrow hasher with semantic type support. + If `with_cache` is True, it uses an in-memory cacher for caching hash values. + """ + arrow_hasher = get_versioned_semantic_arrow_hasher() + if cache_file_hash: + # use unlimited caching + if isinstance(cache_file_hash, StringCacher): + string_cacher = cache_file_hash + else: + string_cacher = InMemoryCacher(max_size=None) + + arrow_hasher.set_cacher("path", string_cacher) + + return arrow_hasher def get_default_composite_file_hasher(with_cache=True) -> CompositeFileHasher: if with_cache: # use unlimited caching string_cacher = InMemoryCacher(max_size=None) - return LegacyPathLikeHasherFactory.create_cached_composite(string_cacher) - return LegacyPathLikeHasherFactory.create_basic_composite() + return LegacyPathLikeHasherFactory.create_cached_legacy_composite(string_cacher) + return LegacyPathLikeHasherFactory.create_basic_legacy_composite() def get_default_composite_file_hasher_with_cacher(cacher=None) -> CompositeFileHasher: if cacher is None: cacher = InMemoryCacher(max_size=None) - return LegacyPathLikeHasherFactory.create_cached_composite(cacher) + return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) def get_default_object_hasher() -> ObjectHasher: @@ -40,16 +66,3 @@ def get_legacy_object_hasher() -> ObjectHasher: ) ) return LegacyObjectHasher(function_info_extractor=function_info_extractor) - - -def get_default_arrow_hasher( - chunk_size: int = 8192, - handle_missing: str = "error", - file_hasher: FileHasher | None = None, -) -> ArrowHasher: - if file_hasher is None: - file_hasher = BasicFileHasher() - hasher = SemanticArrowHasher(chunk_size=chunk_size, handle_missing=handle_missing) - # register semantic hasher for Path - hasher.register_semantic_hasher("Path", PathHasher(file_hasher=file_hasher)) - return hasher diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index 58076ac..cd12e80 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,7 +1,7 @@ from orcapod.hashing import legacy_core from orcapod.hashing.hash_utils import hash_file from orcapod.hashing.types import ( - FileHasher, + FileContentHasher, PathSetHasher, StringCacher, CompositeFileHasher, @@ -31,7 +31,7 @@ class CachedFileHasher: def __init__( self, - file_hasher: FileHasher, + file_hasher: FileContentHasher, string_cacher: StringCacher, ): self.file_hasher = file_hasher @@ -73,7 +73,7 @@ class LegacyPathsetHasher: def __init__( self, - file_hasher: FileHasher, + file_hasher: FileContentHasher, char_count: int | None = 32, ): self.file_hasher = file_hasher @@ -126,7 +126,7 @@ class LegacyCompositeFileHasher: def __init__( self, - file_hasher: FileHasher, + 
file_hasher: FileContentHasher, char_count: int | None = 32, packet_prefix: str = "", ): @@ -151,27 +151,27 @@ class LegacyPathLikeHasherFactory: """Factory for creating various hasher combinations.""" @staticmethod - def create_basic_composite( + def create_basic_legacy_composite( algorithm: str = "sha256", buffer_size: int = 65536, char_count: int | None = 32, ) -> CompositeFileHasher: """Create a basic composite hasher.""" - file_hasher = BasicFileHasher(algorithm, buffer_size) + file_hasher = LegacyFileHasher(algorithm, buffer_size) # use algorithm as the prefix for the packet hasher return LegacyCompositeFileHasher( file_hasher, char_count, packet_prefix=algorithm ) @staticmethod - def create_cached_composite( + def create_cached_legacy_composite( string_cacher: StringCacher, algorithm: str = "sha256", buffer_size: int = 65536, char_count: int | None = 32, ) -> CompositeFileHasher: """Create a composite hasher with file caching.""" - basic_file_hasher = BasicFileHasher(algorithm, buffer_size) + basic_file_hasher = LegacyFileHasher(algorithm, buffer_size) cached_file_hasher = CachedFileHasher(basic_file_hasher, string_cacher) return LegacyCompositeFileHasher( cached_file_hasher, char_count, packet_prefix=algorithm @@ -182,7 +182,7 @@ def create_file_hasher( string_cacher: StringCacher | None = None, algorithm: str = "sha256", buffer_size: int = 65536, - ) -> FileHasher: + ) -> FileContentHasher: """Create just a file hasher, optionally with caching.""" basic_hasher = BasicFileHasher(algorithm, buffer_size) if string_cacher is None: diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py index 36dfd53..5be28b0 100644 --- a/src/orcapod/hashing/semantic_type_hashers.py +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -1,4 +1,4 @@ -from .types import SemanticTypeHasher, FileHasher +from orcapod.hashing.types import SemanticTypeHasher, FileContentHasher, StringCacher import os import hashlib import pyarrow as pa @@ -7,7 +7,13 @@ class PathHasher(SemanticTypeHasher): """Hasher for Path semantic type columns - hashes file contents.""" - def __init__(self, file_hasher: FileHasher, handle_missing: str = "error"): + def __init__( + self, + file_hasher: FileContentHasher, + handle_missing: str = "error", + string_cacher: StringCacher | None = None, + cache_key_prefix: str = "path_hasher", + ): """ Initialize PathHasher. 
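# A minimal sketch of the caching convention added to PathHasher in the hunks
# here: computed file-content hashes are stored in a StringCacher under the
# key "<cache_key_prefix>:<file_path>", so hashing the same path twice becomes
# a cache lookup. DictCacher below is a stand-in exposing only the
# get_cached/set_cached calls PathHasher makes, not the full StringCacher.
class DictCacher:
    def __init__(self) -> None:
        self._data: dict[str, str] = {}

    def get_cached(self, cache_key: str) -> str | None:
        return self._data.get(cache_key)

    def set_cached(self, cache_key: str, value: str) -> None:
        self._data[cache_key] = value


cacher = DictCacher()
cache_key = "path_hasher:/data/s01.png"  # f"{cache_key_prefix}:{file_path}"
if cacher.get_cached(cache_key) is None:
    cacher.set_cached(cache_key, "deadbeef")  # computed hex digest would go here
assert cacher.get_cached(cache_key) == "deadbeef"  # second lookup avoids re-hashing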
@@ -17,11 +23,20 @@ def __init__(self, file_hasher: FileHasher, handle_missing: str = "error"): """ self.file_hasher = file_hasher self.handle_missing = handle_missing + self.cacher = string_cacher + self.cache_key_prefix = cache_key_prefix def _hash_file_content(self, file_path: str) -> str: """Hash the content of a single file and return hex string.""" import os + # if cacher exists, check if the hash is cached + if self.cacher: + cache_key = f"{self.cache_key_prefix}:{file_path}" + cached_hash = self.cacher.get_cached(cache_key) + if cached_hash is not None: + return cached_hash + try: if not os.path.exists(file_path): if self.handle_missing == "error": @@ -31,7 +46,13 @@ def _hash_file_content(self, file_path: str) -> str: elif self.handle_missing == "null_hash": return hashlib.sha256(b"").hexdigest() - return self.file_hasher.hash_file(file_path).hex() + hashed_value = self.file_hasher.hash_file(file_path).hex() + if self.cacher: + # Cache the computed hash + self.cacher.set_cached( + f"{self.cache_key_prefix}:{file_path}", hashed_value + ) + return hashed_value except (IOError, OSError, PermissionError) as e: if self.handle_missing == "error": @@ -62,3 +83,11 @@ def hash_column(self, column: pa.Array) -> pa.Array: # Return new array with content hashes instead of paths return pa.array(content_hashes) + + def set_cacher(self, cacher: StringCacher) -> None: + """ + Add a string cacher for caching hash values. + This is a no-op for PathHasher since it hashes file contents directly. + """ + # PathHasher does not use string caching, so this is a no-op + self.cacher = cacher diff --git a/src/orcapod/hashing/string_cachers.py b/src/orcapod/hashing/string_cachers.py index 9b2244a..620dece 100644 --- a/src/orcapod/hashing/string_cachers.py +++ b/src/orcapod/hashing/string_cachers.py @@ -13,10 +13,12 @@ if TYPE_CHECKING: import redis + def _get_redis(): """Lazy import for Redis to avoid circular dependencies.""" try: import redis + return redis except ImportError as e: return None @@ -615,7 +617,9 @@ def __init__( # TODO: cleanup the redis use pattern self._redis_module = _get_redis() if self._redis_module is None: - raise ImportError("Could not import Redis module. redis package is required for RedisCacher") + raise ImportError( + "Could not import Redis module. 
redis package is required for RedisCacher" + ) self.key_prefix = key_prefix self._connection_failed = False self._lock = threading.RLock() @@ -658,7 +662,10 @@ def _test_connection(self) -> None: f"Redis connection established successfully with prefix '{self.key_prefix}'" ) - except (self._redis_module.RedisError, self._redis_module.ConnectionError) as e: + except ( + self._redis_module.RedisError, + self._redis_module.ConnectionError, + ) as e: logging.error(f"Failed to establish Redis connection: {e}") raise RuntimeError(f"Redis connection test failed: {e}") @@ -690,7 +697,10 @@ def get_cached(self, cache_key: str) -> str | None: return str(result) - except (self._redis_module.RedisError, self._redis_module.ConnectionError) as e: + except ( + self._redis_module.RedisError, + self._redis_module.ConnectionError, + ) as e: self._handle_redis_error("get", e) return None @@ -708,7 +718,10 @@ def set_cached(self, cache_key: str, value: str) -> None: self.redis.set(self._get_prefixed_key(cache_key), value) - except (self._redis_module.RedisError, self._redis_module.ConnectionError) as e: + except ( + self._redis_module.RedisError, + self._redis_module.ConnectionError, + ) as e: self._handle_redis_error("set", e) def clear_cache(self) -> None: @@ -722,7 +735,10 @@ def clear_cache(self) -> None: if keys: self.redis.delete(*list(keys)) # type: ignore[arg-type] - except (self._redis_module.RedisError, self._redis_module.ConnectionError) as e: + except ( + self._redis_module.RedisError, + self._redis_module.ConnectionError, + ) as e: self._handle_redis_error("clear", e) def is_connected(self) -> bool: diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index 310b5a2..10ed267 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -79,7 +79,7 @@ def hash_to_uuid( @runtime_checkable -class FileHasher(Protocol): +class FileContentHasher(Protocol): """Protocol for file-related hashing.""" def hash_file(self, file_path: PathLike) -> bytes: ... @@ -104,7 +104,7 @@ def hash_packet(self, packet: Packet) -> str: ... class ArrowHasher(Protocol): """Protocol for hashing arrow packets.""" - def hash_table(self, table: pa.Table) -> str: ... + def hash_table(self, table: pa.Table, add_prefix: bool = True) -> str: ... @runtime_checkable @@ -118,7 +118,7 @@ def clear_cache(self) -> None: ... # Combined interface for convenience (optional) @runtime_checkable -class CompositeFileHasher(FileHasher, PathSetHasher, PacketHasher, Protocol): +class CompositeFileHasher(FileContentHasher, PathSetHasher, PacketHasher, Protocol): """Combined interface for all file-related hashing operations.""" pass @@ -142,6 +142,14 @@ class SemanticTypeHasher(Protocol): """Abstract base class for semantic type-specific hashers.""" @abstractmethod - def hash_column(self, column: pa.Array) -> list[bytes]: + def hash_column( + self, + column: pa.Array, + ) -> pa.Array: """Hash a column with this semantic type and return the hash bytes.""" pass + + @abstractmethod + def set_cacher(self, cacher: StringCacher) -> None: + """Add a string cacher for caching hash values.""" + pass diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py new file mode 100644 index 0000000..22c715e --- /dev/null +++ b/src/orcapod/hashing/versioned_hashers.py @@ -0,0 +1,71 @@ +# A collection of versioned hashers that provide a "default" implementation of hashers. 
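# A usage sketch (illustrative, not part of this module) of the object-spec
# convention resolved by parse_objectspec below: a dict carrying a "_class"
# key is instantiated from its dotted import path, and its "config" dict is
# resolved recursively first, so specs can nest. The spec literal here is an
# example, not an entry of the shipped version table.
from orcapod.hashing.versioned_hashers import parse_objectspec

example_spec = {
    "_class": "orcapod.hashing.file_hashers.BasicFileHasher",
    "config": {"algorithm": "sha256"},
}
file_hasher = parse_objectspec(example_spec)
# behaves like: BasicFileHasher(algorithm="sha256")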
+from .arrow_hashers import SemanticArrowHasher +import importlib +from typing import Any + +CURRENT_VERSION = "v0.1" + +versioned_hashers = { + "v0.1": { + "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", + "config": { + "hasher_id": "default_v0.1", + "hash_algorithm": "sha256", + "chunk_size": 8192, + "semantic_type_hashers": { + "path": { + "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", + "config": { + "file_hasher": { + "_class": "orcapod.hashing.file_hashers.BasicFileHasher", + "config": { + "algorithm": "sha256", + }, + } + }, + } + }, + }, + } +} + + +def parse_objectspec(obj_spec: dict) -> Any: + if "_class" in obj_spec: + # if _class is specified, treat the dict as an object specification + module_name, class_name = obj_spec["_class"].rsplit(".", 1) + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + configs = parse_objectspec(obj_spec.get("config", {})) + return cls(**configs) + else: + # otherwise, parse through the dictionary recursively + parsed_object = obj_spec + for k, v in obj_spec.items(): + if isinstance(v, dict): + parsed_object[k] = parse_objectspec(v) + else: + parsed_object[k] = v + return parsed_object + + +def get_versioned_semantic_arrow_hasher( + version: str | None = None, +) -> SemanticArrowHasher: + """ + Get the versioned hasher for the specified version. + + Args: + version (str): The version of the hasher to retrieve. + + Returns: + SemanticArrowHasher: An instance of the hasher for the specified version. + """ + if version is None: + version = CURRENT_VERSION + + if version not in versioned_hashers: + raise ValueError(f"Unsupported hasher version: {version}") + + hasher_spec = versioned_hashers[version] + return parse_objectspec(hasher_spec) diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py index e953f1f..4396223 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/wrappers.py @@ -2,69 +2,23 @@ from orcapod.core import SyncStream, Source, Kernel from orcapod.store import ArrowDataStore from orcapod.types import Tag, Packet, TypeSpec, default_registry -from orcapod.types.typespec import extract_function_typespecs +from orcapod.types.typespec_utils import get_typespec_from_dict, union_typespecs, extract_function_typespecs +from orcapod.types.semantic_type_registry import create_arrow_table_with_meta from orcapod.hashing import ObjectHasher, ArrowHasher from orcapod.hashing.defaults import get_default_object_hasher, get_default_arrow_hasher from typing import Any, Literal from collections.abc import Collection, Iterator -from orcapod.types.registry import TypeRegistry, PacketConverter +from orcapod.types.semantic_type_registry import TypeRegistry +from orcapod.types.packet_converter import PacketConverter import pyarrow as pa import polars as pl from orcapod.core.streams import SyncStreamFromGenerator -from orcapod.utils.stream_utils import get_typespec, union_typespecs import logging logger = logging.getLogger(__name__) -def tag_to_arrow_table_with_metadata(tag, metadata: dict | None = None): - """ - Convert a tag dictionary to PyArrow table with metadata on each column. 
- - Args: - tag: Dictionary with string keys and any Python data type values - metadata_key: The metadata key to add to each column - metadata_value: The metadata value to indicate this column came from tag - """ - if metadata is None: - metadata = {} - - # First create the table to infer types - temp_table = pa.Table.from_pylist([tag]) - - # Create new fields with metadata - fields_with_metadata = [] - for field in temp_table.schema: - # Add metadata to each field - field_metadata = metadata - new_field = pa.field( - field.name, field.type, nullable=field.nullable, metadata=field_metadata - ) - fields_with_metadata.append(new_field) - - # Create schema with metadata - schema_with_metadata = pa.schema(fields_with_metadata) - - # Create the final table with the metadata-enriched schema - table = pa.Table.from_pylist([tag], schema=schema_with_metadata) - - return table - - -def get_columns_with_metadata( - df: pl.DataFrame, key: str, value: str | None = None -) -> list[str]: - """Get column names with specific metadata using list comprehension. If value is given, only - columns matching that specific value for the desginated metadata key will be returned. - Otherwise, all columns that contains the key as metadata will be returned regardless of the value""" - return [ - col_name - for col_name, dtype in df.schema.items() - if hasattr(dtype, "metadata") - and (value is None or getattr(dtype, "metadata") == value) - ] - class PolarsSource(Source): def __init__(self, df: pl.DataFrame, tag_keys: Collection[str] | None = None): @@ -81,18 +35,15 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: class PolarsStream(SyncStream): - def __init__(self, df: pl.DataFrame, tag_keys: Collection[str] | None = None): + def __init__(self, df: pl.DataFrame, tag_keys: Collection[str]): self.df = df - if tag_keys is None: - # extract tag_keys by picking columns with metadata source=tag - tag_keys = get_columns_with_metadata(df, "source", "tag") self.tag_keys = tag_keys def __iter__(self) -> Iterator[tuple[Tag, Packet]]: for row in self.df.iter_rows(named=True): tag = {key: row[key] for key in self.tag_keys} packet = {key: val for key, val in row.items() if key not in self.tag_keys} - yield tag, packet + yield tag, Packet(packet) class EmptyStream(SyncStream): @@ -266,26 +217,44 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: output_stream = self.kernel.forward(*resolved_streams, **kwargs) - tag_type, packet_type = output_stream.types(trigger_run=False) - if tag_type is not None and packet_type is not None: - joined_type = union_typespecs(tag_type, packet_type) + tag_typespec, packet_typespec = output_stream.types(trigger_run=False) + if tag_typespec is not None and packet_typespec is not None: + joined_type = union_typespecs(tag_typespec, packet_typespec) assert joined_type is not None, "Joined typespec should not be None" - self.output_converter = PacketConverter(joined_type, registry=self.registry) + all_type = dict(joined_type) + for k in packet_typespec: + all_type[f'_source_{k}'] = str + # + self.output_converter = PacketConverter(all_type, registry=self.registry) # Cache the output stream of the underlying kernel - # This is a no-op if the output stream is already cached + # If an entry with same tag and packet already exists in the output store, + # it will not be added again, thus avoiding duplicates. 
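# Sketch of the record layout built inside the generator below (illustrative
# values; the real code converts merged_info to an Arrow table via the
# packet converter and uses arrow_hasher.hash_table(...) as the record id):
tag = {"subject": "s01"}
packet = {"image": "/data/s01.png"}
source_info = {"image": None}  # packet.source_info; None when provenance is unknown

merged_info = {**tag, **packet}
for k, v in source_info.items():
    merged_info[f"_source_{k}"] = v  # provenance columns travel with the data
# merged_info == {"subject": "s01", "image": "/data/s01.png", "_source_image": None}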
def generator() -> Iterator[tuple[Tag, Packet]]: logger.info(f"Computing and caching outputs for {self}") for tag, packet in output_stream: merged_info = {**tag, **packet} + # add entries for source_info + for k, v in packet.source_info.items(): + merged_info[f'_source_{k}'] = v + if self.output_converter is None: - joined_type = get_typespec(merged_info) + # TODO: cleanup logic here + joined_type = get_typespec_from_dict(merged_info) assert joined_type is not None, "Joined typespec should not be None" + all_type = dict(joined_type) + for k in packet: + all_type[f'_source_{k}'] = str self.output_converter = PacketConverter( - joined_type, registry=self.registry + all_type, registry=self.registry ) + # add entries for source_info + for k, v in packet.source_info.items(): + merged_info[f'_source_{k}'] = v + output_table = self.output_converter.to_arrow_table(merged_info) + # TODO: revisit this logic output_id = self.arrow_hasher.hash_table(output_table) if not self.output_store.get_record(*self.source_info, output_id): self.output_store.add_record( @@ -463,7 +432,6 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: return super().forward(*streams, **kwargs) def get_packet_key(self, packet: Packet) -> str: - # TODO: reconsider the logic around input/output converter -- who should own this? return self.arrow_hasher.hash_table(self.input_converter.to_arrow_table(packet)) @property @@ -473,23 +441,25 @@ def source_info(self): def is_memoized(self, packet: Packet) -> bool: return self.retrieve_memoized(packet) is not None - def add_tag_record(self, tag: Tag, packet: Packet) -> Tag: + def add_pipeline_record(self, tag: Tag, packet: Packet) -> Tag: """ Record the tag for the packet in the record store. This is used to keep track of the tags associated with memoized packets. """ - return self._add_tag_record_with_packet_key(tag, self.get_packet_key(packet)) + return self._add_pipeline_record_with_packet_key(tag, self.get_packet_key(packet), packet.source_info) - def _add_tag_record_with_packet_key(self, tag: Tag, packet_key: str) -> Tag: + def _add_pipeline_record_with_packet_key(self, tag: Tag, packet_key: str, packet_source_info: dict[str, str | None]) -> Tag: if self.tag_store is None: raise ValueError("Recording of tag requires tag_store but none provided") - tag = dict(tag) # ensure we don't modify the original tag - tag["__packet_key"] = packet_key + combined_info = dict(tag) # ensure we don't modify the original tag + combined_info["__packet_key"] = packet_key + for k, v in packet_source_info.items(): + combined_info[f'__{k}_source'] = v # TODO: consider making this more efficient # convert tag to arrow table - columns are labeled with metadata source=tag - table = tag_to_arrow_table_with_metadata(tag, {"source": "tag"}) + table = create_arrow_table_with_meta(combined_info, {"source": "tag"}) entry_hash = self.arrow_hasher.hash_table(table) @@ -553,8 +523,7 @@ def _memoize_with_packet_key( # consider simpler alternative packets = self.output_converter.from_arrow_table( self.output_store.add_record( - self.function_pod.function_name, - self.function_pod_hash, + *self.source_info, packet_key, self.output_converter.to_arrow_table(output_packet), ) @@ -563,7 +532,13 @@ def _memoize_with_packet_key( assert len(packets) == 1, ( f"Memoizing single packet returned {len(packets)} packets!" 
) - return packets[0] + packet = packets[0] + # TODO: reconsider the right place to attach this information + # attach provenance information + packet_source_id = ":".join(self.source_info + (packet_key,)) + source_info = {k: f'{packet_source_id}:{k}' for k in packet} + return Packet(packet, source_info=source_info) + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: packet_key = "" @@ -603,7 +578,7 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: # result was successfully computed -- save the tag if not self.skip_tag_record and self.tag_store is not None: - self._add_tag_record_with_packet_key(tag, packet_key) + self._add_pipeline_record_with_packet_key(tag, packet_key, packet.source_info) return tag, output_packet diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index e51a6f8..a4615f5 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,12 +1,13 @@ -from .core import Tag, Packet, TypeSpec, PathLike, PathSet, PodFunction -from .registry import TypeRegistry -from .handlers import PathHandler, UUIDHandler, DateTimeHandler -from . import handlers -from . import typespec +from .core import Tag, PathLike, PathSet, PodFunction, TypeSpec +from .packets import Packet +from .semantic_type_registry import SemanticTypeRegistry +from .semantic_type_handlers import PathHandler, UUIDHandler, DateTimeHandler +from . import semantic_type_handlers +from . import typespec_utils # Create default registry and register handlers -default_registry = TypeRegistry() +default_registry = SemanticTypeRegistry() # Register with semantic names - registry extracts supported types automatically default_registry.register("path", PathHandler()) @@ -19,10 +20,11 @@ "default_registry", "Tag", "Packet", + "PacketLike" "TypeSpec", "PathLike", "PathSet", "PodFunction", - "handlers", - "typespec", + "semantic_type_handlers", + "typespec_utils", ] diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 097750e..dd02141 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -1,19 +1,10 @@ -from typing import Protocol, Any, TypeAlias +from typing import Protocol, Any, TypeAlias, TypeVar, Generic import pyarrow as pa from dataclasses import dataclass import os from collections.abc import Collection, Mapping -# TODO: reconsider the need for this dataclass as its information is superfluous -# to the registration of the handler into the registry. 
-@dataclass -class TypeInfo: - python_type: type - arrow_type: pa.DataType - semantic_type: str | None # name under which the type is registered - handler: "TypeHandler" - DataType: TypeAlias = type @@ -22,8 +13,6 @@ class TypeInfo: ] # Mapping of parameter names to their types -SUPPORTED_PYTHON_TYPES = (str, int, float, bool, bytes) - # Convenience alias for anything pathlike PathLike = str | os.PathLike @@ -45,14 +34,8 @@ class TypeInfo: # Extended data values that can be stored in packets # Either the original PathSet or one of our supported simple data types -DataValue: TypeAlias = PathSet | SupportedNativePythonData | Collection["DataValue"] - - -# a packet is a mapping from string keys to data values -Packet: TypeAlias = Mapping[str, DataValue] +DataValue: TypeAlias = PathSet | SupportedNativePythonData | None | Collection["DataValue"] -# a batch is a tuple of a tag and a list of packets -Batch: TypeAlias = tuple[Tag, Collection[Packet]] class PodFunction(Protocol): @@ -68,7 +51,7 @@ def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: . class TypeHandler(Protocol): - """Protocol for handling conversion between Python types and underlying Arrow + """Protocol for handling conversion between Python type and Arrow data types used for storage. The handler itself IS the definition of a semantic type. The semantic type @@ -78,11 +61,11 @@ class TypeHandler(Protocol): and focus purely on conversion logic. """ - def python_types(self) -> type | tuple[type, ...]: + def python_type(self) -> type: """Return the Python type(s) this handler can process. Returns: - Single Type or tuple of Types this handler supports + Python type the handler supports Examples: - PathHandler: return Path @@ -91,7 +74,7 @@ def python_types(self) -> type | tuple[type, ...]: """ ... - def storage_type(self) -> pa.DataType: + def storage_type(self) -> type: """Return the Arrow DataType instance for schema definition.""" ... 
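# A minimal handler sketch against the updated TypeHandler protocol above,
# assuming the python_to_storage/storage_to_python conversion methods that the
# packet converters later in this patch call on handlers. Illustrative only,
# not the shipped PathHandler.
import pathlib


class ExamplePathHandler:
    def python_type(self) -> type:
        return pathlib.Path

    def storage_type(self) -> type:
        # paths are stored as plain strings
        return str

    def python_to_storage(self, value: pathlib.Path) -> str:
        return str(value)

    def storage_to_python(self, value: str) -> pathlib.Path:
        return pathlib.Path(value)


# round trip
handler = ExamplePathHandler()
original = pathlib.Path("/tmp/example.txt")
assert handler.storage_to_python(handler.python_to_storage(original)) == original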
diff --git a/src/orcapod/types/packet_converter.py b/src/orcapod/types/packet_converter.py new file mode 100644 index 0000000..0a8389d --- /dev/null +++ b/src/orcapod/types/packet_converter.py @@ -0,0 +1,177 @@ +from orcapod.types.core import TypeSpec, TypeHandler +from orcapod.types.packets import Packet, PacketLike +from orcapod.types.semantic_type_registry import SemanticTypeRegistry, TypeInfo, get_metadata_from_schema, arrow_to_dicts +from typing import Any +from collections.abc import Mapping, Sequence +import pyarrow as pa +import logging + +logger = logging.getLogger(__name__) + + +def is_packet_supported( + python_type_info: TypeSpec, registry: SemanticTypeRegistry, type_lut: dict | None = None +) -> bool: + """Check if all types in the packet are supported by the registry or known to the default lut.""" + if type_lut is None: + type_lut = {} + return all( + python_type in registry or python_type in type_lut + for python_type in python_type_info.values() + ) + + + +class PacketConverter: + def __init__(self, python_type_spec: TypeSpec, registry: SemanticTypeRegistry): + self.python_type_spec = python_type_spec + self.registry = registry + + # Lookup handlers and type info for fast access + self.handlers: dict[str, TypeHandler] = {} + self.storage_type_info: dict[str, TypeInfo] = {} + + self.expected_key_set = set(python_type_spec.keys()) + + # prepare the corresponding arrow table schema with metadata + self.keys_with_handlers, self.schema = create_schema_from_python_type_info( + python_type_spec, registry + ) + + self.semantic_type_lut = get_metadata_from_schema(self.schema, b"semantic_type") + + def _check_key_consistency(self, keys): + """Check if the provided keys match the expected keys.""" + keys_set = set(keys) + if keys_set != self.expected_key_set: + missing_keys = self.expected_key_set - keys_set + extra_keys = keys_set - self.expected_key_set + error_parts = [] + if missing_keys: + error_parts.append(f"Missing keys: {missing_keys}") + if extra_keys: + error_parts.append(f"Extra keys: {extra_keys}") + + raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") + + def _to_storage_packet(self, packet: PacketLike) -> dict[str, Any]: + """Convert packet to storage representation. + + Args: + packet: Dictionary mapping parameter names to Python values + + Returns: + Dictionary with same keys but values converted to storage format + + Raises: + KeyError: If packet keys don't match the expected type_info keys + TypeError: If value type doesn't match expected type + ValueError: If conversion fails + """ + # Validate packet keys + packet_keys = set(packet.keys()) + + self._check_key_consistency(packet_keys) + + # Convert each value + storage_packet: dict[str, Any] = dict(packet) # Start with a copy of the packet + + for key, handler in self.keys_with_handlers: + try: + storage_packet[key] = handler.python_to_storage(storage_packet[key]) + except Exception as e: + raise ValueError(f"Failed to convert value for '{key}': {e}") from e + + return storage_packet + + def _from_storage_packet(self, storage_packet: Mapping[str, Any]) -> PacketLike: + """Convert storage packet back to Python packet. 
+ + Args: + storage_packet: Dictionary with values in storage format + + Returns: + Packet with values converted back to Python types + + Raises: + KeyError: If storage packet keys don't match the expected type_info keys + TypeError: If value type doesn't match expected type + ValueError: If conversion fails + """ + # Validate storage packet keys + storage_keys = set(storage_packet.keys()) + + self._check_key_consistency(storage_keys) + + # Convert each value back to Python type + packet: PacketLike = dict(storage_packet) + + for key, handler in self.keys_with_handlers: + try: + packet[key] = handler.storage_to_python(storage_packet[key]) + except Exception as e: + raise ValueError(f"Failed to convert value for '{key}': {e}") from e + + return packet + + def to_arrow_table(self, packet: PacketLike | Sequence[PacketLike]) -> pa.Table: + """Convert packet to PyArrow Table with field metadata. + + Args: + packet: Dictionary mapping parameter names to Python values + + Returns: + PyArrow Table with the packet data as a single row + """ + # Convert packet to storage format + if not isinstance(packet, Sequence): + packets = [packet] + else: + packets = packet + + storage_packets = [self._to_storage_packet(p) for p in packets] + + # Create arrays + arrays = [] + for field in self.schema: + values = [p[field.name] for p in storage_packets] + array = pa.array(values, type=field.type) + arrays.append(array) + + return pa.Table.from_arrays(arrays, schema=self.schema) + + def from_arrow_table( + self, table: pa.Table, verify_semantic_equivalence: bool = True + ) -> list[Packet]: + """Convert Arrow table to packet with field metadata. + + Args: + table: PyArrow Table with metadata + + Returns: + List of packets converted from the Arrow table + """ + # Check for consistency in the semantic type mapping: + semantic_type_info = get_metadata_from_schema(table.schema, b"semantic_type") + + if semantic_type_info != self.semantic_type_lut: + if not verify_semantic_equivalence: + logger.warning( + "Arrow table semantic types do not match expected type registry. " + f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" + ) + else: + raise ValueError( + "Arrow table semantic types do not match expected type registry. 
" + f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" + ) + + # Create packets from the Arrow table + # TODO: make this more efficient + storage_packets: list[Packet] = arrow_to_dicts(table) # type: ignore + if not self.keys_with_handlers: + # no special handling required + return storage_packets + + return [Packet(self._from_storage_packet(packet)) for packet in storage_packets] + diff --git a/src/orcapod/types/packets.py b/src/orcapod/types/packets.py new file mode 100644 index 0000000..a8d8775 --- /dev/null +++ b/src/orcapod/types/packets.py @@ -0,0 +1,241 @@ +from orcapod.types.core import DataValue +from typing import TypeAlias, Any +from collections.abc import Mapping, Collection +from orcapod.types.core import TypeSpec, Tag, TypeHandler +from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types import schemas +import pyarrow as pa + +# # a packet is a mapping from string keys to data values +PacketLike: TypeAlias = Mapping[str, DataValue] + + +class Packet(dict[str, DataValue]): + def __init__( + self, + obj: PacketLike | None = None, + typespec: TypeSpec | None = None, + source_info: dict[str, str|None] | None = None + ): + if obj is None: + obj = {} + super().__init__(obj) + if typespec is None: + from orcapod.types.typespec_utils import get_typespec_from_dict + typespec = get_typespec_from_dict(self) + self._typespec = typespec + if source_info is None: + source_info = {} + self._source_info = source_info + + @property + def typespec(self) -> TypeSpec: + # consider returning a copy for immutability + return self._typespec + + @property + def source_info(self) -> dict[str, str | None]: + return {key: self._source_info.get(key, None) for key in self.keys()} + + + +# a batch is a tuple of a tag and a list of packets +Batch: TypeAlias = tuple[Tag, Collection[Packet]] + + +class SemanticPacket(dict[str, Any]): + """ + A packet that conforms to a semantic schema, mapping string keys to values. + + This is used to represent data packets in OrcaPod with semantic types. + + Attributes + ---------- + keys : str + The keys of the packet. + values : Any + The values corresponding to each key. 
+ + Examples + -------- + >>> packet = SemanticPacket(name='Alice', age=30) + >>> print(packet) + {'name': 'Alice', 'age': 30} + """ + def __init__(self, *args, semantic_schema: schemas.SemanticSchema | None = None, source_info: dict[str, str|None] | None = None, **kwargs): + super().__init__(*args, **kwargs) + self.schema = semantic_schema + if source_info is None: + source_info = {} + self.source_info = source_info + + + +class PacketConverter: + def __init__(self, python_schema: schemas.PythonSchema, registry: SemanticTypeRegistry, include_source_info: bool = True): + self.python_schema = python_schema + self.registry = registry + + self.semantic_schema = schemas.from_python_schema_to_semantic_schema( + python_schema, registry + ) + + self.include_source_info = include_source_info + + self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( + self.semantic_schema, include_source_info=self.include_source_info + ) + + + + self.key_handlers: dict[str, TypeHandler] = {} + + self.expected_key_set = set(self.python_schema.keys()) + + for key, (_, semantic_type) in self.semantic_schema.items(): + if semantic_type is None: + continue + handler = registry.get_handler_by_semantic_type(semantic_type) + if handler is None: + raise ValueError( + f"No handler found for semantic type '{semantic_type}' in key '{key}'" + ) + self.key_handlers[key] = handler + + def _check_key_consistency(self, keys): + """Check if the provided keys match the expected keys.""" + keys_set = set(keys) + if keys_set != self.expected_key_set: + missing_keys = self.expected_key_set - keys_set + extra_keys = keys_set - self.expected_key_set + error_parts = [] + if missing_keys: + error_parts.append(f"Missing keys: {missing_keys}") + if extra_keys: + error_parts.append(f"Extra keys: {extra_keys}") + + raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") + + def from_python_packet_to_semantic_packet(self, python_packet: PacketLike) -> SemanticPacket: + """Convert a Python packet to a semantic packet. + + Args: + python_packet: Dictionary mapping parameter names to Python values + + Returns: + Packet with values converted to semantic types + + Raises: + KeyError: If packet keys don't match the expected type_info keys + TypeError: If value type doesn't match expected type + ValueError: If conversion fails + """ + # Validate packet keys + semantic_packet = SemanticPacket(python_packet, semantic_schema=self.semantic_schema, source_info=getattr(python_packet, "source_info", None)) + self._check_key_consistency(set(semantic_packet.keys())) + + # convert from storage to Python types for semantic types + for key, handler in self.key_handlers.items(): + try: + semantic_packet[key] = handler.python_to_storage( + semantic_packet[key] + ) + except Exception as e: + raise ValueError(f"Failed to convert value for '{key}': {e}") from e + + return semantic_packet + + + + def from_python_packet_to_arrow_table(self, python_packet: PacketLike) -> pa.Table: + """Convert a Python packet to an Arrow table. + + Args: + python_packet: Dictionary mapping parameter names to Python values + + Returns: + Arrow table representation of the packet + """ + semantic_packet = self.from_python_packet_to_semantic_packet(python_packet) + return self.from_semantic_packet_to_arrow_table(semantic_packet) + + def from_semantic_packet_to_arrow_table(self, semantic_packet: SemanticPacket) -> pa.Table: + """Convert a semantic packet to an Arrow table. 
+ + Args: + semantic_packet: SemanticPacket with values to convert + + Returns: + Arrow table representation of the packet + """ + arrays = [] + for field in self.arrow_schema: + value = semantic_packet.get(field.name, None) + arrays.append(pa.array([value], type=field.type)) + + if self.include_source_info: + for field, value in semantic_packet.source_info.items(): + arrays.append(pa.array([value], type=pa.large_string())) + + return pa.Table.from_arrays(arrays, schema=self.arrow_schema) + + def from_arrow_table_to_semantic_packets(self, arrow_table: pa.Table) -> Collection[SemanticPacket]: + """Convert an Arrow table to a semantic packet. + + Args: + arrow_table: Arrow table representation of the packet + + Returns: + SemanticPacket with values converted from Arrow types + """ + # TODO: this is a crude check, implement more robust one to check that + # schema matches what's expected + if not arrow_table.schema.equals(self.arrow_schema): + raise ValueError("Arrow table schema does not match expected schema") + + semantic_packets_contents = arrow_table.to_pylist() + + semantic_packets = [] + for all_packet_content in semantic_packets_contents: + packet_content = {k: v for k, v in all_packet_content.items() if k in self.expected_key_set} + source_info = {k.strip('_source_info_'): v for k, v in all_packet_content.items() if k.startswith('_source_info_')} + semantic_packets.append(SemanticPacket(packet_content, _semantic_schema=self.semantic_schema, _source_info=source_info)) + + return semantic_packets + + def from_semantic_packet_to_python_packet(self, semantic_packet: SemanticPacket) -> Packet: + """Convert a semantic packet to a Python packet. + + Args: + semantic_packet: SemanticPacket with values to convert + + Returns: + Python packet representation of the semantic packet + """ + # Validate packet keys + python_packet = Packet(semantic_packet, typespec=self.python_schema, source_info=semantic_packet.source_info) + packet_keys = set(python_packet.keys()) + self._check_key_consistency(packet_keys) + + for key, handler in self.key_handlers.items(): + try: + python_packet[key] = handler.storage_to_python( + python_packet[key] + ) + except Exception as e: + raise ValueError(f"Failed to convert value for '{key}': {e}") from e + + return python_packet + + def from_arrow_table_to_python_packets(self, arrow_table: pa.Table) -> list[Packet]: + """Convert an Arrow table to a list of Python packets. 
+ + Args: + arrow_table: Arrow table representation of the packets + + Returns: + List of Python packets converted from the Arrow table + """ + semantic_packets = self.from_arrow_table_to_semantic_packets(arrow_table) + return [self.from_semantic_packet_to_python_packet(sp) for sp in semantic_packets] + diff --git a/src/orcapod/types/registry.py b/src/orcapod/types/registry.py deleted file mode 100644 index 6b56183..0000000 --- a/src/orcapod/types/registry.py +++ /dev/null @@ -1,437 +0,0 @@ -from collections.abc import Callable, Collection, Sequence, Mapping -import logging -from optparse import Values -from typing import Any -import pyarrow as pa -from orcapod.types import Packet -from .core import TypeHandler, TypeInfo, TypeSpec - -# This mapping is expected to be stable -# Be sure to test this assumption holds true -DEFAULT_ARROW_TYPE_LUT = { - int: pa.int64(), - float: pa.float64(), - str: pa.string(), - bool: pa.bool_(), -} - -logger = logging.getLogger(__name__) - - -class TypeRegistry: - """Registry that manages type handlers with semantic type names.""" - - def __init__(self): - self._handlers: dict[ - type, tuple[TypeHandler, str] - ] = {} # Type -> (Handler, semantic_name) - self._semantic_handlers: dict[str, TypeHandler] = {} # semantic_name -> Handler - - def register( - self, - semantic_name: str, - handler: TypeHandler, - explicit_types: type | tuple[type, ...] | None = None, - override: bool = False, - ): - """Register a handler with a semantic type name. - - Args: - semantic_name: Identifier for this semantic type (e.g., 'path', 'uuid') - handler: The type handler instance - explicit_types: Optional override of types to register for (if different from handler's supported_types) - override: If True, allow overriding existing registration for the same semantic name and Python type(s) - """ - # Determine which types to register for - if explicit_types is not None: - types_to_register = ( - explicit_types - if isinstance(explicit_types, tuple) - else (explicit_types,) - ) - else: - supported = handler.python_types() - types_to_register = ( - supported if isinstance(supported, tuple) else (supported,) - ) - - # Register handler for each type - for python_type in types_to_register: - if python_type in self._handlers and not override: - existing_semantic = self._handlers[python_type][1] - # TODO: handle overlapping registration more gracefully - raise ValueError( - f"Type {python_type} already registered with semantic type '{existing_semantic}'" - ) - - self._handlers[python_type] = (handler, semantic_name) - - # Register by semantic name - if semantic_name in self._semantic_handlers and not override: - raise ValueError(f"Semantic type '{semantic_name}' already registered") - - self._semantic_handlers[semantic_name] = handler - - def get_handler(self, python_type: type) -> TypeHandler | None: - """Get handler for a Python type.""" - handler_info = self._handlers.get(python_type) - return handler_info[0] if handler_info else None - - def get_semantic_name(self, python_type: type) -> str | None: - """Get semantic name for a Python type.""" - handler_info = self._handlers.get(python_type) - return handler_info[1] if handler_info else None - - def get_type_info(self, python_type: type) -> TypeInfo | None: - """Get TypeInfo for a Python type.""" - handler = self.get_handler(python_type) - if handler is None: - return None - semantic_name = self.get_semantic_name(python_type) - return TypeInfo( - python_type=python_type, - arrow_type=handler.storage_type(), - 
semantic_type=semantic_name, - handler=handler, - ) - - def get_handler_by_semantic_name(self, semantic_name: str) -> TypeHandler | None: - """Get handler by semantic name.""" - return self._semantic_handlers.get(semantic_name) - - def __contains__(self, python_type: type) -> bool: - """Check if a Python type is registered.""" - return python_type in self._handlers - - -class PacketConverter: - def __init__(self, python_type_spec: TypeSpec, registry: TypeRegistry): - self.python_type_spec = python_type_spec - self.registry = registry - - # Lookup handlers and type info for fast access - self.handlers: dict[str, TypeHandler] = {} - self.storage_type_info: dict[str, TypeInfo] = {} - - self.expected_key_set = set(python_type_spec.keys()) - - # prepare the corresponding arrow table schema with metadata - self.keys_with_handlers, self.schema = create_schema_from_python_type_info( - python_type_spec, registry - ) - - self.semantic_type_lut = get_metadata_from_schema(self.schema, b"semantic_type") - - def _check_key_consistency(self, keys): - """Check if the provided keys match the expected keys.""" - keys_set = set(keys) - if keys_set != self.expected_key_set: - missing_keys = self.expected_key_set - keys_set - extra_keys = keys_set - self.expected_key_set - error_parts = [] - if missing_keys: - error_parts.append(f"Missing keys: {missing_keys}") - if extra_keys: - error_parts.append(f"Extra keys: {extra_keys}") - - raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") - - def _to_storage_packet(self, packet: Packet) -> dict[str, Any]: - """Convert packet to storage representation. - - Args: - packet: Dictionary mapping parameter names to Python values - - Returns: - Dictionary with same keys but values converted to storage format - - Raises: - KeyError: If packet keys don't match the expected type_info keys - TypeError: If value type doesn't match expected type - ValueError: If conversion fails - """ - # Validate packet keys - packet_keys = set(packet.keys()) - - self._check_key_consistency(packet_keys) - - # Convert each value - storage_packet: dict[str, Any] = dict(packet) # Start with a copy of the packet - - for key, handler in self.keys_with_handlers: - try: - storage_packet[key] = handler.python_to_storage(storage_packet[key]) - except Exception as e: - raise ValueError(f"Failed to convert value for '{key}': {e}") from e - - return storage_packet - - def _from_storage_packet(self, storage_packet: Mapping[str, Any]) -> Packet: - """Convert storage packet back to Python packet. - - Args: - storage_packet: Dictionary with values in storage format - - Returns: - Packet with values converted back to Python types - - Raises: - KeyError: If storage packet keys don't match the expected type_info keys - TypeError: If value type doesn't match expected type - ValueError: If conversion fails - """ - # Validate storage packet keys - storage_keys = set(storage_packet.keys()) - - self._check_key_consistency(storage_keys) - - # Convert each value back to Python type - packet: Packet = dict(storage_packet) - - for key, handler in self.keys_with_handlers: - try: - packet[key] = handler.storage_to_python(storage_packet[key]) - except Exception as e: - raise ValueError(f"Failed to convert value for '{key}': {e}") from e - - return packet - - def to_arrow_table(self, packet: Packet | Sequence[Packet]) -> pa.Table: - """Convert packet to PyArrow Table with field metadata. 
- - Args: - packet: Dictionary mapping parameter names to Python values - - Returns: - PyArrow Table with the packet data as a single row - """ - # Convert packet to storage format - if not isinstance(packet, Sequence): - packets = [packet] - else: - packets = packet - - storage_packets = [self._to_storage_packet(p) for p in packets] - - # Create arrays - arrays = [] - for field in self.schema: - values = [p[field.name] for p in storage_packets] - array = pa.array(values, type=field.type) - arrays.append(array) - - return pa.Table.from_arrays(arrays, schema=self.schema) - - def from_arrow_table( - self, table: pa.Table, verify_semantic_equivalence: bool = True - ) -> list[Packet]: - """Convert Arrow table to packet with field metadata. - - Args: - table: PyArrow Table with metadata - - Returns: - List of packets converted from the Arrow table - """ - # Check for consistency in the semantic type mapping: - semantic_type_info = get_metadata_from_schema(table.schema, b"semantic_type") - - if semantic_type_info != self.semantic_type_lut: - if not verify_semantic_equivalence: - logger.warning( - "Arrow table semantic types do not match expected type registry. " - f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" - ) - else: - raise ValueError( - "Arrow table semantic types do not match expected type registry. " - f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" - ) - - # Create packets from the Arrow table - # TODO: make this more efficient - storage_packets: list[Packet] = arrow_to_dicts(table) # type: ignore - if not self.keys_with_handlers: - # no special handling required - return storage_packets - - return [self._from_storage_packet(packet) for packet in storage_packets] - - -def arrow_to_dicts(table: pa.Table) -> list[dict[str, Any]]: - """ - Convert Arrow table to dictionary or list of dictionaries. - By default returns a list of dictionaries (one per row) with column names as keys. - If `collapse_singleton` is True, return a single dictionary for single-row tables. - Args: - table: PyArrow Table to convert - collapse_singleton: If True, return a single dictionary for single-row tables. Defaults to False. - Returns: - A dictionary if singleton and collapse_singleton=True. Otherwise, list of dictionaries for multi-row tables. - """ - if len(table) == 0: - return [] - - # Multiple rows: return list of dicts (one per row) - return [ - {col_name: table.column(col_name)[i].as_py() for col_name in table.column_names} - for i in range(len(table)) - ] - - -def get_metadata_from_schema( - schema: pa.Schema, metadata_field: bytes -) -> dict[str, str]: - """ - Extract metadata from Arrow schema fields. Metadata value will be utf-8 decoded. 
- Args: - schema: PyArrow Schema to extract metadata from - metadata_field: Metadata field to extract (e.g., b'semantic_type') - Returns: - Dictionary mapping field names to their metadata values - """ - metadata = {} - for field in schema: - if field.metadata and metadata_field in field.metadata: - metadata[field.name] = field.metadata[metadata_field].decode("utf-8") - return metadata - - -def create_schema_from_python_type_info( - python_type_spec: TypeSpec, - registry: TypeRegistry, - arrow_type_lut: dict[type, pa.DataType] | None = None, -) -> tuple[list[tuple[str, TypeHandler]], pa.Schema]: - if arrow_type_lut is None: - arrow_type_lut = DEFAULT_ARROW_TYPE_LUT - keys_with_handlers: list[tuple[str, TypeHandler]] = [] - schema_fields = [] - for key, python_type in python_type_spec.items(): - type_info = registry.get_type_info(python_type) - - field_metadata = {} - if type_info and type_info.semantic_type: - field_metadata["semantic_type"] = type_info.semantic_type - keys_with_handlers.append((key, type_info.handler)) - arrow_type = type_info.arrow_type - else: - arrow_type = arrow_type_lut.get(python_type) - if arrow_type is None: - raise ValueError( - f"Direct support for Python type {python_type} is not provided. Register a handler to work with {python_type}" - ) - - schema_fields.append(pa.field(key, arrow_type, metadata=field_metadata)) - return keys_with_handlers, pa.schema(schema_fields) - - -def arrow_table_to_packets( - table: pa.Table, - registry: TypeRegistry, -) -> list[Packet]: - """Convert Arrow table to packet with field metadata. - - Args: - packet: Dictionary mapping parameter names to Python values - - Returns: - PyArrow Table with the packet data as a single row - """ - packets: list[Packet] = [] - - # prepare converter for each field - - def no_op(x) -> Any: - return x - - converter_lut = {} - for field in table.schema: - if field.metadata and b"semantic_type" in field.metadata: - semantic_type = field.metadata[b"semantic_type"].decode("utf-8") - if semantic_type: - handler = registry.get_handler_by_semantic_name(semantic_type) - if handler is None: - raise ValueError( - f"No handler registered for semantic type '{semantic_type}'" - ) - converter_lut[field.name] = handler.storage_to_python - - # Create packets from the Arrow table - # TODO: make this more efficient - for row in range(table.num_rows): - packet: Packet = {} - for field in table.schema: - value = table.column(field.name)[row].as_py() - packet[field.name] = converter_lut.get(field.name, no_op)(value) - packets.append(packet) - - return packets - - -def is_packet_supported( - python_type_info: TypeSpec, registry: TypeRegistry, type_lut: dict | None = None -) -> bool: - """Check if all types in the packet are supported by the registry or known to the default lut.""" - if type_lut is None: - type_lut = {} - return all( - python_type in registry or python_type in type_lut - for python_type in python_type_info.values() - ) - - -def create_arrow_table_with_meta( - storage_packet: dict[str, Any], type_info: dict[str, TypeInfo] -): - """Create an Arrow table with metadata from a storage packet. 
- - Args: - storage_packet: Dictionary with values in storage format - type_info: Dictionary mapping parameter names to TypeInfo objects - - Returns: - PyArrow Table with metadata - """ - schema_fields = [] - for key, type_info_obj in type_info.items(): - field_metadata = {} - if type_info_obj.semantic_type: - field_metadata["semantic_type"] = type_info_obj.semantic_type - - field = pa.field(key, type_info_obj.arrow_type, metadata=field_metadata) - schema_fields.append(field) - - schema = pa.schema(schema_fields) - - arrays = [] - for field in schema: - value = storage_packet[field.name] - array = pa.array([value], type=field.type) - arrays.append(array) - - return pa.Table.from_arrays(arrays, schema=schema) - - -def retrieve_storage_packet_from_arrow_with_meta( - arrow_table: pa.Table, -) -> dict[str, Any]: - """Retrieve storage packet from Arrow table with metadata. - - Args: - arrow_table: PyArrow Table with metadata - - Returns: - Dictionary representing the storage packet - """ - storage_packet = {} - for field in arrow_table.schema: - # Extract value from Arrow array - array = arrow_table.column(field.name) - if array.num_chunks > 0: - value = array.chunk(0).as_py()[0] # Get first value - else: - value = None # Handle empty arrays - - storage_packet[field.name] = value - - return storage_packet diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py new file mode 100644 index 0000000..4f78ca5 --- /dev/null +++ b/src/orcapod/types/schemas.py @@ -0,0 +1,267 @@ + +from orcapod.types import TypeSpec +from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from typing import Any +import pyarrow as pa +import datetime + +# This mapping is expected to be stable +# Be sure to test this assumption holds true +DEFAULT_ARROW_TYPE_LUT = { + int: pa.int64(), + float: pa.float64(), + str: pa.large_string(), + bool: pa.bool_(), +} + +def python_to_arrow_type(python_type: type) -> pa.DataType: + if python_type in DEFAULT_ARROW_TYPE_LUT: + return DEFAULT_ARROW_TYPE_LUT[python_type] + raise TypeError(f"Converstion of python type {python_type} is not supported yet") + +def arrow_to_python_type(arrow_type: pa.DataType) -> type: + if pa.types.is_integer(arrow_type): + return int + elif pa.types.is_floating(arrow_type): + return float + elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): + return str + elif pa.types.is_boolean(arrow_type): + return bool + elif pa.types.is_date(arrow_type): + return datetime.date + elif pa.types.is_timestamp(arrow_type): + return datetime.datetime + elif pa.types.is_binary(arrow_type): + return bytes + else: + raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") + + + +# class PythonSchema(dict[str, type]): +# """ +# A schema for Python data types, mapping string keys to Python types. + +# This is used to define the expected structure of data packets in OrcaPod. + +# Attributes +# ---------- +# keys : str +# The keys of the schema. +# values : type +# The types corresponding to each key. + +# Examples +# -------- +# >>> schema = PythonSchema(name=str, age=int) +# >>> print(schema) +# {'name': , 'age': } +# """ + +PythonSchema = TypeSpec + + +class SemanticSchema(dict[str, tuple[type, str|None]]): + """ + A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. + + This is used to define the expected structure of data packets with semantic types in OrcaPod. + + Attributes + ---------- + keys : str + The keys of the schema. 
+ values : tuple[type, str|None] + The types and optional semantic type corresponding to each key. + + Examples + -------- + >>> schema = SemanticSchema(image=(str, 'path'), age=(int, None)) + >>> print(schema) + {'image': (, 'path'), 'age': (, None)} + """ + def get_store_type(self, key: str) -> type | None: + """ + Get the storage type for a given key in the schema. + + Parameters + ---------- + key : str + The key for which to retrieve the storage type. + + Returns + ------- + type | None + The storage type associated with the key, or None if not found. + """ + return self.get(key, (None, None))[0] + + def get_semantic_type(self, key: str) -> str | None: + """ + Get the semantic type for a given key in the schema. + + Parameters + ---------- + key : str + The key for which to retrieve the semantic type. + + Returns + ------- + str | None + The semantic type associated with the key, or None if not found. + """ + return self.get(key, (None, None))[1] + + +def from_python_schema_to_semantic_schema( + python_schema: PythonSchema, + semantic_type_registry: SemanticTypeRegistry, +) -> SemanticSchema: + """ + Convert a Python schema to a semantic schema using the provided semantic type registry. + + Parameters + ---------- + python_schema : PythonSchema + The schema to convert, mapping keys to Python types. + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. + + Returns + ------- + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + + Examples + -------- + >>> python_schema = PythonSchema(name=str, age=int) + >>> semantic_schema = from_python_schema_to_semantic_schema(python_schema, registry) + >>> print(semantic_schema) + {'name': (, None), 'age': (, None)} + """ + semantic_schema = {} + for key, python_type in python_schema.items(): + if python_type in semantic_type_registry: + type_info = semantic_type_registry.get_type_info(python_type) + assert type_info is not None, f"Type {python_type} should be found in the registry as `in` returned True" + semantic_schema[key] = (type_info.storage_type, type_info.semantic_type) + else: + semantic_schema[key] = (python_type, None) + return SemanticSchema(semantic_schema) + +def from_semantic_schema_to_python_schema( + semantic_schema: SemanticSchema, + semantic_type_registry: SemanticTypeRegistry, +) -> PythonSchema: + """ + Convert a semantic schema to a Python schema using the provided semantic type registry. + + Parameters + ---------- + semantic_schema : SemanticSchema + The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. 
+ + Examples + -------- + >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) + >>> python_schema = from_semantic_schema_to_python_schema(semantic_schema, registry) + >>> print(python_schema) + {'name': , 'age': } + """ + python_schema = {} + for key, (python_type, semantic_type) in semantic_schema.items(): + if semantic_type is not None: + # If the semantic type is registered, use the corresponding Python type + python_type = semantic_type_registry.get_python_type(semantic_type) + python_schema[key] = python_type + return python_schema + +def from_semantic_schema_to_arrow_schema( + semantic_schema: SemanticSchema, + include_source_info: bool = True, +) -> pa.Schema: + """ + Convert a semantic schema to an Arrow schema. + + Parameters + ---------- + semantic_schema : SemanticSchema + The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. + + Returns + ------- + dict[str, type] + A new schema mapping keys to Arrow-compatible types. + + Examples + -------- + >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) + >>> arrow_schema = from_semantic_schema_to_arrow_schema(semantic_schema) + >>> print(arrow_schema) + {'name': str, 'age': int} + """ + fields = [] + for field_name, (python_type, semantic_type) in semantic_schema.items(): + arrow_type = DEFAULT_ARROW_TYPE_LUT[python_type] + field_metadata = {b"semantic_type": semantic_type.encode('utf-8')} if semantic_type else {} + fields.append(pa.field(field_name, arrow_type, metadata=field_metadata)) + + if include_source_info: + for field in semantic_schema: + field_metadata = {b'field_type': b'source_info'} + fields.append(pa.field(f'_source_info_{field}', pa.large_string(), metadata=field_metadata)) + + return pa.schema(fields) + +def from_arrow_schema_to_semantic_schema( + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry | None = None, +) -> SemanticSchema: + """ + Convert an Arrow schema to a semantic schema. + + Parameters + ---------- + arrow_schema : pa.Schema + The schema to convert, containing fields with metadata. + + Returns + ------- + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + + Examples + -------- + >>> arrow_schema = pa.schema([pa.field('name', pa.string(), metadata={'semantic_type': 'name'}), + ... 
pa.field('age', pa.int64(), metadata={'semantic_type': 'age'})]) + >>> semantic_schema = from_arrow_schema_to_semantic_schema(arrow_schema) + >>> print(semantic_schema) + {'name': (str, 'name'), 'age': (int, 'age')} + """ + semantic_schema = {} + for field in arrow_schema: + if field.metadata.get(b'field_type', b'') == b'source_info': + # Skip source info fields + continue + semantic_type = field.metadata.get(b'semantic_type', None) + semantic_type = semantic_type.decode() if semantic_type else None + if semantic_type: + if semantic_type_registry is None: + raise ValueError("Semantic type registry must be provided for semantic types") + python_type = semantic_type_registry.get_python_type(semantic_type) + if python_type is None: + raise ValueError(f"Semantic type '{semantic_type}' is not registered in the registry") + else: + python_type = arrow_to_python_type(field.type) + + semantic_schema[field.name] = (python_type, semantic_type) + return SemanticSchema(semantic_schema) + diff --git a/src/orcapod/types/handlers.py b/src/orcapod/types/semantic_type_handlers.py similarity index 92% rename from src/orcapod/types/handlers.py rename to src/orcapod/types/semantic_type_handlers.py index ecbdfba..a15f9d5 100644 --- a/src/orcapod/types/handlers.py +++ b/src/orcapod/types/semantic_type_handlers.py @@ -9,7 +9,7 @@ class PathHandler: """Handler for pathlib.Path objects, stored as strings.""" - def python_types(self) -> type: + def python_type(self) -> type: return Path def storage_type(self) -> pa.DataType: @@ -25,7 +25,7 @@ def storage_to_python(self, value: str) -> Path | None: class UUIDHandler: """Handler for UUID objects, stored as strings.""" - def python_types(self) -> type: + def python_type(self) -> type: return UUID def storage_type(self) -> pa.DataType: @@ -41,7 +41,7 @@ def storage_to_python(self, value: str) -> UUID | None: class DecimalHandler: """Handler for Decimal objects, stored as strings.""" - def python_types(self) -> type: + def python_type(self) -> type: return Decimal def storage_type(self) -> pa.DataType: @@ -61,7 +61,7 @@ def __init__(self, python_type: type, arrow_type: pa.DataType): self._python_type = python_type self._arrow_type = arrow_type - def python_types(self) -> type: + def python_type(self) -> type: return self._python_type def storage_type(self) -> pa.DataType: @@ -80,7 +80,7 @@ class DirectArrowHandler: def __init__(self, arrow_type: pa.DataType): self._arrow_type = arrow_type - def python_types(self) -> type: + def python_type(self) -> type: return self._arrow_type def storage_type(self) -> pa.DataType: @@ -96,7 +96,7 @@ def storage_to_python(self, value: Any) -> Any: class DateTimeHandler: """Handler for datetime objects.""" - def python_types(self) -> tuple[type, ...]: + def python_type(self) -> type: return (datetime, date, time) # Handles multiple related types def storage_type(self) -> pa.DataType: diff --git a/src/orcapod/types/semantic_type_registry.py b/src/orcapod/types/semantic_type_registry.py new file mode 100644 index 0000000..d954891 --- /dev/null +++ b/src/orcapod/types/semantic_type_registry.py @@ -0,0 +1,468 @@ +from collections.abc import Callable, Collection, Sequence, Mapping +import logging +from optparse import Values +from typing import Any, get_origin, get_args +from types import UnionType +import pyarrow as pa +from orcapod.types.packets import Packet, PacketLike +from .core import TypeHandler, TypeSpec +from dataclasses import dataclass + +# This mapping is expected to be stable +# Be sure to test this assumption holds true 
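# --- Illustrative sketch (not part of this patch): a custom handler following the
# --- renamed python_type()/storage_type()/storage_to_python() protocol shown in
# --- semantic_type_handlers.py above. The class name, the IPv4 example, and the
# --- python_to_storage() method name are assumptions for illustration only.
from ipaddress import IPv4Address

import pyarrow as pa


class IPv4Handler:
    """Hypothetical handler: IPv4 addresses stored as strings."""

    def python_type(self) -> type:
        return IPv4Address

    def storage_type(self) -> pa.DataType:
        return pa.string()

    def python_to_storage(self, value: IPv4Address) -> str | None:
        return str(value) if value is not None else None

    def storage_to_python(self, value: str) -> IPv4Address | None:
        return IPv4Address(value) if value is not None else None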
+DEFAULT_ARROW_TYPE_LUT = { + int: pa.int64(), + float: pa.float64(), + str: pa.string(), + bool: pa.bool_(), +} + +logger = logging.getLogger(__name__) + + +# TODO: reconsider the need for this dataclass as its information is superfluous +# to the registration of the handler into the registry. +@dataclass +class TypeInfo: + python_type: type + storage_type: type + semantic_type: str | None # name under which the type is registered + handler: "TypeHandler" + + +class SemanticTypeRegistry: + """Registry that manages type handlers with semantic type names.""" + + def __init__(self): + self._handlers: dict[ + type, tuple[TypeHandler, str] + ] = {} # PythonType -> (Handler, semantic_name) + self._semantic_handlers: dict[str, TypeHandler] = {} # semantic_name -> Handler + self._semantic_to_python_lut: dict[str, type] = {} # semantic_name -> Python type + + def register( + self, + semantic_type: str, + handler: TypeHandler, + ): + """Register a handler with a semantic type name. + + Args: + semantic_name: Identifier for this semantic type (e.g., 'path', 'uuid') + handler: The type handler instance + explicit_types: Optional override of types to register for (if different from handler's supported_types) + override: If True, allow overriding existing registration for the same semantic name and Python type(s) + """ + # Determine which types to register for + + python_type = handler.python_type() + + # Register handler for each type + if python_type in self._handlers: + existing_semantic = self._handlers[python_type][1] + # TODO: handle overlapping registration more gracefully + raise ValueError( + f"Type {python_type} already registered with semantic type '{existing_semantic}'" + ) + + # Register by semantic name + if semantic_type in self._semantic_handlers: + raise ValueError(f"Semantic type '{semantic_type}' already registered") + + self._handlers[python_type] = (handler, semantic_type) + self._semantic_handlers[semantic_type] = handler + self._semantic_to_python_lut[semantic_type] = python_type + + def get_python_type(self, semantic_type: str) -> type | None: + """Get Python type for a semantic type.""" + return self._semantic_to_python_lut.get(semantic_type) + + + + def get_semantic_type(self, python_type: type) -> str | None: + """Get semantic type for a Python type.""" + handler_info = self._handlers.get(python_type) + return handler_info[1] if handler_info else None + + def get_handler(self, python_type: type) -> TypeHandler | None: + """Get handler for a Python type.""" + handler_info = self._handlers.get(python_type) + return handler_info[0] if handler_info else None + + def get_handler_by_semantic_type(self, semantic_type: str) -> TypeHandler | None: + """Get handler by semantic type.""" + return self._semantic_handlers.get(semantic_type) + + + def get_type_info(self, python_type: type) -> TypeInfo | None: + """Get TypeInfo for a Python type.""" + handler = self.get_handler(python_type) + if handler is None: + return None + semantic_type = self.get_semantic_type(python_type) + return TypeInfo( + python_type=python_type, + storage_type=handler.storage_type(), + semantic_type=semantic_type, + handler=handler, + ) + + + def __contains__(self, python_type: type) -> bool: + """Check if a Python type is registered.""" + return python_type in self._handlers + + + + + + +# Below is a collection of functions that handles converting between various aspects of Python packets and Arrow tables. 
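# --- Illustrative usage sketch (not part of this patch) for the SemanticTypeRegistry
# --- defined above; it assumes the modules land under the names introduced in this
# --- patch (orcapod.types.semantic_type_registry / semantic_type_handlers).
from pathlib import Path

from orcapod.types.semantic_type_handlers import PathHandler
from orcapod.types.semantic_type_registry import SemanticTypeRegistry

registry = SemanticTypeRegistry()
registry.register("path", PathHandler())           # semantic name -> handler

assert Path in registry                             # __contains__ checks the Python type
assert registry.get_semantic_type(Path) == "path"
assert registry.get_python_type("path") is Path

info = registry.get_type_info(Path)                 # TypeInfo with storage/semantic details
handler = registry.get_handler_by_semantic_type("path")

# Registering a second handler for Path, or reusing the name "path", raises ValueError.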
+# Here for convenience, any Python dictionary with str keys and supported Python values are referred to as a packet. + + +# Conversions are: +# python packet <-> storage packet <-> arrow table +# python typespec <-> storage typespec <-> arrow schema +# +# python packet <-> storage packet requires the use of SemanticTypeRegistry +# conversion between storage packet <-> arrow table requires info about semantic_type + + +# # Storage packet <-> Arrow table + +# def stroage_typespec_to_arrow_schema(storage_typespec:TypeSpec, semantic_type_info: dict[str, str]|None = None) -> pa.Schema: +# """Convert storage typespec to Arrow Schema with semantic_type metadata.""" +# """Convert storage typespec to PyArrow Schema with semantic_type metadata.""" +# if semantic_type_info is None: +# semantic_type_info = {} + +# fields = [] +# for field_name, field_type in storage_typespec.items(): +# arrow_type = python_to_pyarrow_type(field_type) +# semantic_type = semantic_type_info.get(field_name, None) +# field_metadata = {"semantic_type": semantic_type} if semantic_type else {} +# fields.append(pa.field(field_name, arrow_type, metadata=field_metadata)) +# return pa.schema(fields) + +# def arrow_schema_to_storage_typespec(schema: pa.Schema) -> tuple[TypeSpec, dict[str, str]|None]: +# """Convert Arrow Schema to storage typespec and semantic type metadata.""" +# typespec = {} +# semantic_type_info = {} + +# for field in schema: +# field_type = field.type +# typespec[field.name] = field_type.to_pandas_dtype() # Convert Arrow type to Pandas dtype +# if field.metadata and b"semantic_type" in field.metadata: +# semantic_type_info[field.name] = field.metadata[b"semantic_type"].decode("utf-8") + +# return typespec, semantic_type_info + + +# def storage_packet_to_arrow_table( +# storage_packet: PacketLike, +# typespec: TypeSpec | None = None, +# semantic_type_info: dict[str, str] | None = None, + + + +# # TypeSpec + TypeRegistry + ArrowLUT -> Arrow Schema (annotated with semantic_type) + +# # + + + + + + +# # TypeSpec <-> Arrow Schema + +# def schema_from_typespec(typespec: TypeSpec, registry: SemanticTypeRegistry, metadata_info: dict | None = None) -> pa.Schema: +# """Convert TypeSpec to PyArrow Schema.""" +# if metadata_info is None: +# metadata_info = {} + +# fields = [] +# for field_name, field_type in typespec.items(): +# type_info = registry.get_type_info(field_type) +# if type_info is None: +# raise ValueError(f"No type info registered for {field_type}") +# fields.append(pa.field(field_name, type_info.arrow_type, metadata={ +# "semantic_type": type_info.semantic_type +# })) +# return pa.schema(fields) + +# def create_schema_from_typespec( +# typespec: TypeSpec, +# registry: SemanticTypeRegistry, +# metadata_info: dict | None = None, +# arrow_type_lut: dict[type, pa.DataType] | None = None, +# ) -> tuple[list[tuple[str, TypeHandler]], pa.Schema]: +# if metadata_info is None: +# metadata_info = {} +# if arrow_type_lut is None: +# arrow_type_lut = DEFAULT_ARROW_TYPE_LUT + +# keys_with_handlers: list[tuple[str, TypeHandler]] = [] +# schema_fields = [] +# for key, python_type in typespec.items(): +# type_info = registry.get_type_info(python_type) + +# field_metadata = {} +# if type_info and type_info.semantic_type: +# field_metadata["semantic_type"] = type_info.semantic_type +# keys_with_handlers.append((key, type_info.handler)) +# arrow_type = type_info.arrow_type +# else: +# arrow_type = arrow_type_lut.get(python_type) +# if arrow_type is None: +# raise ValueError( +# f"Direct support for Python type {python_type} 
is not provided. Register a handler to work with {python_type}" +# ) + +# schema_fields.append(pa.field(key, arrow_type, metadata=field_metadata)) +# return keys_with_handlers, pa.schema(schema_fields) + + + +# def arrow_table_to_packets( +# table: pa.Table, +# registry: SemanticTypeRegistry, +# ) -> list[Packet]: +# """Convert Arrow table to packet with field metadata. + +# Args: +# packet: Dictionary mapping parameter names to Python values + +# Returns: +# PyArrow Table with the packet data as a single row +# """ +# packets: list[Packet] = [] + +# # prepare converter for each field + +# def no_op(x) -> Any: +# return x + +# converter_lut = {} +# for field in table.schema: +# if field.metadata and b"semantic_type" in field.metadata: +# semantic_type = field.metadata[b"semantic_type"].decode("utf-8") +# if semantic_type: +# handler = registry.get_handler_by_semantic_name(semantic_type) +# if handler is None: +# raise ValueError( +# f"No handler registered for semantic type '{semantic_type}'" +# ) +# converter_lut[field.name] = handler.storage_to_python + +# # Create packets from the Arrow table +# # TODO: make this more efficient +# for row in range(table.num_rows): +# packet: Packet = Packet() +# for field in table.schema: +# value = table.column(field.name)[row].as_py() +# packet[field.name] = converter_lut.get(field.name, no_op)(value) +# packets.append(packet) + +# return packets + + +# def create_arrow_table_with_meta( +# storage_packet: dict[str, Any], type_info: dict[str, TypeInfo] +# ): +# """Create an Arrow table with metadata from a storage packet. + +# Args: +# storage_packet: Dictionary with values in storage format +# type_info: Dictionary mapping parameter names to TypeInfo objects + +# Returns: +# PyArrow Table with metadata +# """ +# schema_fields = [] +# for key, type_info_obj in type_info.items(): +# field_metadata = {} +# if type_info_obj.semantic_type: +# field_metadata["semantic_type"] = type_info_obj.semantic_type + +# field = pa.field(key, type_info_obj.arrow_type, metadata=field_metadata) +# schema_fields.append(field) + +# schema = pa.schema(schema_fields) + +# arrays = [] +# for field in schema: +# value = storage_packet[field.name] +# array = pa.array([value], type=field.type) +# arrays.append(array) + +# return pa.Table.from_arrays(arrays, schema=schema) + + +# def retrieve_storage_packet_from_arrow_with_meta( +# arrow_table: pa.Table, +# ) -> dict[str, Any]: +# """Retrieve storage packet from Arrow table with metadata. 
+ +# Args: +# arrow_table: PyArrow Table with metadata + +# Returns: +# Dictionary representing the storage packet +# """ +# storage_packet = {} +# for field in arrow_table.schema: +# # Extract value from Arrow array +# array = arrow_table.column(field.name) +# if array.num_chunks > 0: +# value = array.chunk(0).as_py()[0] # Get first value +# else: +# value = None # Handle empty arrays + +# storage_packet[field.name] = value + +# return storage_packet + +# def typespec_to_schema_with_metadata(typespec: TypeSpec, field_metadata: dict|None = None) -> pa.Schema: +# """Convert TypeSpec to PyArrow Schema""" +# fields = [] +# for field_name, field_type in typespec.items(): +# arrow_type = python_to_pyarrow_type(field_type) +# fields.append(pa.field(field_name, arrow_type)) +# return pa.schema(fields) + +# def python_to_pyarrow_type(python_type: type, strict:bool=True) -> pa.DataType: +# """Convert Python type (including generics) to PyArrow type""" +# # For anywhere we need to store str value, we use large_string as is done in Polars + +# # Handle basic types first +# basic_mapping = { +# int: pa.int64(), +# float: pa.float64(), +# str: pa.large_string(), +# bool: pa.bool_(), +# bytes: pa.binary(), +# } + +# if python_type in basic_mapping: +# return basic_mapping[python_type] + +# # Handle generic types +# origin = get_origin(python_type) +# args = get_args(python_type) + +# if origin is list: +# # Handle list[T] +# if args: +# element_type = python_to_pyarrow_type(args[0]) +# return pa.list_(element_type) +# else: +# return pa.list_(pa.large_string()) # default to list of strings + +# elif origin is dict: +# # Handle dict[K, V] - PyArrow uses map type +# if len(args) == 2: +# key_type = python_to_pyarrow_type(args[0]) +# value_type = python_to_pyarrow_type(args[1]) +# return pa.map_(key_type, value_type) +# else: +# # Otherwise default to using long string +# return pa.map_(pa.large_string(), pa.large_string()) + +# elif origin is UnionType: +# # Handle Optional[T] (Union[T, None]) +# if len(args) == 2 and type(None) in args: +# non_none_type = args[0] if args[1] is type(None) else args[1] +# return python_to_pyarrow_type(non_none_type) + +# # Default fallback +# if not strict: +# logger.warning(f"Unsupported type {python_type}, defaulting to large_string") +# return pa.large_string() +# else: +# raise TypeError(f"Unsupported type {python_type} for PyArrow conversion. " +# "Set strict=False to allow fallback to large_string.") + +# def arrow_to_dicts(table: pa.Table) -> list[dict[str, Any]]: +# """ +# Convert Arrow table to dictionary or list of dictionaries. +# Returns a list of dictionaries (one per row) with column names as keys. +# Args: +# table: PyArrow Table to convert +# Returns: +# A list of dictionaries for multi-row tables. +# """ +# if len(table) == 0: +# return [] + +# # Multiple rows: return list of dicts (one per row) +# return [ +# {col_name: table.column(col_name)[i].as_py() for col_name in table.column_names} +# for i in range(len(table)) +# ] + +# def get_metadata_from_schema( +# schema: pa.Schema, metadata_field: bytes +# ) -> dict[str, str]: +# """ +# Extract metadata from Arrow schema fields. Metadata value will be utf-8 decoded. 
+# Args: +# schema: PyArrow Schema to extract metadata from +# metadata_field: Metadata field to extract (e.g., b'semantic_type') +# Returns: +# Dictionary mapping field names to their metadata values +# """ +# metadata = {} +# for field in schema: +# if field.metadata and metadata_field in field.metadata: +# metadata[field.name] = field.metadata[metadata_field].decode("utf-8") +# return metadata + +# def dict_to_arrow_table_with_metadata(data: dict, data_type_info: TypeSpec | None = None, metadata: dict | None = None): +# """ +# Convert a tag dictionary to PyArrow table with metadata on each column. + +# Args: +# tag: Dictionary with string keys and any Python data type values +# metadata_key: The metadata key to add to each column +# metadata_value: The metadata value to indicate this column came from tag +# """ +# if metadata is None: +# metadata = {} + +# if field_types is None: +# # First create the table to infer types +# temp_table = pa.Table.from_pylist([data]) + +# # Create new fields with metadata +# fields_with_metadata = [] +# for field in temp_table.schema: +# # Add metadata to each field +# field_metadata = metadata +# new_field = pa.field( +# field.name, field.type, nullable=field.nullable, metadata=field_metadata +# ) +# fields_with_metadata.append(new_field) + +# # Create schema with metadata +# schema_with_metadata = pa.schema(fields_with_metadata) + +# # Create the final table with the metadata-enriched schema +# table = pa.Table.from_pylist([tag], schema=schema_with_metadata) + +# return table + + +# # def get_columns_with_metadata( +# # df: pl.DataFrame, key: str, value: str | None = None +# # ) -> list[str]: +# # """Get column names with specific metadata using list comprehension. If value is given, only +# # columns matching that specific value for the desginated metadata key will be returned. +# # Otherwise, all columns that contains the key as metadata will be returned regardless of the value""" +# # return [ +# # col_name +# # for col_name, dtype in df.schema.items() +# # if hasattr(dtype, "metadata") +# # and (value is None or getattr(dtype, "metadata") == value) +# # ] diff --git a/src/orcapod/types/typespec.py b/src/orcapod/types/typespec_utils.py similarity index 83% rename from src/orcapod/types/typespec.py rename to src/orcapod/types/typespec_utils.py index eb5be89..0786d10 100644 --- a/src/orcapod/types/typespec.py +++ b/src/orcapod/types/typespec_utils.py @@ -1,8 +1,7 @@ # Library of functions for working with TypeSpecs and for extracting TypeSpecs from a function's signature - -from collections.abc import Callable, Collection, Sequence -from typing import get_origin, get_args +from collections.abc import Callable, Collection, Sequence, Mapping +from typing import get_origin, get_args, Any from .core import TypeSpec import inspect import logging @@ -213,3 +212,57 @@ def extract_function_typespecs( f"Type for return item '{key}' is not specified in output_types and has no type annotation in function signature." ) return param_info, inferred_output_types + + + +def get_typespec_from_dict(dict: Mapping) -> TypeSpec: + """ + Returns a TypeSpec for the given dictionary. + The TypeSpec is a mapping from field name to Python type. 
+ """ + return {key: type(value) for key, value in dict.items()} + + +def get_compatible_type(type1: Any, type2: Any) -> Any: + if type1 is type2: + return type1 + if issubclass(type1, type2): + return type2 + if issubclass(type2, type1): + return type1 + raise TypeError(f"Types {type1} and {type2} are not compatible") + + +def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: + if left is None: + return right + if right is None: + return left + # Merge the two TypeSpecs but raise an error if conflicts in types are found + merged = dict(left) + for key, right_type in right.items(): + merged[key] = ( + get_compatible_type(merged[key], right_type) + if key in merged + else right_type + ) + return merged + +def intersection_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: + """ + Returns the intersection of two TypeSpecs, only returning keys that are present in both. + If a key is present in both TypeSpecs, the type must be the same. + """ + if left is None or right is None: + return None + # Find common keys and ensure types match + common_keys = set(left.keys()).intersection(set(right.keys())) + intersection = {} + for key in common_keys: + try: + intersection[key] = get_compatible_type(left[key], right[key]) + except TypeError: + # If types are not compatible, raise an error + raise TypeError(f"Type conflict for key '{key}': {left[key]} vs {right[key]}") + + return intersection \ No newline at end of file diff --git a/src/orcapod/utils/stream_utils.py b/src/orcapod/utils/stream_utils.py index 95703c8..5c5bb62 100644 --- a/src/orcapod/utils/stream_utils.py +++ b/src/orcapod/utils/stream_utils.py @@ -12,23 +12,6 @@ V = TypeVar("V") -def get_typespec(dict: Mapping) -> TypeSpec: - """ - Returns a TypeSpec for the given dictionary. - The TypeSpec is a mapping from field name to Python type. - """ - return {key: type(value) for key, value in dict.items()} - - -def get_compatible_type(type1: Any, type2: Any) -> Any: - if type1 is type2: - return type1 - if issubclass(type1, type2): - return type2 - if issubclass(type2, type1): - return type1 - raise TypeError(f"Types {type1} and {type2} are not compatible") - def merge_dicts(left: dict[K, V], right: dict[K, V]) -> dict[K, V]: merged = left.copy() @@ -43,39 +26,6 @@ def merge_dicts(left: dict[K, V], right: dict[K, V]) -> dict[K, V]: return merged -def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: - if left is None: - return right - if right is None: - return left - # Merge the two TypeSpecs but raise an error if conflicts in types are found - merged = dict(left) - for key, right_type in right.items(): - merged[key] = ( - get_compatible_type(merged[key], right_type) - if key in merged - else right_type - ) - return merged - -def intersection_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: - """ - Returns the intersection of two TypeSpecs, only returning keys that are present in both. - If a key is present in both TypeSpecs, the type must be the same. 
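# --- Worked example (sketch, not part of this patch) for the typespec utilities moved
# --- into orcapod/types/typespec_utils.py above. Because bool is a subclass of int,
# --- get_compatible_type() resolves the overlapping key to the broader type.
from orcapod.types.typespec_utils import intersection_typespecs, union_typespecs

left = {"sample_id": str, "count": bool}
right = {"count": int, "score": float}

assert union_typespecs(left, right) == {"sample_id": str, "count": int, "score": float}
assert intersection_typespecs(left, right) == {"count": int}

# Incompatible overlaps raise: union_typespecs({"x": str}, {"x": int}) -> TypeError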
- """ - if left is None or right is None: - return None - # Find common keys and ensure types match - common_keys = set(left.keys()).intersection(set(right.keys())) - intersection = {} - for key in common_keys: - try: - intersection[key] = get_compatible_type(left[key], right[key]) - except TypeError: - # If types are not compatible, raise an error - raise TypeError(f"Type conflict for key '{key}': {left[key]} vs {right[key]}") - - return intersection def common_elements(*values) -> Collection[str]: diff --git a/tests/test_hashing/test_composite_hasher.py b/tests/test_hashing/test_composite_hasher.py deleted file mode 100644 index f92cfea..0000000 --- a/tests/test_hashing/test_composite_hasher.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -"""Tests for the CompositeFileHasher implementation.""" - -from unittest.mock import patch - -import pytest - -from orcapod.hashing.legacy_core import hash_to_hex -from orcapod.hashing.file_hashers import BasicFileHasher, DefaultCompositeFileHasher -from orcapod.hashing.types import FileHasher, PacketHasher, PathSetHasher - - -# Custom implementation of hash_file for tests that doesn't check for file existence -def mock_hash_file(file_path, algorithm="sha256", buffer_size=65536) -> str: - """Mock implementation of hash_file that doesn't check for file existence.""" - # Simply return a deterministic hash based on the file path - return hash_to_hex(f"mock_file_hash_{file_path}_{algorithm}") - - -# Custom implementation of hash_pathset for tests that doesn't check for file existence -def mock_hash_pathset( - pathset, algorithm="sha256", buffer_size=65536, char_count=32, file_hasher=None -): - """Mock implementation of hash_pathset that doesn't check for file existence.""" - from collections.abc import Collection - from os import PathLike - from pathlib import Path - - # If file_hasher is None, we'll need to handle it differently - if file_hasher is None: - # Just return a mock hash for testing - if isinstance(pathset, (str, Path, PathLike)): - return f"mock_{pathset}" - return "mock_hash" - - # Handle dictionary case for nested paths - if isinstance(pathset, dict): - hash_dict = {} - for key, value in pathset.items(): - hash_dict[key] = mock_hash_pathset( - value, algorithm, buffer_size, char_count, file_hasher - ) - return hash_to_hex(str(hash_dict)) - - # Handle collection case (list, set, etc.) 
- if isinstance(pathset, Collection) and not isinstance( - pathset, (str, Path, PathLike) - ): - hash_list = [] - for item in pathset: - hash_list.append( - mock_hash_pathset(item, algorithm, buffer_size, char_count, file_hasher) - ) - return hash_to_hex(str(hash_list)) - - # Handle simple string or Path case - if isinstance(pathset, (str, Path, PathLike)): - if hasattr(file_hasher, "__self__"): # For bound methods - return file_hasher(str(pathset)) - else: - return file_hasher(str(pathset)) - - return "mock_hash" - - -# Custom implementation of hash_packet for tests that doesn't check for file existence -def mock_hash_packet( - packet, - algorithm="sha256", - buffer_size=65536, - char_count=32, - prefix_algorithm=True, - pathset_hasher=None, -): - """Mock implementation of hash_packet that doesn't check for file existence.""" - # Create a simple hash based on the packet structure - hash_value = hash_to_hex(str(packet)) - - # Format it like the real function would - if prefix_algorithm and algorithm: - return ( - f"{algorithm}-{hash_value[: char_count if char_count else len(hash_value)]}" - ) - else: - return hash_value[: char_count if char_count else len(hash_value)] - - -@pytest.fixture(autouse=True) -def patch_hash_functions(): - """Patch the hash functions in the core module for all tests.""" - with ( - patch("orcapod.hashing.core.hash_file", side_effect=mock_hash_file), - patch("orcapod.hashing.core.hash_pathset", side_effect=mock_hash_pathset), - patch("orcapod.hashing.core.hash_packet", side_effect=mock_hash_packet), - ): - yield - - -def test_default_composite_hasher_implements_all_protocols(): - """Test that CompositeFileHasher implements all three protocols.""" - # Create a basic file hasher to be used within the composite hasher - file_hasher = BasicFileHasher() - - # Create the composite hasher - composite_hasher = DefaultCompositeFileHasher(file_hasher) - - # Verify it implements all three protocols - assert isinstance(composite_hasher, FileHasher) - assert isinstance(composite_hasher, PathSetHasher) - assert isinstance(composite_hasher, PacketHasher) - - -def test_default_composite_hasher_file_hashing(): - """Test CompositeFileHasher's file hashing functionality.""" - # We can use a mock path since our mocks don't require real files - file_path = "/path/to/mock_file.txt" - - # Create a custom mock file hasher - class MockFileHasher: - def hash_file(self, file_path): - return mock_hash_file(file_path) - - file_hasher = MockFileHasher() - composite_hasher = DefaultCompositeFileHasher(file_hasher) - - # Get hash from the composite hasher and directly from the file hasher - direct_hash = file_hasher.hash_file(file_path) - composite_hash = composite_hasher.hash_file(file_path) - - # The hashes should be identical - assert direct_hash == composite_hash - - -def test_default_composite_hasher_pathset_hashing(): - """Test CompositeFileHasher's path set hashing functionality.""" - - # Create a custom mock file hasher that doesn't check for file existence - class MockFileHasher: - def hash_file(self, file_path): - return mock_hash_file(file_path) - - file_hasher = MockFileHasher() - composite_hasher = DefaultCompositeFileHasher(file_hasher) - - # Simple path set with non-existent paths - pathset = ["/path/to/file1.txt", "/path/to/file2.txt"] - - # Hash the pathset - result = composite_hasher.hash_pathset(pathset) - - # The result should be a string hash - assert isinstance(result, str) - - -if __name__ == "__main__": - pytest.main(["-v", __file__]) diff --git 
a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index 6fd2add..191da89 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -1,7 +1,6 @@ #!/usr/bin/env python """Tests for TransferDataStore.""" -import json from pathlib import Path import pytest diff --git a/tests/test_types/test_inference/test_extract_function_data_types.py b/tests/test_types/test_inference/test_extract_function_data_types.py index e96fd9c..8ae1ea5 100644 --- a/tests/test_types/test_inference/test_extract_function_data_types.py +++ b/tests/test_types/test_inference/test_extract_function_data_types.py @@ -11,7 +11,7 @@ import pytest from collections.abc import Collection -from orcapod.types.typespec import extract_function_typespecs +from orcapod.types.typespec_utils import extract_function_typespecs class TestExtractFunctionDataTypes: From a3ba1723d40c0cb8b16d95567ba16eacaf6b2a1f Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 1 Jul 2025 00:52:46 +0000 Subject: [PATCH 026/224] feat: add field source tracking --- src/orcapod/core/base.py | 2 +- src/orcapod/core/pod.py | 2 +- src/orcapod/pipeline/wrappers.py | 200 ++++++++++++-------- src/orcapod/types/__init__.py | 4 +- src/orcapod/types/packets.py | 42 ++-- src/orcapod/types/schemas.py | 139 ++++++++++---- src/orcapod/types/semantic_type_handlers.py | 44 ++--- src/orcapod/types/semantic_type_registry.py | 23 ++- 8 files changed, 278 insertions(+), 178 deletions(-) diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 9a30873..7c9a299 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -10,7 +10,7 @@ from orcapod.hashing import ContentIdentifiableBase from orcapod.types import Packet, Tag, TypeSpec -from orcapod.types.typespec import get_typespec_from_dict +from orcapod.types.typespec_utils import get_typespec_from_dict import logging diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index eb880b4..4271887 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -9,7 +9,7 @@ from orcapod.types import Packet, Tag, TypeSpec, default_registry from orcapod.types.typespec_utils import extract_function_typespecs -from orcapod.types.semantic_type_registry import PacketConverter +from orcapod.types.packets import PacketConverter from orcapod.hashing import ( FunctionInfoExtractor, diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py index 4396223..c12f40a 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/wrappers.py @@ -1,15 +1,14 @@ from orcapod.core.pod import Pod, FunctionPod from orcapod.core import SyncStream, Source, Kernel from orcapod.store import ArrowDataStore -from orcapod.types import Tag, Packet, TypeSpec, default_registry +from orcapod.types import Tag, Packet, PacketLike, TypeSpec, default_registry from orcapod.types.typespec_utils import get_typespec_from_dict, union_typespecs, extract_function_typespecs -from orcapod.types.semantic_type_registry import create_arrow_table_with_meta +from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types import packets, schemas from orcapod.hashing import ObjectHasher, ArrowHasher from orcapod.hashing.defaults import get_default_object_hasher, get_default_arrow_hasher from typing import Any, Literal from collections.abc import Collection, Iterator -from orcapod.types.semantic_type_registry import TypeRegistry -from orcapod.types.packet_converter import PacketConverter 
import pyarrow as pa import polars as pl from orcapod.core.streams import SyncStreamFromGenerator @@ -18,12 +17,15 @@ logger = logging.getLogger(__name__) +def get_tag_typespec(tag: Tag) -> dict[str, type]: + return {k: str for k in tag} class PolarsSource(Source): - def __init__(self, df: pl.DataFrame, tag_keys: Collection[str] | None = None): + def __init__(self, df: pl.DataFrame, tag_keys: Collection[str], packet_keys: Collection[str]|None = None): self.df = df self.tag_keys = tag_keys + self.packet_keys = packet_keys def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if len(streams) != 0: @@ -31,19 +33,25 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: "PolarsSource does not support forwarding streams. " "It generates its own stream from the DataFrame." ) - return PolarsStream(self.df, self.tag_keys) + return PolarsStream(self.df, self.tag_keys, self.packet_keys) class PolarsStream(SyncStream): - def __init__(self, df: pl.DataFrame, tag_keys: Collection[str]): + def __init__(self, df: pl.DataFrame, tag_keys: Collection[str], packet_keys: Collection[str] | None = None): self.df = df - self.tag_keys = tag_keys + self.tag_keys = tuple(tag_keys) + self.packet_keys = tuple(packet_keys) if packet_keys is not None else None def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - for row in self.df.iter_rows(named=True): + df = self.df + # if self.packet_keys is not None: + # df = df.select(self.tag_keys + self.packet_keys) + for row in df.iter_rows(named=True): tag = {key: row[key] for key in self.tag_keys} - packet = {key: val for key, val in row.items() if key not in self.tag_keys} - yield tag, Packet(packet) + packet = {key: val for key, val in row.items() if key not in self.tag_keys and not key.startswith("_source_info_")} + # TODO: revisit and fix this rather hacky implementation + source_info = {key.removeprefix("_source_info_"):val for key, val in row.items() if key.startswith("_source_info_")} + yield tag, Packet(packet, source_info=source_info) class EmptyStream(SyncStream): @@ -134,6 +142,13 @@ def claims_unique_tags( *resolved_streams, trigger_run=trigger_run ) + + + def post_call(self, tag: Tag, packet: Packet) -> None: ... + + def output_iterator_completion_hook(self) -> None: ... 
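# --- Sketch (not part of this patch) of the "_source_info_" column convention that
# --- PolarsStream above relies on: a flat row is split into tag fields, packet fields,
# --- and per-field provenance. The helper name and example values are hypothetical.
SOURCE_PREFIX = "_source_info_"


def split_row(row: dict, tag_keys: tuple[str, ...]) -> tuple[dict, dict, dict]:
    tag = {k: row[k] for k in tag_keys}
    packet = {
        k: v
        for k, v in row.items()
        if k not in tag_keys and not k.startswith(SOURCE_PREFIX)
    }
    source_info = {
        k.removeprefix(SOURCE_PREFIX): v
        for k, v in row.items()
        if k.startswith(SOURCE_PREFIX)
    }
    return tag, packet, source_info


row = {"subject": "s01", "image": "/data/s01.png", "_source_info_image": "pod-abc:image"}
tag, packet, source_info = split_row(row, ("subject",))
# tag == {"subject": "s01"}, packet == {"image": "/data/s01.png"},
# source_info == {"image": "pod-abc:image"}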
+ + class CachedKernelWrapper(KernelInvocationWrapper, Source): """ @@ -154,7 +169,7 @@ def __init__( output_store: ArrowDataStore, kernel_hasher: ObjectHasher | None = None, arrow_packet_hasher: ArrowHasher | None = None, - packet_type_registry: TypeRegistry | None = None, + packet_type_registry: SemanticTypeRegistry | None = None, **kwargs, ) -> None: super().__init__(kernel, input_streams, **kwargs) @@ -172,9 +187,7 @@ def __init__( packet_type_registry = default_registry self._packet_type_registry = packet_type_registry - self.source_info = self.label, self.kernel_hasher.hash_to_hex(self.kernel) - self.tag_keys, self.packet_keys = self.keys(trigger_run=False) - self.output_converter = None + self.update_cached_values() self._cache_computed = False @@ -203,70 +216,75 @@ def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): def update_cached_values(self): self.source_info = self.label, self.kernel_hasher.hash_to_hex(self.kernel) self.tag_keys, self.packet_keys = self.keys(trigger_run=False) - self.output_converter = None + self.tag_typespec, self.packet_typespec = self.types(trigger_run=False) + if self.tag_typespec is None or self.packet_typespec is None: + raise ValueError("Currently, cached kernel wrapper can only work with kernels that have typespecs defined.") + # TODO: clean up and make it unnecessary to convert packet typespec + packet_schema = schemas.PythonSchema(self.packet_typespec) + joined_typespec = union_typespecs(self.tag_typespec, packet_schema.with_source_info) + if joined_typespec is None: + raise ValueError( + "Joined typespec should not be None. " + "This may happen if the tag typespec and packet typespec are incompatible." + ) + # Add any additional fields to the output converter here + self.output_converter = packets.PacketConverter(joined_typespec, registry=self.registry, include_source_info=False) + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: logger.info(f"Returning cached outputs for {self}") if self.df is not None: - return PolarsStream(self.df, tag_keys=self.tag_keys) + if self.tag_keys is None: + raise ValueError( + "CachedKernelWrapper has no tag keys defined, cannot return PolarsStream" + ) + source_info_sig = ':'.join(self.source_info) + return PolarsStream(self.df, tag_keys=self.tag_keys, packet_keys=self.packet_keys) else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.packet_keys) resolved_streams = self.resolve_input_streams(*streams) - output_stream = self.kernel.forward(*resolved_streams, **kwargs) - tag_typespec, packet_typespec = output_stream.types(trigger_run=False) - if tag_typespec is not None and packet_typespec is not None: - joined_type = union_typespecs(tag_typespec, packet_typespec) - assert joined_type is not None, "Joined typespec should not be None" - all_type = dict(joined_type) - for k in packet_typespec: - all_type[f'_source_{k}'] = str - # - self.output_converter = PacketConverter(all_type, registry=self.registry) - # Cache the output stream of the underlying kernel # If an entry with same tag and packet already exists in the output store, # it will not be added again, thus avoiding duplicates. 
def generator() -> Iterator[tuple[Tag, Packet]]: logger.info(f"Computing and caching outputs for {self}") for tag, packet in output_stream: - merged_info = {**tag, **packet} - # add entries for source_info - for k, v in packet.source_info.items(): - merged_info[f'_source_{k}'] = v - - if self.output_converter is None: - # TODO: cleanup logic here - joined_type = get_typespec_from_dict(merged_info) - assert joined_type is not None, "Joined typespec should not be None" - all_type = dict(joined_type) - for k in packet: - all_type[f'_source_{k}'] = str - self.output_converter = PacketConverter( - all_type, registry=self.registry - ) - - # add entries for source_info - for k, v in packet.source_info.items(): - merged_info[f'_source_{k}'] = v - - output_table = self.output_converter.to_arrow_table(merged_info) - # TODO: revisit this logic - output_id = self.arrow_hasher.hash_table(output_table) - if not self.output_store.get_record(*self.source_info, output_id): - self.output_store.add_record( - *self.source_info, - output_id, - output_table, - ) + self.post_call(tag, packet) yield tag, packet - self._cache_computed = True + self.output_iterator_completion_hook() + + logger.info(f"Results cached for {self}") + self._cache_computed = True return SyncStreamFromGenerator(generator) + def post_call(self, tag: Tag, packet: Packet) -> None: + # Cache the output stream of the underlying kernel + # If an entry with same tag and packet already exists in the output store, + # it will not be added again, thus avoiding duplicates. + merged_info = {**tag, **packet.get_composite()} + output_table = self.output_converter.from_python_packet_to_arrow_table(merged_info) + # TODO: revisit this logic + output_id = self.arrow_hasher.hash_table(output_table) + if not self.output_store.get_record(*self.source_info, output_id): + self.output_store.add_record( + *self.source_info, + output_id, + output_table, + ) + + def output_iterator_completion_hook(self) -> None: + """ + Hook to be called when the generator is completed. + """ + logger.info(f"Results cached for {self}") + self._cache_computed = True + + @property def lazy_df(self) -> pl.LazyFrame | None: return self.output_store.get_all_records_as_polars(*self.source_info) @@ -333,7 +351,7 @@ def __init__( error_handling: Literal["raise", "ignore", "warn"] = "raise", object_hasher: ObjectHasher | None = None, arrow_hasher: ArrowHasher | None = None, - registry: TypeRegistry | None = None, + registry: SemanticTypeRegistry | None = None, **kwargs, ) -> None: super().__init__( @@ -391,11 +409,11 @@ def arrow_hasher(self, arrow_hasher: ArrowHasher | None = None): self.update_cached_values() @property - def registry(self) -> TypeRegistry: + def registry(self) -> SemanticTypeRegistry: return self._registry @registry.setter - def registry(self, registry: TypeRegistry | None = None): + def registry(self, registry: SemanticTypeRegistry | None = None): if registry is None: registry = default_registry self._registry = registry @@ -405,11 +423,29 @@ def registry(self, registry: TypeRegistry | None = None): def update_cached_values(self) -> None: self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod) self.tag_keys, self.output_keys = self.keys(trigger_run=False) + if self.tag_keys is None or self.output_keys is None: + raise ValueError( + "Currently, cached function pod wrapper can only work with function pods that have keys defined." 
+ ) + self.all_keys = tuple(self.tag_keys) + tuple(self.output_keys) + self.tag_typespec, self.output_typespec = self.types(trigger_run=False) + if self.tag_typespec is None or self.output_typespec is None: + raise ValueError( + "Currently, cached function pod wrapper can only work with function pods that have typespecs defined." + ) self.input_typespec, self.output_typespec = ( self.function_pod.get_function_typespecs() ) - self.input_converter = PacketConverter(self.input_typespec, self.registry) - self.output_converter = PacketConverter(self.output_typespec, self.registry) + + self.input_converter = packets.PacketConverter(self.input_typespec, self.registry, include_source_info=False) + self.output_converter = packets.PacketConverter(self.output_typespec, self.registry, include_source_info=True) + + input_packet_source_typespec = {f'_source_info_{k}': str for k in self.input_typespec} + + # prepare typespec for tag record: __packet_key, tag, input packet source_info, + tag_record_typespec = {"__packet_key": str, **self.tag_typespec, **input_packet_source_typespec} + self.tag_record_converter = packets.PacketConverter(tag_record_typespec, self.registry, include_source_info=False) + def reset_cache(self): self._cache_computed = False @@ -425,14 +461,17 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: logger.info(f"Returning cached outputs for {self}") if self.df is not None: - return PolarsStream(self.df, self.tag_keys) + if self.tag_keys is None: + raise ValueError("Tag keys are not set, cannot return PolarsStream") + + return PolarsStream(self.df, self.tag_keys, packet_keys=self.output_keys) else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.output_keys) logger.info(f"Computing and caching outputs for {self}") return super().forward(*streams, **kwargs) def get_packet_key(self, packet: Packet) -> str: - return self.arrow_hasher.hash_table(self.input_converter.to_arrow_table(packet)) + return self.arrow_hasher.hash_table(self.input_converter.from_python_packet_to_arrow_table(packet)) @property def source_info(self): @@ -455,11 +494,9 @@ def _add_pipeline_record_with_packet_key(self, tag: Tag, packet_key: str, packet combined_info = dict(tag) # ensure we don't modify the original tag combined_info["__packet_key"] = packet_key for k, v in packet_source_info.items(): - combined_info[f'__{k}_source'] = v + combined_info[f'_source_info_{k}'] = v - # TODO: consider making this more efficient - # convert tag to arrow table - columns are labeled with metadata source=tag - table = create_arrow_table_with_meta(combined_info, {"source": "tag"}) + table = self.tag_record_converter.from_python_packet_to_arrow_table(combined_info) entry_hash = self.arrow_hasher.hash_table(table) @@ -492,7 +529,7 @@ def _retrieve_memoized_with_packet_key(self, packet_key: str) -> Packet | None: ) if arrow_table is None: return None - packets = self.function_pod.output_converter.from_arrow_table(arrow_table) + packets = self.output_converter.from_arrow_table_to_python_packets(arrow_table) # since memoizing single packet, it should only contain one packet assert len(packets) == 1, ( f"Memoizing single packet return {len(packets)} packets!" @@ -509,10 +546,10 @@ def memoize( Returns the memoized packet. 
""" logger.debug("Memoizing packet") - return self._memoize_with_packet_key(self.get_packet_key(packet), output_packet) + return self._memoize_with_packet_key(self.get_packet_key(packet), output_packet.get_composite()) def _memoize_with_packet_key( - self, packet_key: str, output_packet: Packet + self, packet_key: str, output_packet: PacketLike ) -> Packet: """ Memoize the output packet in the data store, looking up by packet key. @@ -521,11 +558,11 @@ def _memoize_with_packet_key( logger.debug(f"Memoizing packet with key {packet_key}") # TODO: this logic goes through the entire store and retrieve cycle with two conversions # consider simpler alternative - packets = self.output_converter.from_arrow_table( + packets = self.output_converter.from_arrow_table_to_python_packets( self.output_store.add_record( *self.source_info, packet_key, - self.output_converter.to_arrow_table(output_packet), + self.output_converter.from_python_packet_to_arrow_table(output_packet), ) ) # since passed in a single packet, it should only return a single packet @@ -535,9 +572,7 @@ def _memoize_with_packet_key( packet = packets[0] # TODO: reconsider the right place to attach this information # attach provenance information - packet_source_id = ":".join(self.source_info + (packet_key,)) - source_info = {k: f'{packet_source_id}:{k}' for k in packet} - return Packet(packet, source_info=source_info) + return Packet(packet) def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: @@ -567,6 +602,10 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: if output_packet is not None and not self.skip_memoization: # output packet may be modified by the memoization process # e.g. if the output is a file, the path may be changed + # add source info to the output packet + source_info = {k: '-'.join(self.source_info) + "-" + packet_key for k in output_packet.source_info} + # TODO: fix and make this not access protected field directly + output_packet.source_info = source_info output_packet = self._memoize_with_packet_key(packet_key, output_packet) # type: ignore if output_packet is None: @@ -593,7 +632,7 @@ def get_all_tags(self, with_packet_id: bool = False) -> pl.LazyFrame | None: return data.drop("__packet_key") if data is not None else None return data - def get_all_entries_with_tags(self) -> pl.LazyFrame | None: + def get_all_entries_with_tags(self, keep_hidden_fields: bool = False) -> pl.LazyFrame | None: """ Retrieve all entries from the tag store with their associated tags. Returns a DataFrame with columns for tag and packet key. @@ -612,9 +651,12 @@ def get_all_entries_with_tags(self) -> pl.LazyFrame | None: if result_packets is None: return None - return pl.concat([tag_records, result_packets], how="horizontal").drop( + pl_df = pl.concat([tag_records, result_packets], how="horizontal").drop( ["__packet_key"] ) + if not keep_hidden_fields: + pl_df = pl_df.select(self.all_keys) + return pl_df @property def df(self) -> pl.DataFrame | None: diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index a4615f5..03a3b4b 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,5 +1,5 @@ from .core import Tag, PathLike, PathSet, PodFunction, TypeSpec -from .packets import Packet +from .packets import Packet, PacketLike from .semantic_type_registry import SemanticTypeRegistry from .semantic_type_handlers import PathHandler, UUIDHandler, DateTimeHandler from . 
import semantic_type_handlers @@ -20,7 +20,7 @@ "default_registry", "Tag", "Packet", - "PacketLike" + "PacketLike", "TypeSpec", "PathLike", "PathSet", diff --git a/src/orcapod/types/packets.py b/src/orcapod/types/packets.py index a8d8775..4a3b192 100644 --- a/src/orcapod/types/packets.py +++ b/src/orcapod/types/packets.py @@ -36,6 +36,16 @@ def typespec(self) -> TypeSpec: @property def source_info(self) -> dict[str, str | None]: return {key: self._source_info.get(key, None) for key in self.keys()} + + @source_info.setter + def source_info(self, source_info: Mapping[str, str | None]): + self._source_info = {key: value for key, value in source_info.items() if value is not None} + + def get_composite(self) -> PacketLike: + composite = self.copy() + for k, v in self.source_info.items(): + composite[f"_source_info_{k}"] = v + return composite @@ -69,15 +79,20 @@ def __init__(self, *args, semantic_schema: schemas.SemanticSchema | None = None, source_info = {} self.source_info = source_info + def get_composite(self) -> dict[str, Any]: + composite = self.copy() + for k, v in self.source_info.items(): + composite[f"_source_info_{k}"] = v + return composite class PacketConverter: - def __init__(self, python_schema: schemas.PythonSchema, registry: SemanticTypeRegistry, include_source_info: bool = True): - self.python_schema = python_schema + def __init__(self, typespec: TypeSpec, registry: SemanticTypeRegistry, include_source_info: bool = True): + self.typespec = typespec self.registry = registry - self.semantic_schema = schemas.from_python_schema_to_semantic_schema( - python_schema, registry + self.semantic_schema = schemas.from_typespec_to_semantic_schema( + typespec, registry ) self.include_source_info = include_source_info @@ -90,7 +105,7 @@ def __init__(self, python_schema: schemas.PythonSchema, registry: SemanticTypeRe self.key_handlers: dict[str, TypeHandler] = {} - self.expected_key_set = set(self.python_schema.keys()) + self.expected_key_set = set(self.typespec.keys()) for key, (_, semantic_type) in self.semantic_schema.items(): if semantic_type is None: @@ -168,16 +183,11 @@ def from_semantic_packet_to_arrow_table(self, semantic_packet: SemanticPacket) - Returns: Arrow table representation of the packet """ - arrays = [] - for field in self.arrow_schema: - value = semantic_packet.get(field.name, None) - arrays.append(pa.array([value], type=field.type)) - if self.include_source_info: - for field, value in semantic_packet.source_info.items(): - arrays.append(pa.array([value], type=pa.large_string())) + return pa.Table.from_pylist([semantic_packet.get_composite()], schema=self.arrow_schema) + else: + return pa.Table.from_pylist([semantic_packet], schema=self.arrow_schema) - return pa.Table.from_arrays(arrays, schema=self.arrow_schema) def from_arrow_table_to_semantic_packets(self, arrow_table: pa.Table) -> Collection[SemanticPacket]: """Convert an Arrow table to a semantic packet. 
@@ -198,8 +208,8 @@ def from_arrow_table_to_semantic_packets(self, arrow_table: pa.Table) -> Collect semantic_packets = [] for all_packet_content in semantic_packets_contents: packet_content = {k: v for k, v in all_packet_content.items() if k in self.expected_key_set} - source_info = {k.strip('_source_info_'): v for k, v in all_packet_content.items() if k.startswith('_source_info_')} - semantic_packets.append(SemanticPacket(packet_content, _semantic_schema=self.semantic_schema, _source_info=source_info)) + source_info = {k.removeprefix('_source_info_'): v for k, v in all_packet_content.items() if k.startswith('_source_info_')} + semantic_packets.append(SemanticPacket(packet_content, semantic_schema=self.semantic_schema, source_info=source_info)) return semantic_packets @@ -213,7 +223,7 @@ def from_semantic_packet_to_python_packet(self, semantic_packet: SemanticPacket) Python packet representation of the semantic packet """ # Validate packet keys - python_packet = Packet(semantic_packet, typespec=self.python_schema, source_info=semantic_packet.source_info) + python_packet = Packet(semantic_packet, typespec=self.typespec, source_info=semantic_packet.source_info) packet_keys = set(python_packet.keys()) self._check_key_consistency(packet_keys) diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py index 4f78ca5..19e8a3b 100644 --- a/src/orcapod/types/schemas.py +++ b/src/orcapod/types/schemas.py @@ -39,27 +39,37 @@ def arrow_to_python_type(arrow_type: pa.DataType) -> type: -# class PythonSchema(dict[str, type]): -# """ -# A schema for Python data types, mapping string keys to Python types. +class PythonSchema(dict[str, type]): + """ + A schema for Python data types, mapping string keys to Python types. -# This is used to define the expected structure of data packets in OrcaPod. + This is used to define the expected structure of data packets in OrcaPod. -# Attributes -# ---------- -# keys : str -# The keys of the schema. -# values : type -# The types corresponding to each key. + Attributes + ---------- + keys : str + The keys of the schema. + values : type + The types corresponding to each key. -# Examples -# -------- -# >>> schema = PythonSchema(name=str, age=int) -# >>> print(schema) -# {'name': , 'age': } -# """ + Examples + -------- + >>> schema = PythonSchema(name=str, age=int) + >>> print(schema) + {'name': , 'age': } + """ + @property + def with_source_info(self) -> dict[str, type]: + """ + Get the schema with source info fields included. + + Returns + ------- + dict[str, type|None] + A new schema including source info fields. + """ + return {**self, **{f'_source_info_{k}': str for k in self.keys()}} -PythonSchema = TypeSpec class SemanticSchema(dict[str, tuple[type, str|None]]): @@ -112,10 +122,42 @@ def get_semantic_type(self, key: str) -> str | None: The semantic type associated with the key, or None if not found. """ return self.get(key, (None, None))[1] + + @property + def storage_schema(self) -> PythonSchema: + """ + Get the storage schema, which is a PythonSchema representation of the semantic schema. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + return PythonSchema({k: v[0] for k, v in self.items()}) + + + @property + def storage_schema_with_source_info(self) -> dict[str, type]: + """ + Get the storage schema with source info fields included. + + Returns + ------- + dict[str, type] + A new schema including source info fields. 
+ + Examples + -------- + >>> semantic_schema = SemanticSchema(name=(str, 'name'), age=(int, None)) + >>> storage_schema = semantic_schema.storage_schema_with_source_info + >>> print(storage_schema) + {'name': , 'age': , '_source_info_name': , '_source_info_age': } + """ + return self.storage_schema.with_source_info -def from_python_schema_to_semantic_schema( - python_schema: PythonSchema, +def from_typespec_to_semantic_schema( + typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry, ) -> SemanticSchema: """ @@ -123,8 +165,8 @@ def from_python_schema_to_semantic_schema( Parameters ---------- - python_schema : PythonSchema - The schema to convert, mapping keys to Python types. + typespec : TypeSpec + The typespec to convert, mapping keys to Python types. semantic_type_registry : SemanticTypeRegistry The registry containing semantic type information. @@ -135,13 +177,13 @@ def from_python_schema_to_semantic_schema( Examples -------- - >>> python_schema = PythonSchema(name=str, age=int) - >>> semantic_schema = from_python_schema_to_semantic_schema(python_schema, registry) + >>> typespec: TypeSpec = dict(name=str, age=int) + >>> semantic_schema = from_typespec_to_semanticn_schema(typespec, registry) >>> print(semantic_schema) {'name': (, None), 'age': (, None)} """ semantic_schema = {} - for key, python_type in python_schema.items(): + for key, python_type in typespec.items(): if python_type in semantic_type_registry: type_info = semantic_type_registry.get_type_info(python_type) assert type_info is not None, f"Type {python_type} should be found in the registry as `in` returned True" @@ -176,13 +218,13 @@ def from_semantic_schema_to_python_schema( >>> print(python_schema) {'name': , 'age': } """ - python_schema = {} + python_schema_content = {} for key, (python_type, semantic_type) in semantic_schema.items(): if semantic_type is not None: # If the semantic type is registered, use the corresponding Python type python_type = semantic_type_registry.get_python_type(semantic_type) - python_schema[key] = python_type - return python_schema + python_schema_content[key] = python_type + return PythonSchema(python_schema_content) def from_semantic_schema_to_arrow_schema( semantic_schema: SemanticSchema, @@ -223,7 +265,6 @@ def from_semantic_schema_to_arrow_schema( def from_arrow_schema_to_semantic_schema( arrow_schema: pa.Schema, - semantic_type_registry: SemanticTypeRegistry | None = None, ) -> SemanticSchema: """ Convert an Arrow schema to a semantic schema. 
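For illustration (a sketch mirroring the docstring examples above, not code from the patch): a semantic schema can be projected down to plain storage types, with or without the provenance columns; the registry below is assumed to be a configured SemanticTypeRegistry.

# illustrative sketch of the schema helpers
semantic_schema = SemanticSchema(name=(str, "name"), age=(int, None))

storage = semantic_schema.storage_schema
# PythonSchema({'name': str, 'age': int})

storage_with_provenance = semantic_schema.storage_schema_with_source_info
# {'name': str, 'age': int, '_source_info_name': str, '_source_info_age': str}

# the same kind of semantic schema can be derived from a plain typespec:
# semantic_schema = from_typespec_to_semantic_schema({"name": str, "age": int}, registry)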
@@ -253,15 +294,39 @@ def from_arrow_schema_to_semantic_schema( continue semantic_type = field.metadata.get(b'semantic_type', None) semantic_type = semantic_type.decode() if semantic_type else None - if semantic_type: - if semantic_type_registry is None: - raise ValueError("Semantic type registry must be provided for semantic types") - python_type = semantic_type_registry.get_python_type(semantic_type) - if python_type is None: - raise ValueError(f"Semantic type '{semantic_type}' is not registered in the registry") - else: - python_type = arrow_to_python_type(field.type) - + python_type = arrow_to_python_type(field.type) semantic_schema[field.name] = (python_type, semantic_type) return SemanticSchema(semantic_schema) +def from_typespec_to_arrow_schema(typespec: TypeSpec, + semantic_type_registry: SemanticTypeRegistry, include_source_info: bool = True) -> pa.Schema: + semantic_schema = from_typespec_to_semantic_schema(typespec, semantic_type_registry) + return from_semantic_schema_to_arrow_schema(semantic_schema, include_source_info=include_source_info) + + +def from_arrow_schema_to_python_schema( + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry, +) -> PythonSchema: + """ + Convert an Arrow schema to a Python schema. + + Parameters + ---------- + arrow_schema : pa.Schema + The schema to convert, containing fields with metadata. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + + Examples + -------- + >>> arrow_schema = pa.schema([pa.field('name', pa.string()), pa.field('age', pa.int64())]) + >>> python_schema = from_arrow_schema_to_python_schema(arrow_schema) + >>> print(python_schema) + {'name': , 'age': } + """ + semantic_schema = from_arrow_schema_to_semantic_schema(arrow_schema) + return from_semantic_schema_to_python_schema(semantic_schema, semantic_type_registry) \ No newline at end of file diff --git a/src/orcapod/types/semantic_type_handlers.py b/src/orcapod/types/semantic_type_handlers.py index a15f9d5..b3bc70c 100644 --- a/src/orcapod/types/semantic_type_handlers.py +++ b/src/orcapod/types/semantic_type_handlers.py @@ -12,8 +12,8 @@ class PathHandler: def python_type(self) -> type: return Path - def storage_type(self) -> pa.DataType: - return pa.string() + def storage_type(self) -> type: + return str def python_to_storage(self, value: Path) -> str: return str(value) @@ -28,8 +28,8 @@ class UUIDHandler: def python_type(self) -> type: return UUID - def storage_type(self) -> pa.DataType: - return pa.string() + def storage_type(self) -> type: + return str def python_to_storage(self, value: UUID) -> str: return str(value) @@ -44,8 +44,8 @@ class DecimalHandler: def python_type(self) -> type: return Decimal - def storage_type(self) -> pa.DataType: - return pa.string() + def storage_type(self) -> type: + return str def python_to_storage(self, value: Decimal) -> str: return str(value) @@ -57,34 +57,14 @@ def storage_to_python(self, value: str) -> Decimal | None: class SimpleMappingHandler: """Handler for basic types that map directly to Arrow.""" - def __init__(self, python_type: type, arrow_type: pa.DataType): + def __init__(self, python_type: type): self._python_type = python_type - self._arrow_type = arrow_type def python_type(self) -> type: return self._python_type - def storage_type(self) -> pa.DataType: - return self._arrow_type - - def python_to_storage(self, value: Any) -> Any: - return value # Direct mapping - - def storage_to_python(self, value: Any) -> Any: - return value # Direct mapping - - -class DirectArrowHandler: 
- """Handler for types that map directly to Arrow without conversion.""" - - def __init__(self, arrow_type: pa.DataType): - self._arrow_type = arrow_type - - def python_type(self) -> type: - return self._arrow_type - - def storage_type(self) -> pa.DataType: - return self._arrow_type + def storage_type(self) -> type: + return self._python_type def python_to_storage(self, value: Any) -> Any: return value # Direct mapping @@ -97,10 +77,10 @@ class DateTimeHandler: """Handler for datetime objects.""" def python_type(self) -> type: - return (datetime, date, time) # Handles multiple related types + return datetime - def storage_type(self) -> pa.DataType: - return pa.timestamp("us") # Store everything as timestamp + def storage_type(self) -> type: + return datetime def python_to_storage(self, value: datetime | date | time) -> Any: if isinstance(value, datetime): diff --git a/src/orcapod/types/semantic_type_registry.py b/src/orcapod/types/semantic_type_registry.py index d954891..d5a677f 100644 --- a/src/orcapod/types/semantic_type_registry.py +++ b/src/orcapod/types/semantic_type_registry.py @@ -1,11 +1,6 @@ -from collections.abc import Callable, Collection, Sequence, Mapping import logging -from optparse import Values -from typing import Any, get_origin, get_args -from types import UnionType import pyarrow as pa -from orcapod.types.packets import Packet, PacketLike -from .core import TypeHandler, TypeSpec +from .core import TypeHandler from dataclasses import dataclass # This mapping is expected to be stable @@ -77,16 +72,21 @@ def get_python_type(self, semantic_type: str) -> type | None: """Get Python type for a semantic type.""" return self._semantic_to_python_lut.get(semantic_type) - + def lookup_handler_info(self, python_type: type) -> tuple[TypeHandler, str] | None: + """Lookup handler info for a Python type.""" + for registered_type, (handler, semantic_type) in self._handlers.items(): + if issubclass(python_type, registered_type): + return (handler, semantic_type) + return None def get_semantic_type(self, python_type: type) -> str | None: """Get semantic type for a Python type.""" - handler_info = self._handlers.get(python_type) + handler_info = self.lookup_handler_info(python_type) return handler_info[1] if handler_info else None def get_handler(self, python_type: type) -> TypeHandler | None: """Get handler for a Python type.""" - handler_info = self._handlers.get(python_type) + handler_info = self.lookup_handler_info(python_type) return handler_info[0] if handler_info else None def get_handler_by_semantic_type(self, semantic_type: str) -> TypeHandler | None: @@ -110,7 +110,10 @@ def get_type_info(self, python_type: type) -> TypeInfo | None: def __contains__(self, python_type: type) -> bool: """Check if a Python type is registered.""" - return python_type in self._handlers + for registered_type in self._handlers: + if issubclass(python_type, registered_type): + return True + return False From d3b66de700871a6b0b2c6166ba0fe18e4613db2c Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 1 Jul 2025 01:24:14 +0000 Subject: [PATCH 027/224] feat: support map and join on packets with source info --- src/orcapod/core/operators.py | 14 ++--------- src/orcapod/types/packets.py | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index c68f34f..c26dc2d 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -262,11 +262,7 @@ def generator() -> Iterator[tuple[Tag, Packet]]: for left_tag, left_packet in left_stream_buffered: for right_tag, right_packet in right_stream_buffered: if (joined_tag := join_tags(left_tag, right_tag)) is not None: - if not check_packet_compatibility(left_packet, right_packet): - raise ValueError( - f"Packets are not compatible: {left_packet} and {right_packet}" - ) - yield joined_tag, Packet({**left_packet, **right_packet}) + yield joined_tag, left_packet.join(right_packet) return SyncStreamFromGenerator(generator) @@ -399,13 +395,7 @@ def forward(self, *streams: SyncStream) -> SyncStream: def generator(): for tag, packet in stream: - if self.drop_unmapped: - packet = Packet({ - v: packet[k] for k, v in self.key_map.items() if k in packet - }) - else: - packet = Packet({self.key_map.get(k, k): v for k, v in packet.items()}) - yield tag, packet + yield tag, packet.map_keys(self.key_map, self.drop_unmapped) return SyncStreamFromGenerator(generator) diff --git a/src/orcapod/types/packets.py b/src/orcapod/types/packets.py index 4a3b192..a6621ee 100644 --- a/src/orcapod/types/packets.py +++ b/src/orcapod/types/packets.py @@ -46,7 +46,54 @@ def get_composite(self) -> PacketLike: for k, v in self.source_info.items(): composite[f"_source_info_{k}"] = v return composite + + def map_keys(self, mapping: Mapping[str, str], drop_unmapped: bool=False) -> 'Packet': + """ + Map the keys of the packet using the provided mapping. + + Args: + mapping: A dictionary mapping old keys to new keys. + + Returns: + A new Packet with keys mapped according to the provided mapping. + """ + if drop_unmapped: + new_content = { + v: self[k] for k, v in mapping.items() if k in self + } + new_typespec = { + v: self.typespec[k] for k, v in mapping.items() if k in self.typespec + } + new_source_info = { + v: self.source_info[k] for k, v in mapping.items() if k in self.source_info + } + else: + new_content = {mapping.get(k, k): v for k, v in self.items()} + new_typespec = {mapping.get(k, k): v for k, v in self.typespec.items()} + new_source_info = {mapping.get(k, k): v for k, v in self.source_info.items()} + return Packet(new_content, typespec=new_typespec, source_info=new_source_info) + + def join(self, other: 'Packet') -> 'Packet': + """ + Join another packet to this one, merging their keys and values. + + Args: + other: Another Packet to join with this one. + + Returns: + A new Packet with keys and values from both packets. + """ + # make sure there is no key collision + if not set(self.keys()).isdisjoint(other.keys()): + raise ValueError(f"Key collision detected: packets {self} and {other} have overlapping keys" + " and cannot be joined without losing information.") + + new_content = {**self, **other} + new_typespec = {**self.typespec, **other.typespec} + new_source_info = {**self.source_info, **other.source_info} + + return Packet(new_content, typespec=new_typespec, source_info=new_source_info) # a batch is a tuple of a tag and a list of packets From 0bafbaa08d6c534b5a1b53293d7ed5e5c0384e71 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 1 Jul 2025 01:24:41 +0000 Subject: [PATCH 028/224] fix: keep all columns internally --- src/orcapod/core/streams.py | 2 ++ src/orcapod/pipeline/wrappers.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index 33f6b78..21060b1 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -96,6 +96,7 @@ def __iter__(self) -> Iterator[tuple[Tag, Packet]]: if not self.check_consistency: yield from self.generator_factory() + # TODO: add typespec handling def keys( self, *, trigger_run: bool = False ) -> tuple[Collection[str] | None, Collection[str] | None]: @@ -103,3 +104,4 @@ def keys( return super().keys(trigger_run=trigger_run) # If the keys are already set, return them return self.tag_keys.copy(), self.packet_keys.copy() + \ No newline at end of file diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py index c12f40a..e999714 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/wrappers.py @@ -460,11 +460,11 @@ def generator_completion_hook(self, n_computed: int) -> None: def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: logger.info(f"Returning cached outputs for {self}") - if self.df is not None: + lazy_df = self.get_all_entries_with_tags(keep_hidden_fields=True) + if lazy_df is not None: if self.tag_keys is None: raise ValueError("Tag keys are not set, cannot return PolarsStream") - - return PolarsStream(self.df, self.tag_keys, packet_keys=self.output_keys) + return PolarsStream(lazy_df.collect(), self.tag_keys, packet_keys=self.output_keys) else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.output_keys) logger.info(f"Computing and caching outputs for {self}") From 6321467e88cd4f7b4e5cc9ebaf55fe24dfb21498 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 1 Jul 2025 03:54:42 +0000 Subject: [PATCH 029/224] wip: update legacy file related tests and rename to stores --- src/orcapod/__init__.py | 8 +- src/orcapod/core/pod_legacy.py | 2 +- src/orcapod/hashing/__init__.py | 8 +- src/orcapod/hashing/defaults.py | 6 +- src/orcapod/hashing/file_hashers.py | 104 ++-- src/orcapod/hashing/legacy_core.py | 4 +- src/orcapod/hashing/types.py | 60 +- src/orcapod/pipeline/__init__.py | 5 + src/orcapod/pipeline/pipeline.py | 15 +- src/orcapod/pipeline/wrappers.py | 31 +- src/orcapod/{store => stores}/__init__.py | 0 .../{store => stores}/arrow_data_stores.py | 49 +- .../stores/delta_table_arrow_data_store.py | 559 ++++++++++++++++++ .../{store => stores}/dict_data_stores.py | 26 +- src/orcapod/{store => stores}/file_ops.py | 0 .../optimized_memory_store.py | 0 .../{store => stores}/safe_dir_data_store.py | 0 .../{store => stores}/transfer_data_store.py | 16 +- src/orcapod/{store => stores}/types.py | 27 +- .../test_basic_composite_hasher.py | 20 +- tests/test_hashing/test_cached_file_hasher.py | 6 +- tests/test_hashing/test_hasher_factory.py | 86 +-- tests/test_hashing/test_hasher_parity.py | 12 +- .../test_legacy_composite_hasher.py | 156 +++++ tests/test_hashing/test_packet_hasher.py | 38 +- tests/test_hashing/test_path_set_hasher.py | 20 +- tests/test_store/test_dir_data_store.py | 28 +- tests/test_store/test_integration.py | 6 +- tests/test_store/test_noop_data_store.py | 4 +- tests/test_store/test_transfer_data_store.py | 8 +- 30 files changed, 1041 insertions(+), 263 deletions(-) create mode 100644 src/orcapod/pipeline/__init__.py rename src/orcapod/{store => stores}/__init__.py (100%) rename src/orcapod/{store => stores}/arrow_data_stores.py (98%) create mode 100644 src/orcapod/stores/delta_table_arrow_data_store.py rename src/orcapod/{store => stores}/dict_data_stores.py (95%) rename src/orcapod/{store => stores}/file_ops.py (100%) rename src/orcapod/{store => stores}/optimized_memory_store.py (100%) rename src/orcapod/{store => stores}/safe_dir_data_store.py (100%) rename src/orcapod/{store => stores}/transfer_data_store.py (90%) rename src/orcapod/{store => stores}/types.py (79%) create mode 100644 tests/test_hashing/test_legacy_composite_hasher.py diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index db457e9..ad00035 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,11 +1,12 @@ from .core import operators, sources, streams from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator -from . import hashing, store +from . 
import hashing, stores from .core.operators import Join, MapPackets, MapTags, packet, tag from .core.pod import FunctionPod, function_pod from .core.sources import GlobSource -from .store import DirDataStore, SafeDirDataStore +from .stores import DirDataStore, SafeDirDataStore from .core.tracker import GraphTracker +from .pipeline import Pipeline DEFAULT_TRACKER = GraphTracker() DEFAULT_TRACKER.activate() @@ -13,7 +14,7 @@ __all__ = [ "hashing", - "store", + "stores", "pod", "operators", "streams", @@ -31,4 +32,5 @@ "DEFAULT_TRACKER", "SyncStreamFromLists", "SyncStreamFromGenerator", + "Pipeline", ] diff --git a/src/orcapod/core/pod_legacy.py b/src/orcapod/core/pod_legacy.py index 32c8efb..18099c6 100644 --- a/src/orcapod/core/pod_legacy.py +++ b/src/orcapod/core/pod_legacy.py @@ -16,7 +16,7 @@ from orcapod.core.base import Kernel from orcapod.core.operators import Join from orcapod.core.streams import SyncStream, SyncStreamFromGenerator -from orcapod.store import DataStore, NoOpDataStore +from orcapod.stores import DataStore, NoOpDataStore logger = logging.getLogger(__name__) diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index 2bdff2b..7aaf11b 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -17,22 +17,22 @@ ) from .types import ( FileContentHasher, - PacketHasher, + LegacyPacketHasher, ArrowHasher, ObjectHasher, StringCacher, FunctionInfoExtractor, - CompositeFileHasher, + LegacyCompositeFileHasher, ) from .content_identifiable import ContentIdentifiableBase __all__ = [ "FileContentHasher", - "PacketHasher", + "LegacyPacketHasher", "ArrowHasher", "StringCacher", "ObjectHasher", - "CompositeFileHasher", + "LegacyCompositeFileHasher", "FunctionInfoExtractor", "hash_file", "hash_pathset", diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 61539b5..8ba7c0b 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -1,7 +1,7 @@ # A collection of utility function that provides a "default" implementation of hashers. # This is often used as the fallback hasher in the library code. 
from orcapod.hashing.types import ( - CompositeFileHasher, + LegacyCompositeFileHasher, ArrowHasher, FileContentHasher, StringCacher, @@ -36,7 +36,7 @@ def get_default_arrow_hasher( return arrow_hasher -def get_default_composite_file_hasher(with_cache=True) -> CompositeFileHasher: +def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHasher: if with_cache: # use unlimited caching string_cacher = InMemoryCacher(max_size=None) @@ -44,7 +44,7 @@ def get_default_composite_file_hasher(with_cache=True) -> CompositeFileHasher: return LegacyPathLikeHasherFactory.create_basic_legacy_composite() -def get_default_composite_file_hasher_with_cacher(cacher=None) -> CompositeFileHasher: +def get_default_composite_file_hasher_with_cacher(cacher=None) -> LegacyCompositeFileHasher: if cacher is None: cacher = InMemoryCacher(max_size=None) return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index cd12e80..64f48f8 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -2,11 +2,12 @@ from orcapod.hashing.hash_utils import hash_file from orcapod.hashing.types import ( FileContentHasher, - PathSetHasher, StringCacher, - CompositeFileHasher, + LegacyFileHasher, + LegacyPathSetHasher, + LegacyCompositeFileHasher, ) -from orcapod.types import Packet, PathLike, PathSet +from orcapod.types import PacketLike, PathLike, PathSet class BasicFileHasher: @@ -51,7 +52,7 @@ def hash_file(self, file_path: PathLike) -> bytes: # ----------------Legacy implementations for backward compatibility----------------- -class LegacyFileHasher: +class LegacyDefaultFileHasher: def __init__( self, algorithm: str = "sha256", @@ -60,45 +61,65 @@ def __init__( self.algorithm = algorithm self.buffer_size = buffer_size - def hash_file(self, file_path: PathLike) -> bytes: - return bytes.fromhex( - legacy_core.hash_file( - file_path, algorithm=self.algorithm, buffer_size=self.buffer_size - ), + def hash_file(self, file_path: PathLike) -> str: + return legacy_core.hash_file( + file_path, algorithm=self.algorithm, buffer_size=self.buffer_size ) -class LegacyPathsetHasher: + +class LegacyCachedFileHasher: + """File hasher with caching.""" + + def __init__( + self, + file_hasher: LegacyFileHasher, + string_cacher: StringCacher, + ): + self.file_hasher = file_hasher + self.string_cacher = string_cacher + + def hash_file(self, file_path: PathLike) -> str: + cache_key = f"file:{file_path}" + cached_value = self.string_cacher.get_cached(cache_key) + if cached_value is not None: + return cached_value + + value = self.file_hasher.hash_file(file_path) + self.string_cacher.set_cached(cache_key, value) + return value + + + +class LegacyDefaultPathsetHasher: """Default pathset hasher that composes file hashing.""" def __init__( self, - file_hasher: FileContentHasher, + file_hasher: LegacyFileHasher, char_count: int | None = 32, ): self.file_hasher = file_hasher self.char_count = char_count def _hash_file_to_hex(self, file_path: PathLike) -> str: - return self.file_hasher.hash_file(file_path).hex() + return self.file_hasher.hash_file(file_path) - def hash_pathset(self, pathset: PathSet) -> bytes: + def hash_pathset(self, pathset: PathSet) -> str: """Hash a pathset using the injected file hasher.""" - return bytes.fromhex( - legacy_core.hash_pathset( + return legacy_core.hash_pathset( pathset, char_count=self.char_count, - file_hasher=self._hash_file_to_hex, # Inject the method + 
file_hasher=self.file_hasher.hash_file, # Inject the method ) - ) -class LegacyPacketHasher: +class LegacyDefaultPacketHasher: """Default packet hasher that composes pathset hashing.""" def __init__( self, - pathset_hasher: PathSetHasher, + pathset_hasher: LegacyPathSetHasher, char_count: int | None = 32, prefix: str = "", ): @@ -107,9 +128,9 @@ def __init__( self.prefix = prefix def _hash_pathset_to_hex(self, pathset: PathSet): - return self.pathset_hasher.hash_pathset(pathset).hex() + return self.pathset_hasher.hash_pathset(pathset) - def hash_packet(self, packet: Packet) -> str: + def hash_packet(self, packet: PacketLike) -> str: """Hash a packet using the injected pathset hasher.""" hash_str = legacy_core.hash_packet( packet, @@ -121,28 +142,28 @@ def hash_packet(self, packet: Packet) -> str: # Convenience composite implementation -class LegacyCompositeFileHasher: +class LegacyDefaultCompositeFileHasher: """Composite hasher that implements all interfaces.""" def __init__( self, - file_hasher: FileContentHasher, + file_hasher: LegacyFileHasher, char_count: int | None = 32, packet_prefix: str = "", ): self.file_hasher = file_hasher - self.pathset_hasher = LegacyPathsetHasher(self.file_hasher, char_count) - self.packet_hasher = LegacyPacketHasher( + self.pathset_hasher = LegacyDefaultPathsetHasher(self.file_hasher, char_count) + self.packet_hasher = LegacyDefaultPacketHasher( self.pathset_hasher, char_count, packet_prefix ) - def hash_file(self, file_path: PathLike) -> bytes: + def hash_file(self, file_path: PathLike) -> str: return self.file_hasher.hash_file(file_path) - def hash_pathset(self, pathset: PathSet) -> bytes: + def hash_pathset(self, pathset: PathSet) -> str: return self.pathset_hasher.hash_pathset(pathset) - def hash_packet(self, packet: Packet) -> str: + def hash_packet(self, packet: PacketLike) -> str: return self.packet_hasher.hash_packet(packet) @@ -155,11 +176,11 @@ def create_basic_legacy_composite( algorithm: str = "sha256", buffer_size: int = 65536, char_count: int | None = 32, - ) -> CompositeFileHasher: + ) -> LegacyCompositeFileHasher: """Create a basic composite hasher.""" - file_hasher = LegacyFileHasher(algorithm, buffer_size) + file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) # use algorithm as the prefix for the packet hasher - return LegacyCompositeFileHasher( + return LegacyDefaultCompositeFileHasher( file_hasher, char_count, packet_prefix=algorithm ) @@ -169,13 +190,26 @@ def create_cached_legacy_composite( algorithm: str = "sha256", buffer_size: int = 65536, char_count: int | None = 32, - ) -> CompositeFileHasher: + ) -> LegacyCompositeFileHasher: """Create a composite hasher with file caching.""" - basic_file_hasher = LegacyFileHasher(algorithm, buffer_size) - cached_file_hasher = CachedFileHasher(basic_file_hasher, string_cacher) - return LegacyCompositeFileHasher( + basic_file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) + cached_file_hasher = LegacyCachedFileHasher(basic_file_hasher, string_cacher) + return LegacyDefaultCompositeFileHasher( cached_file_hasher, char_count, packet_prefix=algorithm ) + + @staticmethod + def create_legacy_file_hasher( + string_cacher: StringCacher | None = None, + algorithm: str = "sha256", + buffer_size: int = 65536, + ) -> LegacyFileHasher: + """Create just a file hasher, optionally with caching.""" + default_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) + if string_cacher is None: + return default_hasher + else: + return LegacyCachedFileHasher(default_hasher, string_cacher) 
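As an illustrative usage sketch (not from the patch; the InMemoryCacher import path is an assumption, and create_legacy_file_hasher is assumed to live on LegacyPathLikeHasherFactory alongside the other factory methods): the factory layers caching on top of the plain legacy file hasher, so repeated hashes of the same path are served from the cacher under the key "file:<path>".

from orcapod.hashing.string_cachers import InMemoryCacher  # assumed module path

cacher = InMemoryCacher(max_size=None)
file_hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher(cacher)

first = file_hasher.hash_file("data/input.csv")   # computed, then cached as "file:data/input.csv"
second = file_hasher.hash_file("data/input.csv")  # returned from the cache
assert first == second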
@staticmethod def create_file_hasher( diff --git a/src/orcapod/hashing/legacy_core.py b/src/orcapod/hashing/legacy_core.py index cfe9c56..a5b4319 100644 --- a/src/orcapod/hashing/legacy_core.py +++ b/src/orcapod/hashing/legacy_core.py @@ -33,7 +33,7 @@ import xxhash -from orcapod.types import Packet, PathSet +from orcapod.types import Packet, PacketLike, PathSet from orcapod.utils.name import find_noncolliding_name # Configure logging with __name__ for proper hierarchy @@ -681,7 +681,7 @@ def hash_packet_with_psh( def hash_packet( - packet: Packet, + packet: PacketLike, algorithm: str = "sha256", buffer_size: int = 65536, char_count: Optional[int] = 32, diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index 10ed267..c7d79da 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -5,7 +5,7 @@ from typing import Any, Protocol, runtime_checkable import uuid -from orcapod.types import Packet, PathLike, PathSet, TypeSpec +from orcapod.types import PacketLike, PathLike, PathSet, TypeSpec import pyarrow as pa @@ -85,21 +85,6 @@ class FileContentHasher(Protocol): def hash_file(self, file_path: PathLike) -> bytes: ... -# Higher-level operations that compose file hashing -@runtime_checkable -class PathSetHasher(Protocol): - """Protocol for hashing pathsets (files, directories, collections).""" - - def hash_pathset(self, pathset: PathSet) -> bytes: ... - - -@runtime_checkable -class PacketHasher(Protocol): - """Protocol for hashing packets.""" - - def hash_packet(self, packet: Packet) -> str: ... - - @runtime_checkable class ArrowHasher(Protocol): """Protocol for hashing arrow packets.""" @@ -116,14 +101,6 @@ def set_cached(self, cache_key: str, value: str) -> None: ... def clear_cache(self) -> None: ... -# Combined interface for convenience (optional) -@runtime_checkable -class CompositeFileHasher(FileContentHasher, PathSetHasher, PacketHasher, Protocol): - """Combined interface for all file-related hashing operations.""" - - pass - - # Function hasher protocol @runtime_checkable class FunctionInfoExtractor(Protocol): @@ -153,3 +130,38 @@ def hash_column( def set_cacher(self, cacher: StringCacher) -> None: """Add a string cacher for caching hash values.""" pass + + +#---------------Legacy implementations and protocols to be deprecated--------------------- + + +@runtime_checkable +class LegacyFileHasher(Protocol): + """Protocol for file-related hashing.""" + + def hash_file(self, file_path: PathLike) -> str: ... + + +# Higher-level operations that compose file hashing +@runtime_checkable +class LegacyPathSetHasher(Protocol): + """Protocol for hashing pathsets (files, directories, collections).""" + + def hash_pathset(self, pathset: PathSet) -> str: ... + + +@runtime_checkable +class LegacyPacketHasher(Protocol): + """Protocol for hashing packets.""" + + def hash_packet(self, packet: PacketLike) -> str: ... 
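For illustration only (this class is hypothetical and not part of the patch): because the legacy protocols above are declared runtime_checkable, any object exposing the matching method passes an isinstance check, which is what lets callers swap in custom hashers.

class ToyPacketHasher:
    """Hypothetical hasher used only to illustrate the protocol shape."""

    def hash_packet(self, packet: PacketLike) -> str:
        # not a real content hash; just a stand-in for the sketch
        return "|".join(f"{k}={v}" for k, v in sorted(packet.items()))

assert isinstance(ToyPacketHasher(), LegacyPacketHasher)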
+ + +# Combined interface for convenience (optional) +@runtime_checkable +class LegacyCompositeFileHasher(LegacyFileHasher, LegacyPathSetHasher, LegacyPacketHasher, Protocol): + """Combined interface for all file-related hashing operations.""" + + pass + + diff --git a/src/orcapod/pipeline/__init__.py b/src/orcapod/pipeline/__init__.py new file mode 100644 index 0000000..2bba49b --- /dev/null +++ b/src/orcapod/pipeline/__init__.py @@ -0,0 +1,5 @@ +from .pipeline import Pipeline + +__all__ = [ + "Pipeline", +] \ No newline at end of file diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index 74eb998..864f649 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -14,7 +14,7 @@ from orcapod.hashing import hash_to_hex from orcapod.core.tracker import GraphTracker -from orcapod.store import ArrowDataStore +from orcapod.stores import ArrowDataStore logger = logging.getLogger(__name__) @@ -33,15 +33,17 @@ class Pipeline(GraphTracker): def __init__( self, - name: str, - results_store: ArrowDataStore, + name: str | tuple[str, ...], pipeline_store: ArrowDataStore, + results_store: ArrowDataStore, auto_compile: bool = True, ) -> None: super().__init__() - self.name = name or f"pipeline_{id(self)}" - self.results_store = results_store + if not isinstance(name, tuple): + name = (name,) + self.name = name self.pipeline_store = pipeline_store + self.results_store = results_store self.labels_to_nodes = {} self.auto_compile = auto_compile self._dirty = False @@ -92,8 +94,9 @@ def wrap_invocation(self, kernel: Kernel, input_nodes: Collection[Node]) -> Node input_nodes, output_store=self.results_store, tag_store=self.pipeline_store, + store_path_prefix=self.name, ) - return KernelNode(kernel, input_nodes, output_store=self.pipeline_store) + return KernelNode(kernel, input_nodes, output_store=self.pipeline_store, store_path_prefix=self.name) def compile(self): import networkx as nx diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/wrappers.py index e999714..720609e 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/wrappers.py @@ -1,6 +1,6 @@ from orcapod.core.pod import Pod, FunctionPod from orcapod.core import SyncStream, Source, Kernel -from orcapod.store import ArrowDataStore +from orcapod.stores import ArrowDataStore from orcapod.types import Tag, Packet, PacketLike, TypeSpec, default_registry from orcapod.types.typespec_utils import get_typespec_from_dict, union_typespecs, extract_function_typespecs from orcapod.types.semantic_type_registry import SemanticTypeRegistry @@ -167,6 +167,7 @@ def __init__( kernel: Kernel, input_streams: Collection[SyncStream], output_store: ArrowDataStore, + store_path_prefix: tuple[str, ...] 
| None = None, kernel_hasher: ObjectHasher | None = None, arrow_packet_hasher: ArrowHasher | None = None, packet_type_registry: SemanticTypeRegistry | None = None, @@ -175,6 +176,7 @@ def __init__( super().__init__(kernel, input_streams, **kwargs) self.output_store = output_store + self.store_path_prefix = store_path_prefix or () # These are configurable but are not expected to be modified except for special circumstances if kernel_hasher is None: @@ -214,7 +216,7 @@ def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): self.update_cached_values() def update_cached_values(self): - self.source_info = self.label, self.kernel_hasher.hash_to_hex(self.kernel) + self.source_info = self.store_path_prefix + (self.label, self.kernel_hasher.hash_to_hex(self.kernel)) self.tag_keys, self.packet_keys = self.keys(trigger_run=False) self.tag_typespec, self.packet_typespec = self.types(trigger_run=False) if self.tag_typespec is None or self.packet_typespec is None: @@ -270,9 +272,9 @@ def post_call(self, tag: Tag, packet: Packet) -> None: output_table = self.output_converter.from_python_packet_to_arrow_table(merged_info) # TODO: revisit this logic output_id = self.arrow_hasher.hash_table(output_table) - if not self.output_store.get_record(*self.source_info, output_id): + if not self.output_store.get_record(self.source_info, output_id): self.output_store.add_record( - *self.source_info, + self.source_info, output_id, output_table, ) @@ -287,7 +289,7 @@ def output_iterator_completion_hook(self) -> None: @property def lazy_df(self) -> pl.LazyFrame | None: - return self.output_store.get_all_records_as_polars(*self.source_info) + return self.output_store.get_all_records_as_polars(self.source_info) @property def df(self) -> pl.DataFrame | None: @@ -345,6 +347,7 @@ def __init__( output_store: ArrowDataStore, tag_store: ArrowDataStore | None = None, label: str | None = None, + store_path_prefix: tuple[str, ...] 
| None = None, skip_memoization_lookup: bool = False, skip_memoization: bool = False, skip_tag_record: bool = False, @@ -361,6 +364,7 @@ def __init__( error_handling=error_handling, **kwargs, ) + self.store_path_prefix = store_path_prefix or () self.output_store = output_store self.tag_store = tag_store @@ -502,9 +506,9 @@ def _add_pipeline_record_with_packet_key(self, tag: Tag, packet_key: str, packet # TODO: add error handling # check if record already exists: - retrieved_table = self.tag_store.get_record(*self.source_info, entry_hash) + retrieved_table = self.tag_store.get_record(self.source_info, entry_hash) if retrieved_table is None: - self.tag_store.add_record(*self.source_info, entry_hash, table) + self.tag_store.add_record(self.source_info, entry_hash, table) return tag @@ -523,8 +527,7 @@ def _retrieve_memoized_with_packet_key(self, packet_key: str) -> Packet | None: """ logger.debug(f"Retrieving memoized packet with key {packet_key}") arrow_table = self.output_store.get_record( - self.function_pod.function_name, - self.function_pod_hash, + self.source_info, packet_key, ) if arrow_table is None: @@ -560,7 +563,7 @@ def _memoize_with_packet_key( # consider simpler alternative packets = self.output_converter.from_arrow_table_to_python_packets( self.output_store.add_record( - *self.source_info, + self.source_info, packet_key, self.output_converter.from_python_packet_to_arrow_table(output_packet), ) @@ -622,12 +625,12 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: return tag, output_packet def get_all_outputs(self) -> pl.LazyFrame | None: - return self.output_store.get_all_records_as_polars(*self.source_info) + return self.output_store.get_all_records_as_polars(self.source_info) def get_all_tags(self, with_packet_id: bool = False) -> pl.LazyFrame | None: if self.tag_store is None: raise ValueError("Tag store is not set, no tag record can be retrieved") - data = self.tag_store.get_all_records_as_polars(*self.source_info) + data = self.tag_store.get_all_records_as_polars(self.source_info) if not with_packet_id: return data.drop("__packet_key") if data is not None else None return data @@ -640,11 +643,11 @@ def get_all_entries_with_tags(self, keep_hidden_fields: bool = False) -> pl.Lazy if self.tag_store is None: raise ValueError("Tag store is not set, no tag record can be retrieved") - tag_records = self.tag_store.get_all_records_as_polars(*self.source_info) + tag_records = self.tag_store.get_all_records_as_polars(self.source_info) if tag_records is None: return None result_packets = self.output_store.get_records_by_ids_as_polars( - *self.source_info, + self.source_info, tag_records.collect()["__packet_key"], preserve_input_order=True, ) diff --git a/src/orcapod/store/__init__.py b/src/orcapod/stores/__init__.py similarity index 100% rename from src/orcapod/store/__init__.py rename to src/orcapod/stores/__init__.py diff --git a/src/orcapod/store/arrow_data_stores.py b/src/orcapod/stores/arrow_data_stores.py similarity index 98% rename from src/orcapod/store/arrow_data_stores.py rename to src/orcapod/stores/arrow_data_stores.py index e2c1376..2608cbc 100644 --- a/src/orcapod/store/arrow_data_stores.py +++ b/src/orcapod/stores/arrow_data_stores.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime, timedelta import logging -from orcapod.store.types import DuplicateError +from orcapod.stores.types import DuplicateError # Module-level logger logger = logging.getLogger(__name__) @@ -24,30 +24,30 @@ def __init__(self): 
logger.info("Initialized MockArrowDataStore") def add_record( - self, source_name: str, source_id: str, entry_id: str, arrow_data: pa.Table + self, source_pathh: tuple[str, ...], source_id: str, entry_id: str, arrow_data: pa.Table ) -> pa.Table: """Add a record to the mock store.""" return arrow_data def get_record( - self, source_name: str, source_id: str, entry_id: str + self, source_path: tuple[str, ...], source_id: str, entry_id: str ) -> pa.Table | None: """Get a specific record.""" return None - def get_all_records(self, source_name: str, source_id: str) -> pa.Table | None: + def get_all_records(self, source_path: tuple[str, ...], source_id: str) -> pa.Table | None: """Retrieve all records for a given source as a single table.""" return None def get_all_records_as_polars( - self, source_name: str, source_id: str + self, source_path: tuple[str, ...], source_id: str ) -> pl.LazyFrame | None: """Retrieve all records for a given source as a single Polars LazyFrame.""" return None def get_records_by_ids( self, - source_name: str, + source_path: tuple[str,...], source_id: str, entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, @@ -77,7 +77,7 @@ def get_records_by_ids( def get_records_by_ids_as_polars( self, - source_name: str, + source_path: tuple[str,...], source_id: str, entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, @@ -115,14 +115,13 @@ def __init__(self, duplicate_entry_behavior: str = "error"): f"Initialized InMemoryArrowDataStore with duplicate_entry_behavior='{duplicate_entry_behavior}'" ) - def _get_source_key(self, source_name: str, source_id: str) -> str: + def _get_source_key(self, source_path: tuple[str, ...]) -> str: """Generate key for source storage.""" - return f"{source_name}:{source_id}" + return "/".join(source_path) def add_record( self, - source_name: str, - source_id: str, + source_path: tuple[str, ...], entry_id: str, arrow_data: pa.Table, ignore_duplicate: bool = False, @@ -142,7 +141,7 @@ def add_record( Raises: ValueError: If entry_id already exists and duplicate_entry_behavior is 'error' """ - source_key = self._get_source_key(source_name, source_id) + source_key = self._get_source_key(source_path) # Initialize source if it doesn't exist if source_key not in self._in_memory_store: @@ -154,7 +153,7 @@ def add_record( if entry_id in local_data: if not ignore_duplicate and self.duplicate_entry_behavior == "error": raise ValueError( - f"Entry '{entry_id}' already exists in {source_name}/{source_id}. " + f"Entry '{entry_id}' already exists in {source_key}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." 
) @@ -166,18 +165,18 @@ def add_record( return arrow_data def get_record( - self, source_name: str, source_id: str, entry_id: str + self, source_path: tuple[str, ...], entry_id: str ) -> pa.Table | None: """Get a specific record.""" - source_key = self._get_source_key(source_name, source_id) + source_key = self._get_source_key(source_path) local_data = self._in_memory_store.get(source_key, {}) return local_data.get(entry_id) def get_all_records( - self, source_name: str, source_id: str, add_entry_id_column: bool | str = False + self, source_path: tuple[str, ...], add_entry_id_column: bool | str = False ) -> pa.Table | None: """Retrieve all records for a given source as a single table.""" - source_key = self._get_source_key(source_name, source_id) + source_key = self._get_source_key(source_path) local_data = self._in_memory_store.get(source_key, {}) if not local_data: @@ -199,18 +198,17 @@ def get_all_records( return None def get_all_records_as_polars( - self, source_name: str, source_id: str + self, source_path: tuple[str, ...] ) -> pl.LazyFrame | None: """Retrieve all records for a given source as a single Polars LazyFrame.""" - all_records = self.get_all_records(source_name, source_id) + all_records = self.get_all_records(source_path) if all_records is None: return None return pl.LazyFrame(all_records) def get_records_by_ids( self, - source_name: str, - source_id: str, + source_path: tuple[str, ...], entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, @@ -253,7 +251,7 @@ def get_records_by_ids( f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" ) - source_key = self._get_source_key(source_name, source_id) + source_key = self._get_source_key(source_path) local_data = self._in_memory_store.get(source_key, {}) if not local_data: @@ -340,8 +338,7 @@ def get_records_by_ids( def get_records_by_ids_as_polars( self, - source_name: str, - source_id: str, + source_path: tuple[str, ...], entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, @@ -368,7 +365,7 @@ def get_records_by_ids_as_polars( """ # Get Arrow result and convert to Polars arrow_result = self.get_records_by_ids( - source_name, source_id, entry_ids, add_entry_id_column, preserve_input_order + source_path, entry_ids, add_entry_id_column, preserve_input_order ) if arrow_result is None: @@ -464,7 +461,7 @@ def load_from_parquet(self, base_path: str | Path) -> None: continue source_id = source_id_dir.name - source_key = self._get_source_key(source_name, source_id) + source_key = self._get_source_key((source_name, source_id)) # Look for Parquet files in this directory parquet_files = list(source_id_dir.glob("*.parquet")) diff --git a/src/orcapod/stores/delta_table_arrow_data_store.py b/src/orcapod/stores/delta_table_arrow_data_store.py new file mode 100644 index 0000000..d4fcaf3 --- /dev/null +++ b/src/orcapod/stores/delta_table_arrow_data_store.py @@ -0,0 +1,559 @@ +import pyarrow as pa +import polars as pl +from pathlib import Path +from typing import Any, Union +import logging +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError + +# Module-level logger +logger = logging.getLogger(__name__) + + +class DeltaTableArrowDataStore: + """ + Delta Table-based Arrow data store with flexible hierarchical path support. 
+ + Uses tuple-based source paths for robust parameter handling: + - ("source_name", "source_id") -> source_name/source_id/ + - ("org", "project", "dataset") -> org/project/dataset/ + - ("year", "month", "day", "experiment") -> year/month/day/experiment/ + """ + + def __init__( + self, + base_path: str | Path, + duplicate_entry_behavior: str = "error", + create_base_path: bool = True, + max_hierarchy_depth: int = 10 + ): + """ + Initialize the DeltaTableArrowDataStore. + + Args: + base_path: Base directory path where Delta tables will be stored + duplicate_entry_behavior: How to handle duplicate entry_ids: + - 'error': Raise ValueError when entry_id already exists + - 'overwrite': Replace existing entry with new data + create_base_path: Whether to create the base path if it doesn't exist + max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) + """ + # Validate duplicate behavior + if duplicate_entry_behavior not in ["error", "overwrite"]: + raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") + + self.duplicate_entry_behavior = duplicate_entry_behavior + self.base_path = Path(base_path) + self.max_hierarchy_depth = max_hierarchy_depth + + if create_base_path: + self.base_path.mkdir(parents=True, exist_ok=True) + elif not self.base_path.exists(): + raise ValueError(f"Base path {self.base_path} does not exist and create_base_path=False") + + # Cache for Delta tables to avoid repeated initialization + self._delta_table_cache: dict[str, DeltaTable] = {} + + logger.info( + f"Initialized DeltaTableArrowDataStore at {self.base_path} " + f"with duplicate_entry_behavior='{duplicate_entry_behavior}'" + ) + + def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + """ + Validate source path components. 
+ + Args: + source_path: Tuple of path components + + Raises: + ValueError: If path is invalid + """ + if not source_path: + raise ValueError("Source path cannot be empty") + + if len(source_path) > self.max_hierarchy_depth: + raise ValueError(f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}") + + # Validate path components + for i, component in enumerate(source_path): + if not component or not isinstance(component, str): + raise ValueError(f"Source path component {i} is invalid: {repr(component)}") + + # Check for filesystem-unsafe characters + unsafe_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\0'] + if any(char in component for char in unsafe_chars): + raise ValueError(f"Source path component contains invalid characters: {repr(component)}") + + def _get_source_key(self, source_path: tuple[str, ...]) -> str: + """Generate cache key for source storage.""" + return "/".join(source_path) + + def _get_table_path(self, source_path: tuple[str, ...]) -> Path: + """Get the filesystem path for a given source path.""" + path = self.base_path + for component in source_path: + path = path / component + return path + + def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: + """Ensure the table has an __entry_id column.""" + if "__entry_id" not in arrow_data.column_names: + # Add entry_id column at the beginning + key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, "__entry_id", key_array) + return arrow_data + + def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: + """Remove the __entry_id column if it exists.""" + if "__entry_id" in arrow_data.column_names: + column_names = arrow_data.column_names + indices_to_keep = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] + arrow_data = arrow_data.select(indices_to_keep) + return arrow_data + + def _handle_entry_id_column( + self, + arrow_data: pa.Table, + add_entry_id_column: bool | str = False + ) -> pa.Table: + """ + Handle entry_id column based on add_entry_id_column parameter. + + Args: + arrow_data: Arrow table with __entry_id column + add_entry_id_column: Control entry ID column inclusion: + - False: Remove __entry_id column + - True: Keep __entry_id column as is + - str: Rename __entry_id column to custom name + """ + if add_entry_id_column is False: + # Remove the __entry_id column + return self._remove_entry_id_column(arrow_data) + elif isinstance(add_entry_id_column, str): + # Rename __entry_id to custom name + if "__entry_id" in arrow_data.column_names: + schema = arrow_data.schema + new_names = [ + add_entry_id_column if name == "__entry_id" else name + for name in schema.names + ] + return arrow_data.rename_columns(new_names) + # If add_entry_id_column is True, keep __entry_id as is + return arrow_data + + def add_record( + self, + source_path: tuple[str, ...], + entry_id: str, + arrow_data: pa.Table, + ignore_duplicate: bool = False, + ) -> pa.Table: + """ + Add a record to the Delta table. 
+ + Args: + source_path: Tuple of path components (e.g., ("org", "project", "dataset")) + entry_id: Unique identifier for this record + arrow_data: The Arrow table data to store + ignore_duplicate: If True, ignore duplicate entry error + + Returns: + The Arrow table data that was stored + + Raises: + ValueError: If entry_id already exists and duplicate_entry_behavior is 'error' + """ + self._validate_source_path(source_path) + + table_path = self._get_table_path(source_path) + source_key = self._get_source_key(source_path) + + # Ensure directory exists + table_path.mkdir(parents=True, exist_ok=True) + + # Add entry_id column to the data + data_with_entry_id = self._ensure_entry_id_column(arrow_data, entry_id) + + # Check for existing entry if needed + if not ignore_duplicate and self.duplicate_entry_behavior == "error": + existing_record = self.get_record(source_path, entry_id) + if existing_record is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in {'/'.join(source_path)}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + + if self.duplicate_entry_behavior == "overwrite": + # Delete existing record if it exists, then append new one + try: + # First, delete existing record with this entry_id + delta_table.delete(f"__entry_id = '{entry_id}'") + logger.debug(f"Deleted existing record {entry_id} from {source_key}") + except Exception as e: + # If delete fails (e.g., record doesn't exist), that's fine + logger.debug(f"No existing record to delete for {entry_id}: {e}") + + # Append new record + write_deltalake( + str(table_path), + data_with_entry_id, + mode="append", + schema_mode="merge" + ) + + except TableNotFoundError: + # Table doesn't exist, create it + write_deltalake( + str(table_path), + data_with_entry_id, + mode="overwrite" + ) + logger.debug(f"Created new Delta table for {source_key}") + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + logger.debug(f"Added record {entry_id} to {source_key}") + return arrow_data + + def get_record( + self, source_path: tuple[str, ...], entry_id: str + ) -> pa.Table | None: + """ + Get a specific record by entry_id. + + Args: + source_path: Tuple of path components + entry_id: Unique identifier for the record + + Returns: + Arrow table for the record, or None if not found + """ + self._validate_source_path(source_path) + + table_path = self._get_table_path(source_path) + + try: + delta_table = DeltaTable(str(table_path)) + + # Query for the specific entry_id + result = delta_table.to_pyarrow_table( + filter=f"__entry_id = '{entry_id}'" + ) + + if len(result) == 0: + return None + + # Remove the __entry_id column before returning + return self._remove_entry_id_column(result) + + except TableNotFoundError: + return None + except Exception as e: + logger.error(f"Error getting record {entry_id} from {'/'.join(source_path)}: {e}") + return None + + def get_all_records( + self, source_path: tuple[str, ...], add_entry_id_column: bool | str = False + ) -> pa.Table | None: + """ + Retrieve all records for a given source path as a single table. 
+ + Args: + source_path: Tuple of path components + add_entry_id_column: Control entry ID column inclusion: + - False: Don't include entry ID column (default) + - True: Include entry ID column as "__entry_id" + - str: Include entry ID column with custom name + + Returns: + Arrow table containing all records, or None if no records found + """ + self._validate_source_path(source_path) + + table_path = self._get_table_path(source_path) + + try: + delta_table = DeltaTable(str(table_path)) + result = delta_table.to_pyarrow_table() + + if len(result) == 0: + return None + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(result, add_entry_id_column) + + except TableNotFoundError: + return None + except Exception as e: + logger.error(f"Error getting all records from {'/'.join(source_path)}: {e}") + return None + + def get_all_records_as_polars( + self, source_path: tuple[str, ...] + ) -> pl.LazyFrame | None: + """ + Retrieve all records for a given source path as a single Polars LazyFrame. + + Args: + source_path: Tuple of path components + + Returns: + Polars LazyFrame containing all records, or None if no records found + """ + all_records = self.get_all_records(source_path) + if all_records is None: + return None + return pl.LazyFrame(all_records) + + def get_records_by_ids( + self, + source_path: tuple[str, ...], + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + ) -> pa.Table | None: + """ + Retrieve records by entry IDs as a single table. + + Args: + source_path: Tuple of path components + entry_ids: Entry IDs to retrieve + add_entry_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Arrow table containing all found records, or None if no records found + """ + self._validate_source_path(source_path) + + # Convert input to list of strings for consistency + if isinstance(entry_ids, list): + if not entry_ids: + return None + entry_ids_list = entry_ids + elif isinstance(entry_ids, pl.Series): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_list() + elif isinstance(entry_ids, pa.Array): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_pylist() + else: + raise TypeError( + f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + ) + + table_path = self._get_table_path(source_path) + + try: + delta_table = DeltaTable(str(table_path)) + + # Create filter for the entry IDs - escape single quotes in IDs + escaped_ids = [id_.replace("'", "''") for id_ in entry_ids_list] + id_filter = " OR ".join([f"__entry_id = '{id_}'" for id_ in escaped_ids]) + + result = delta_table.to_pyarrow_table(filter=id_filter) + + if len(result) == 0: + return None + + if preserve_input_order: + # Need to reorder results and add nulls for missing entries + import pandas as pd + + df = result.to_pandas() + df = df.set_index('__entry_id') + + # Create a DataFrame with the desired order, filling missing with NaN + ordered_df = df.reindex(entry_ids_list) + + # Convert back to Arrow + result = pa.Table.from_pandas(ordered_df.reset_index()) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(result, add_entry_id_column) + + except TableNotFoundError: + return None + except Exception as e: + logger.error(f"Error getting records by IDs from {'/'.join(source_path)}: {e}") + return None + + def get_records_by_ids_as_polars( + 
self, + source_path: tuple[str, ...], + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + ) -> pl.LazyFrame | None: + """ + Retrieve records by entry IDs as a single Polars LazyFrame. + + Args: + source_path: Tuple of path components + entry_ids: Entry IDs to retrieve + add_entry_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Polars LazyFrame containing all found records, or None if no records found + """ + arrow_result = self.get_records_by_ids( + source_path, entry_ids, add_entry_id_column, preserve_input_order + ) + + if arrow_result is None: + return None + + # Convert to Polars LazyFrame + return pl.LazyFrame(arrow_result) + + # Additional utility methods + def list_sources(self) -> list[tuple[str, ...]]: + """ + List all available source paths. + + Returns: + List of source path tuples + """ + sources = [] + + def _scan_directory(current_path: Path, path_components: tuple[str, ...]): + """Recursively scan for Delta tables.""" + for item in current_path.iterdir(): + if not item.is_dir(): + continue + + new_path_components = path_components + (item.name,) + + # Check if this directory contains a Delta table + try: + DeltaTable(str(item)) + sources.append(new_path_components) + except TableNotFoundError: + # Not a Delta table, continue scanning subdirectories + if len(new_path_components) < self.max_hierarchy_depth: + _scan_directory(item, new_path_components) + + _scan_directory(self.base_path, ()) + return sources + + def delete_source(self, source_path: tuple[str, ...]) -> bool: + """ + Delete an entire source (all records for a source path). + + Args: + source_path: Tuple of path components + + Returns: + True if source was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + table_path = self._get_table_path(source_path) + source_key = self._get_source_key(source_path) + + if not table_path.exists(): + return False + + try: + # Remove from cache + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + + # Remove directory + import shutil + shutil.rmtree(table_path) + + logger.info(f"Deleted source {source_key}") + return True + + except Exception as e: + logger.error(f"Error deleting source {source_key}: {e}") + return False + + def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: + """ + Delete a specific record. 
+ + Args: + source_path: Tuple of path components + entry_id: ID of the record to delete + + Returns: + True if record was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + table_path = self._get_table_path(source_path) + + try: + delta_table = DeltaTable(str(table_path)) + + # Check if record exists + escaped_entry_id = entry_id.replace("'", "''") + existing = delta_table.to_pyarrow_table(filter=f"__entry_id = '{escaped_entry_id}'") + if len(existing) == 0: + return False + + # Delete the record + delta_table.delete(f"__entry_id = '{escaped_entry_id}'") + + # Update cache + source_key = self._get_source_key(source_path) + self._delta_table_cache[source_key] = delta_table + + logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") + return True + + except TableNotFoundError: + return False + except Exception as e: + logger.error(f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}") + return False + + def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: + """ + Get metadata information about a Delta table. + + Args: + source_path: Tuple of path components + + Returns: + Dictionary with table metadata, or None if table doesn't exist + """ + self._validate_source_path(source_path) + + table_path = self._get_table_path(source_path) + + try: + delta_table = DeltaTable(str(table_path)) + + # Get basic info + schema = delta_table.schema() + history = delta_table.history() + + return { + "path": str(table_path), + "source_path": source_path, + "schema": schema, + "version": delta_table.version(), + "num_files": len(delta_table.files()), + "history_length": len(history), + "latest_commit": history[0] if history else None, + } + + except TableNotFoundError: + return None + except Exception as e: + logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") + return None \ No newline at end of file diff --git a/src/orcapod/store/dict_data_stores.py b/src/orcapod/stores/dict_data_stores.py similarity index 95% rename from src/orcapod/store/dict_data_stores.py rename to src/orcapod/stores/dict_data_stores.py index c41dd55..edb44e5 100644 --- a/src/orcapod/store/dict_data_stores.py +++ b/src/orcapod/stores/dict_data_stores.py @@ -5,10 +5,10 @@ from pathlib import Path from orcapod.hashing import hash_packet +from orcapod.hashing.types import LegacyPacketHasher from orcapod.hashing.defaults import get_default_composite_file_hasher -from orcapod.hashing.types import PacketHasher -from orcapod.store.types import DataStore -from orcapod.types import Packet +from orcapod.stores.types import DataStore +from orcapod.types import Packet, PacketLike logger = logging.getLogger(__name__) @@ -30,15 +30,15 @@ def memoize( self, function_name: str, function_hash: str, - packet: Packet, - output_packet: Packet, + packet: PacketLike, + output_packet: PacketLike, overwrite: bool = False, - ) -> Packet: + ) -> PacketLike: return output_packet def retrieve_memoized( - self, function_name: str, function_hash: str, packet: Packet - ) -> Packet | None: + self, function_name: str, function_hash: str, packet: PacketLike + ) -> PacketLike | None: return None @@ -46,7 +46,7 @@ class DirDataStore(DataStore): def __init__( self, store_dir: str | PathLike = "./pod_data", - packet_hasher: PacketHasher | None = None, + packet_hasher: LegacyPacketHasher | None = None, copy_files=True, preserve_filename=True, overwrite=False, @@ -71,9 +71,9 @@ def memoize( self, function_name: str, function_hash: str, - packet: Packet, - 
output_packet: Packet, - ) -> Packet: + packet: PacketLike, + output_packet: PacketLike, + ) -> PacketLike: if self.legacy_mode: packet_hash = hash_packet(packet, algorithm=self.legacy_algorithm) else: @@ -139,7 +139,7 @@ def memoize( return retrieved_output_packet def retrieve_memoized( - self, function_name: str, function_hash: str, packet: Packet + self, function_name: str, function_hash: str, packet: PacketLike ) -> Packet | None: if self.legacy_mode: packet_hash = hash_packet(packet, algorithm=self.legacy_algorithm) diff --git a/src/orcapod/store/file_ops.py b/src/orcapod/stores/file_ops.py similarity index 100% rename from src/orcapod/store/file_ops.py rename to src/orcapod/stores/file_ops.py diff --git a/src/orcapod/store/optimized_memory_store.py b/src/orcapod/stores/optimized_memory_store.py similarity index 100% rename from src/orcapod/store/optimized_memory_store.py rename to src/orcapod/stores/optimized_memory_store.py diff --git a/src/orcapod/store/safe_dir_data_store.py b/src/orcapod/stores/safe_dir_data_store.py similarity index 100% rename from src/orcapod/store/safe_dir_data_store.py rename to src/orcapod/stores/safe_dir_data_store.py diff --git a/src/orcapod/store/transfer_data_store.py b/src/orcapod/stores/transfer_data_store.py similarity index 90% rename from src/orcapod/store/transfer_data_store.py rename to src/orcapod/stores/transfer_data_store.py index c9a4e5d..0c8e215 100644 --- a/src/orcapod/store/transfer_data_store.py +++ b/src/orcapod/stores/transfer_data_store.py @@ -1,7 +1,7 @@ # Implements transfer data store that lets you transfer memoized packets between data stores. -from orcapod.store.types import DataStore -from orcapod.types import Packet +from orcapod.stores.types import DataStore +from orcapod.types import PacketLike class TransferDataStore(DataStore): @@ -14,7 +14,7 @@ def __init__(self, source_store: DataStore, target_store: DataStore) -> None: self.source_store = source_store self.target_store = target_store - def transfer(self, function_name: str, content_hash: str, packet: Packet) -> Packet: + def transfer(self, function_name: str, content_hash: str, packet: PacketLike) -> PacketLike: """ Transfer a memoized packet from the source store to the target store. """ @@ -29,8 +29,8 @@ def transfer(self, function_name: str, content_hash: str, packet: Packet) -> Pac ) def retrieve_memoized( - self, function_name: str, function_hash: str, packet: Packet - ) -> Packet | None: + self, function_name: str, function_hash: str, packet: PacketLike + ) -> PacketLike | None: """ Retrieve a memoized packet from the target store. """ @@ -57,9 +57,9 @@ def memoize( self, function_name: str, function_hash: str, - packet: Packet, - output_packet: Packet, - ) -> Packet: + packet: PacketLike, + output_packet: PacketLike, + ) -> PacketLike: """ Memoize a packet in the target store. """ diff --git a/src/orcapod/store/types.py b/src/orcapod/stores/types.py similarity index 79% rename from src/orcapod/store/types.py rename to src/orcapod/stores/types.py index 49d9a70..c588856 100644 --- a/src/orcapod/store/types.py +++ b/src/orcapod/stores/types.py @@ -1,6 +1,6 @@ from typing import Protocol, runtime_checkable -from orcapod.types import Tag, Packet +from orcapod.types import Tag, PacketLike import pyarrow as pa import polars as pl @@ -21,13 +21,13 @@ def memoize( self, function_name: str, function_hash: str, - packet: Packet, - output_packet: Packet, - ) -> Packet: ... + packet: PacketLike, + output_packet: PacketLike, + ) -> PacketLike: ... 
def retrieve_memoized( - self, function_name: str, function_hash: str, packet: Packet - ) -> Packet | None: ... + self, function_name: str, function_hash: str, packet: PacketLike + ) -> PacketLike | None: ... @runtime_checkable @@ -41,31 +41,29 @@ def __init__(self, *args, **kwargs) -> None: ... def add_record( self, - source_name: str, - source_id: str, + source_path: tuple[str, ...], entry_id: str, arrow_data: pa.Table, ignore_duplicate: bool = False, ) -> pa.Table: ... def get_record( - self, source_name: str, source_id: str, entry_id: str + self, source_path: tuple[str,...], entry_id: str ) -> pa.Table | None: ... - def get_all_records(self, source_name: str, source_id: str) -> pa.Table | None: + def get_all_records(self, source_path: tuple[str,...]) -> pa.Table | None: """Retrieve all records for a given source as a single table.""" ... def get_all_records_as_polars( - self, source_name: str, source_id: str + self, source_path: tuple[str,...] ) -> pl.LazyFrame | None: """Retrieve all records for a given source as a single Polars DataFrame.""" ... def get_records_by_ids( self, - source_name: str, - source_id: str, + source_path: tuple[str, ...], entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, @@ -75,8 +73,7 @@ def get_records_by_ids( def get_records_by_ids_as_polars( self, - source_name: str, - source_id: str, + source_path: tuple[str, ...], entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, diff --git a/tests/test_hashing/test_basic_composite_hasher.py b/tests/test_hashing/test_basic_composite_hasher.py index 2ef9cf6..f2da406 100644 --- a/tests/test_hashing/test_basic_composite_hasher.py +++ b/tests/test_hashing/test_basic_composite_hasher.py @@ -12,7 +12,7 @@ import pytest -from orcapod.hashing.file_hashers import PathLikeHasherFactory +from orcapod.hashing.file_hashers import LegacyPathLikeHasherFactory def load_hash_lut(): @@ -82,7 +82,7 @@ def verify_path_exists(rel_path): def test_default_file_hasher_file_hash_consistency(): """Test that DefaultFileHasher.hash_file produces consistent results for the sample files.""" hash_lut = load_hash_lut() - hasher = PathLikeHasherFactory.create_basic_composite() + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite() for filename, info in hash_lut.items(): rel_path = info["file"] @@ -104,7 +104,7 @@ def test_default_file_hasher_file_hash_consistency(): def test_default_file_hasher_pathset_hash_consistency(): """Test that DefaultFileHasher.hash_pathset produces consistent results for the sample pathsets.""" hash_lut = load_pathset_hash_lut() - hasher = PathLikeHasherFactory.create_basic_composite() + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite() for name, info in hash_lut.items(): paths_rel = info["paths"] @@ -137,7 +137,7 @@ def test_default_file_hasher_pathset_hash_consistency(): def test_default_file_hasher_packet_hash_consistency(): """Test that DefaultFileHasher.hash_packet produces consistent results for the sample packets.""" hash_lut = load_packet_hash_lut() - hasher = PathLikeHasherFactory.create_basic_composite() + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite() for name, info in hash_lut.items(): structure = info["structure"] @@ -181,7 +181,7 @@ def test_default_file_hasher_file_hash_algorithm_parameters(): for algorithm in algorithms: try: - hasher = PathLikeHasherFactory.create_basic_composite(algorithm=algorithm) + hasher = 
LegacyPathLikeHasherFactory.create_basic_legacy_composite(algorithm=algorithm) hash1 = hasher.hash_file(file_path) hash2 = hasher.hash_file(file_path) assert hash1 == hash2, f"Hash inconsistent for algorithm {algorithm}" @@ -193,7 +193,7 @@ def test_default_file_hasher_file_hash_algorithm_parameters(): buffer_sizes = [1024, 4096, 16384, 65536] for buffer_size in buffer_sizes: - hasher = PathLikeHasherFactory.create_basic_composite(buffer_size=buffer_size) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(buffer_size=buffer_size) hash1 = hasher.hash_file(file_path) hash2 = hasher.hash_file(file_path) assert hash1 == hash2, f"Hash inconsistent for buffer size {buffer_size}" @@ -222,7 +222,7 @@ def test_default_file_hasher_pathset_hash_algorithm_parameters(): for algorithm in algorithms: try: - hasher = PathLikeHasherFactory.create_basic_composite(algorithm=algorithm) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(algorithm=algorithm) hash1 = hasher.hash_pathset(pathset) hash2 = hasher.hash_pathset(pathset) assert hash1 == hash2, f"Hash inconsistent for algorithm {algorithm}" @@ -234,7 +234,7 @@ def test_default_file_hasher_pathset_hash_algorithm_parameters(): buffer_sizes = [1024, 4096, 16384, 65536] for buffer_size in buffer_sizes: - hasher = PathLikeHasherFactory.create_basic_composite(buffer_size=buffer_size) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(buffer_size=buffer_size) hash1 = hasher.hash_pathset(pathset) hash2 = hasher.hash_pathset(pathset) assert hash1 == hash2, f"Hash inconsistent for buffer size {buffer_size}" @@ -266,7 +266,7 @@ def test_default_file_hasher_packet_hash_algorithm_parameters(): for algorithm in algorithms: try: - hasher = PathLikeHasherFactory.create_basic_composite(algorithm=algorithm) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(algorithm=algorithm) hash1 = hasher.hash_packet(packet) hash2 = hasher.hash_packet(packet) @@ -285,7 +285,7 @@ def test_default_file_hasher_packet_hash_algorithm_parameters(): buffer_sizes = [1024, 4096, 16384, 65536] for buffer_size in buffer_sizes: - hasher = PathLikeHasherFactory.create_basic_composite(buffer_size=buffer_size) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(buffer_size=buffer_size) hash1 = hasher.hash_packet(packet) hash2 = hasher.hash_packet(packet) assert hash1 == hash2, f"Hash inconsistent for buffer size {buffer_size}" diff --git a/tests/test_hashing/test_cached_file_hasher.py b/tests/test_hashing/test_cached_file_hasher.py index 42c9380..d7514a5 100644 --- a/tests/test_hashing/test_cached_file_hasher.py +++ b/tests/test_hashing/test_cached_file_hasher.py @@ -14,7 +14,7 @@ CachedFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.hashing.types import FileHasher, StringCacher +from orcapod.hashing.types import LegacyFileHasher, StringCacher def verify_path_exists(rel_path): @@ -81,7 +81,7 @@ def test_cached_file_hasher_construction(): assert cached_hasher1.string_cacher == string_cacher # Test that CachedFileHasher implements FileHasher protocol - assert isinstance(cached_hasher1, FileHasher) + assert isinstance(cached_hasher1, LegacyFileHasher) def test_cached_file_hasher_file_caching(): @@ -136,7 +136,7 @@ def test_cached_file_hasher_call_counts(): try: # Mock the file_hasher to track calls - mock_file_hasher = MagicMock(spec=FileHasher) + mock_file_hasher = MagicMock(spec=LegacyFileHasher) mock_file_hasher.hash_file.return_value = "mock_file_hash" # Real cacher diff 
--git a/tests/test_hashing/test_hasher_factory.py b/tests/test_hashing/test_hasher_factory.py index afd2392..5776a2d 100644 --- a/tests/test_hashing/test_hasher_factory.py +++ b/tests/test_hashing/test_hasher_factory.py @@ -5,9 +5,9 @@ from pathlib import Path from orcapod.hashing.file_hashers import ( - BasicFileHasher, - CachedFileHasher, - PathLikeHasherFactory, + LegacyDefaultFileHasher, + LegacyCachedFileHasher, + LegacyPathLikeHasherFactory, ) from orcapod.hashing.string_cachers import FileCacher, InMemoryCacher @@ -17,11 +17,11 @@ class TestPathLikeHasherFactoryCreateFileHasher: def test_create_file_hasher_without_cacher(self): """Test creating a file hasher without string cacher (returns BasicFileHasher).""" - hasher = PathLikeHasherFactory.create_file_hasher() + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher() - # Should return BasicFileHasher - assert isinstance(hasher, BasicFileHasher) - assert not isinstance(hasher, CachedFileHasher) + # Should return LegacyDefaultFileHasher + assert isinstance(hasher, LegacyDefaultFileHasher) + assert not isinstance(hasher, LegacyCachedFileHasher) # Check default parameters assert hasher.algorithm == "sha256" @@ -30,60 +30,63 @@ def test_create_file_hasher_without_cacher(self): def test_create_file_hasher_with_cacher(self): """Test creating a file hasher with string cacher (returns CachedFileHasher).""" cacher = InMemoryCacher() - hasher = PathLikeHasherFactory.create_file_hasher(string_cacher=cacher) + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher(string_cacher=cacher) - # Should return CachedFileHasher - assert isinstance(hasher, CachedFileHasher) + # Should return LegacyCachedFileHasher + assert isinstance(hasher, LegacyCachedFileHasher) assert hasher.string_cacher is cacher - # The underlying file hasher should be BasicFileHasher with defaults - assert isinstance(hasher.file_hasher, BasicFileHasher) + # The underlying file hasher should be LegacyDefaultFileHasher with defaults + assert isinstance(hasher.file_hasher, LegacyDefaultFileHasher) assert hasher.file_hasher.algorithm == "sha256" assert hasher.file_hasher.buffer_size == 65536 def test_create_file_hasher_custom_algorithm(self): """Test creating file hasher with custom algorithm.""" # Without cacher - hasher = PathLikeHasherFactory.create_file_hasher(algorithm="md5") - assert isinstance(hasher, BasicFileHasher) + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher(algorithm="md5") + assert isinstance(hasher, LegacyDefaultFileHasher) assert hasher.algorithm == "md5" assert hasher.buffer_size == 65536 # With cacher cacher = InMemoryCacher() - hasher = PathLikeHasherFactory.create_file_hasher( + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=cacher, algorithm="sha512" ) - assert isinstance(hasher, CachedFileHasher) - assert hasher.file_hasher.algorithm == "sha512" - assert hasher.file_hasher.buffer_size == 65536 + assert isinstance(hasher, LegacyCachedFileHasher) + assert isinstance(hasher.file_hasher, LegacyDefaultFileHasher) + assert hasher.file_hasher.algorithm == "sha512" + assert hasher.file_hasher.buffer_size == 65536 def test_create_file_hasher_custom_buffer_size(self): """Test creating file hasher with custom buffer size.""" # Without cacher - hasher = PathLikeHasherFactory.create_file_hasher(buffer_size=32768) - assert isinstance(hasher, BasicFileHasher) + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher(buffer_size=32768) + assert isinstance(hasher, LegacyDefaultFileHasher) assert 
hasher.algorithm == "sha256" assert hasher.buffer_size == 32768 # With cacher cacher = InMemoryCacher() - hasher = PathLikeHasherFactory.create_file_hasher( + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=cacher, buffer_size=8192 ) - assert isinstance(hasher, CachedFileHasher) + assert isinstance(hasher, LegacyCachedFileHasher) + assert isinstance(hasher.file_hasher, LegacyDefaultFileHasher) assert hasher.file_hasher.algorithm == "sha256" assert hasher.file_hasher.buffer_size == 8192 def test_create_file_hasher_all_custom_parameters(self): """Test creating file hasher with all custom parameters.""" cacher = InMemoryCacher(max_size=500) - hasher = PathLikeHasherFactory.create_file_hasher( + hasher = LegacyPathLikeHasherFactory.create_file_hasher( string_cacher=cacher, algorithm="blake2b", buffer_size=16384 ) - assert isinstance(hasher, CachedFileHasher) + assert isinstance(hasher, LegacyCachedFileHasher) assert hasher.string_cacher is cacher + assert isinstance(hasher.file_hasher, LegacyDefaultFileHasher) assert hasher.file_hasher.algorithm == "blake2b" assert hasher.file_hasher.buffer_size == 16384 @@ -91,17 +94,17 @@ def test_create_file_hasher_different_cacher_types(self): """Test creating file hasher with different types of string cachers.""" # InMemoryCacher memory_cacher = InMemoryCacher() - hasher1 = PathLikeHasherFactory.create_file_hasher(string_cacher=memory_cacher) - assert isinstance(hasher1, CachedFileHasher) + hasher1 = LegacyPathLikeHasherFactory.create_file_hasher(string_cacher=memory_cacher) + assert isinstance(hasher1, LegacyCachedFileHasher) assert hasher1.string_cacher is memory_cacher # FileCacher with tempfile.NamedTemporaryFile(delete=False) as tmp_file: file_cacher = FileCacher(tmp_file.name) - hasher2 = PathLikeHasherFactory.create_file_hasher( + hasher2 = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=file_cacher ) - assert isinstance(hasher2, CachedFileHasher) + assert isinstance(hasher2, LegacyCachedFileHasher) assert hasher2.string_cacher is file_cacher # Clean up @@ -109,7 +112,7 @@ def test_create_file_hasher_different_cacher_types(self): def test_create_file_hasher_functional_without_cache(self): """Test that created file hasher actually works for hashing files.""" - hasher = PathLikeHasherFactory.create_file_hasher( + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( algorithm="sha256", buffer_size=1024 ) @@ -136,7 +139,7 @@ def test_create_file_hasher_functional_without_cache(self): def test_create_file_hasher_functional_with_cache(self): """Test that created cached file hasher works and caches results.""" cacher = InMemoryCacher() - hasher = PathLikeHasherFactory.create_file_hasher( + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=cacher, algorithm="sha256" ) @@ -164,44 +167,51 @@ def test_create_file_hasher_functional_with_cache(self): def test_create_file_hasher_none_cacher_explicit(self): """Test explicitly passing None for string_cacher.""" - hasher = PathLikeHasherFactory.create_file_hasher( + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=None, algorithm="sha1", buffer_size=4096 ) - assert isinstance(hasher, BasicFileHasher) - assert not isinstance(hasher, CachedFileHasher) + assert isinstance(hasher, LegacyDefaultFileHasher) + assert not isinstance(hasher, LegacyCachedFileHasher) assert hasher.algorithm == "sha1" assert hasher.buffer_size == 4096 def test_create_file_hasher_parameter_edge_cases(self): """Test edge cases 
for parameters.""" # Very small buffer size - hasher1 = PathLikeHasherFactory.create_file_hasher(buffer_size=1) + hasher1 = LegacyPathLikeHasherFactory.create_legacy_file_hasher(buffer_size=1) + assert isinstance(hasher1, LegacyDefaultFileHasher) assert hasher1.buffer_size == 1 # Large buffer size - hasher2 = PathLikeHasherFactory.create_file_hasher(buffer_size=1024 * 1024) + hasher2 = LegacyPathLikeHasherFactory.create_legacy_file_hasher(buffer_size=1024 * 1024) + assert isinstance(hasher2, LegacyDefaultFileHasher) assert hasher2.buffer_size == 1024 * 1024 # Different algorithms for algorithm in ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: - hasher = PathLikeHasherFactory.create_file_hasher(algorithm=algorithm) + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher(algorithm=algorithm) + assert isinstance(hasher, LegacyDefaultFileHasher) assert hasher.algorithm == algorithm def test_create_file_hasher_cache_independence(self): """Test that different cached hashers with same cacher are independent.""" cacher = InMemoryCacher() - hasher1 = PathLikeHasherFactory.create_file_hasher( + hasher1 = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=cacher, algorithm="sha256" ) - hasher2 = PathLikeHasherFactory.create_file_hasher( + hasher2 = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=cacher, algorithm="md5" ) # Both should use the same cacher but be different instances + assert isinstance(hasher1, LegacyCachedFileHasher) + assert isinstance(hasher2, LegacyCachedFileHasher) assert hasher1.string_cacher is cacher assert hasher2.string_cacher is cacher assert hasher1 is not hasher2 assert hasher1.file_hasher is not hasher2.file_hasher + assert isinstance(hasher1.file_hasher, LegacyDefaultFileHasher) + assert isinstance(hasher2.file_hasher, LegacyDefaultFileHasher) assert hasher1.file_hasher.algorithm != hasher2.file_hasher.algorithm diff --git a/tests/test_hashing/test_hasher_parity.py b/tests/test_hashing/test_hasher_parity.py index 64a6004..a278a92 100644 --- a/tests/test_hashing/test_hasher_parity.py +++ b/tests/test_hashing/test_hasher_parity.py @@ -13,8 +13,8 @@ import pytest -from orcapod.hashing.core import hash_file, hash_packet, hash_pathset -from orcapod.hashing.file_hashers import PathLikeHasherFactory +from orcapod.hashing.legacy_core import hash_file, hash_packet, hash_pathset +from orcapod.hashing.file_hashers import LegacyPathLikeHasherFactory def load_hash_lut(): @@ -73,7 +73,7 @@ def verify_path_exists(rel_path): def test_hasher_core_parity_file_hash(): """Test that BasicFileHasher.hash_file produces the same results as hash_file.""" hash_lut = load_hash_lut() - hasher = PathLikeHasherFactory.create_basic_composite() + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite() # Test all sample files for filename, info in hash_lut.items(): @@ -102,7 +102,7 @@ def test_hasher_core_parity_file_hash(): for buffer_size in buffer_sizes: try: # Create a hasher with specific parameters - hasher = PathLikeHasherFactory.create_basic_composite( + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( algorithm=algorithm, buffer_size=buffer_size ) @@ -147,7 +147,7 @@ def test_hasher_core_parity_pathset_hash(): for buffer_size in buffer_sizes: for char_count in char_counts: # Create a hasher with specific parameters - hasher = PathLikeHasherFactory.create_basic_composite( + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( algorithm=algorithm, buffer_size=buffer_size, 
char_count=char_count, @@ -201,7 +201,7 @@ def test_hasher_core_parity_packet_hash(): for buffer_size in buffer_sizes: for char_count in char_counts: # Create a hasher with specific parameters - hasher = PathLikeHasherFactory.create_basic_composite( + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( algorithm=algorithm, buffer_size=buffer_size, char_count=char_count, diff --git a/tests/test_hashing/test_legacy_composite_hasher.py b/tests/test_hashing/test_legacy_composite_hasher.py new file mode 100644 index 0000000..c9a3ee2 --- /dev/null +++ b/tests/test_hashing/test_legacy_composite_hasher.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +"""Tests for the CompositeFileHasher implementation.""" + +from unittest.mock import patch + +import pytest + +from orcapod.hashing.legacy_core import hash_to_hex +from orcapod.hashing.file_hashers import BasicFileHasher, LegacyDefaultCompositeFileHasher +from orcapod.hashing.types import LegacyFileHasher, LegacyPacketHasher, LegacyPathSetHasher + + +# Custom implementation of hash_file for tests that doesn't check for file existence +def mock_hash_file(file_path, algorithm="sha256", buffer_size=65536) -> str: + """Mock implementation of hash_file that doesn't check for file existence.""" + # Simply return a deterministic hash based on the file path + return hash_to_hex(f"mock_file_hash_{file_path}_{algorithm}") + + +# Custom implementation of hash_pathset for tests that doesn't check for file existence +def mock_hash_pathset( + pathset, algorithm="sha256", buffer_size=65536, char_count=32, file_hasher=None +): + """Mock implementation of hash_pathset that doesn't check for file existence.""" + from collections.abc import Collection + from os import PathLike + from pathlib import Path + + # If file_hasher is None, we'll need to handle it differently + if file_hasher is None: + # Just return a mock hash for testing + if isinstance(pathset, (str, Path, PathLike)): + return f"mock_{pathset}" + return "mock_hash" + + # Handle dictionary case for nested paths + if isinstance(pathset, dict): + hash_dict = {} + for key, value in pathset.items(): + hash_dict[key] = mock_hash_pathset( + value, algorithm, buffer_size, char_count, file_hasher + ) + return hash_to_hex(str(hash_dict)) + + # Handle collection case (list, set, etc.) 
+ if isinstance(pathset, Collection) and not isinstance( + pathset, (str, Path, PathLike) + ): + hash_list = [] + for item in pathset: + hash_list.append( + mock_hash_pathset(item, algorithm, buffer_size, char_count, file_hasher) + ) + return hash_to_hex(str(hash_list)) + + # Handle simple string or Path case + if isinstance(pathset, (str, Path, PathLike)): + if hasattr(file_hasher, "__self__"): # For bound methods + return file_hasher(str(pathset)) + else: + return file_hasher(str(pathset)) + + return "mock_hash" + + +# Custom implementation of hash_packet for tests that doesn't check for file existence +def mock_hash_packet( + packet, + algorithm="sha256", + buffer_size=65536, + char_count=32, + prefix_algorithm=True, + pathset_hasher=None, +): + """Mock implementation of hash_packet that doesn't check for file existence.""" + # Create a simple hash based on the packet structure + hash_value = hash_to_hex(str(packet)) + + # Format it like the real function would + if prefix_algorithm and algorithm: + return ( + f"{algorithm}-{hash_value[: char_count if char_count else len(hash_value)]}" + ) + else: + return hash_value[: char_count if char_count else len(hash_value)] + + +@pytest.fixture(autouse=True) +def patch_hash_functions(): + """Patch the hash functions in the core module for all tests.""" + with ( + patch("orcapod.hashing.core.hash_file", side_effect=mock_hash_file), + patch("orcapod.hashing.core.hash_pathset", side_effect=mock_hash_pathset), + patch("orcapod.hashing.core.hash_packet", side_effect=mock_hash_packet), + ): + yield + + +def test_default_composite_hasher_implements_all_protocols(): + """Test that CompositeFileHasher implements all three protocols.""" + # Create a basic file hasher to be used within the composite hasher + file_hasher = BasicFileHasher() + + # Create the composite hasher + composite_hasher = LegacyDefaultCompositeFileHasher(file_hasher) + + # Verify it implements all three protocols + assert isinstance(composite_hasher, LegacyFileHasher) + assert isinstance(composite_hasher, LegacyPathSetHasher) + assert isinstance(composite_hasher, LegacyPacketHasher) + + +def test_default_composite_hasher_file_hashing(): + """Test CompositeFileHasher's file hashing functionality.""" + # We can use a mock path since our mocks don't require real files + file_path = "/path/to/mock_file.txt" + + # Create a custom mock file hasher + class MockFileHasher: + def hash_file(self, file_path): + return mock_hash_file(file_path) + + file_hasher = MockFileHasher() + composite_hasher = LegacyDefaultCompositeFileHasher(file_hasher) + + # Get hash from the composite hasher and directly from the file hasher + direct_hash = file_hasher.hash_file(file_path) + composite_hash = composite_hasher.hash_file(file_path) + + # The hashes should be identical + assert direct_hash == composite_hash + + +def test_default_composite_hasher_pathset_hashing(): + """Test CompositeFileHasher's path set hashing functionality.""" + + # Create a custom mock file hasher that doesn't check for file existence + class MockFileHasher: + def hash_file(self, file_path) -> str: + return mock_hash_file(file_path) + + file_hasher = MockFileHasher() + composite_hasher = LegacyDefaultCompositeFileHasher(file_hasher) + + # Simple path set with non-existent paths + pathset = ["/path/to/file1.txt", "/path/to/file2.txt"] + + # Hash the pathset + result = composite_hasher.hash_pathset(pathset) + + # The result should be a string hash + assert isinstance(result, str) + + +if __name__ == "__main__": + pytest.main(["-v", 
__file__]) diff --git a/tests/test_hashing/test_packet_hasher.py b/tests/test_hashing/test_packet_hasher.py index 69b89d0..80a16ed 100644 --- a/tests/test_hashing/test_packet_hasher.py +++ b/tests/test_hashing/test_packet_hasher.py @@ -3,11 +3,11 @@ import pytest -from orcapod.hashing.file_hashers import DefaultPacketHasher -from orcapod.hashing.types import PathSetHasher +from orcapod.hashing.file_hashers import LegacyDefaultPacketHasher +from orcapod.hashing.types import LegacyPathSetHasher -class MockPathSetHasher(PathSetHasher): +class MockPathSetHasher(LegacyPathSetHasher): """Simple mock PathSetHasher for testing.""" def __init__(self, hash_value="mock_hash"): @@ -19,10 +19,10 @@ def hash_pathset(self, pathset): return f"{self.hash_value}_{pathset}" -def test_default_packet_hasher_empty_packet(): - """Test DefaultPacketHasher with an empty packet.""" +def test_legacy_packet_hasher_empty_packet(): + """Test LegacyPacketHasher with an empty packet.""" pathset_hasher = MockPathSetHasher() - packet_hasher = DefaultPacketHasher(pathset_hasher) + packet_hasher = LegacyDefaultPacketHasher(pathset_hasher) # Test with empty packet packet = {} @@ -36,10 +36,10 @@ def test_default_packet_hasher_empty_packet(): assert isinstance(result, str) -def test_default_packet_hasher_single_entry(): - """Test DefaultPacketHasher with a packet containing a single entry.""" +def test_legacy_packet_hasher_single_entry(): + """Test LegacyPacketHasher with a packet containing a single entry.""" pathset_hasher = MockPathSetHasher() - packet_hasher = DefaultPacketHasher(pathset_hasher) + packet_hasher = LegacyDefaultPacketHasher(pathset_hasher) # Test with a single entry packet = {"input": "/path/to/file.txt"} @@ -54,10 +54,10 @@ def test_default_packet_hasher_single_entry(): assert isinstance(result, str) -def test_default_packet_hasher_multiple_entries(): - """Test DefaultPacketHasher with a packet containing multiple entries.""" +def test_legacy_packet_hasher_multiple_entries(): + """Test LegacyPacketHasher with a packet containing multiple entries.""" pathset_hasher = MockPathSetHasher() - packet_hasher = DefaultPacketHasher(pathset_hasher) + packet_hasher = LegacyDefaultPacketHasher(pathset_hasher) # Test with multiple entries packet = { @@ -78,10 +78,10 @@ def test_default_packet_hasher_multiple_entries(): assert isinstance(result, str) -def test_default_packet_hasher_nested_structure(): - """Test DefaultPacketHasher with a deeply nested packet structure.""" +def test_legacy_packet_hasher_nested_structure(): + """Test LegacyPacketHasher with a deeply nested packet structure.""" pathset_hasher = MockPathSetHasher() - packet_hasher = DefaultPacketHasher(pathset_hasher) + packet_hasher = LegacyDefaultPacketHasher(pathset_hasher) # Test with nested packet structure packet = { @@ -103,16 +103,16 @@ def test_default_packet_hasher_nested_structure(): assert isinstance(result, str) -def test_default_packet_hasher_with_char_count(): - """Test DefaultPacketHasher with different char_count values.""" +def test_legacy_packet_hasher_with_char_count(): + """Test LegacyPacketHasher with different char_count values.""" pathset_hasher = MockPathSetHasher() # Test with default char_count (32) - default_hasher = DefaultPacketHasher(pathset_hasher) + default_hasher = LegacyDefaultPacketHasher(pathset_hasher) default_result = default_hasher.hash_packet({"input": "/path/to/file.txt"}) # Test with custom char_count - custom_hasher = DefaultPacketHasher(pathset_hasher, char_count=16) + custom_hasher = 
LegacyDefaultPacketHasher(pathset_hasher, char_count=16) custom_result = custom_hasher.hash_packet({"input": "/path/to/file.txt"}) # Results should be different based on char_count diff --git a/tests/test_hashing/test_path_set_hasher.py b/tests/test_hashing/test_path_set_hasher.py index 65e626a..9286f82 100644 --- a/tests/test_hashing/test_path_set_hasher.py +++ b/tests/test_hashing/test_path_set_hasher.py @@ -9,11 +9,11 @@ import pytest import orcapod.hashing.legacy_core -from orcapod.hashing.file_hashers import DefaultPathsetHasher -from orcapod.hashing.types import FileHasher +from orcapod.hashing.file_hashers import LegacyDefaultPathsetHasher +from orcapod.hashing.types import LegacyFileHasher -class MockFileHasher(FileHasher): +class MockFileHasher(LegacyFileHasher): """Simple mock FileHasher for testing.""" def __init__(self, hash_value="mock_hash"): @@ -90,10 +90,10 @@ def patch_hash_pathset(): yield -def test_default_pathset_hasher_single_file(): - """Test DefaultPathsetHasher with a single file path.""" +def test_legacy_pathset_hasher_single_file(): + """Test LegacyPathsetHasher with a single file path.""" file_hasher = MockFileHasher() - pathset_hasher = DefaultPathsetHasher(file_hasher) + pathset_hasher = LegacyDefaultPathsetHasher(file_hasher) # Create a real file for testing file_path = create_temp_file() @@ -116,7 +116,7 @@ def test_default_pathset_hasher_single_file(): def test_default_pathset_hasher_multiple_files(): """Test DefaultPathsetHasher with multiple files in a list.""" file_hasher = MockFileHasher() - pathset_hasher = DefaultPathsetHasher(file_hasher) + pathset_hasher = LegacyDefaultPathsetHasher(file_hasher) # Create real files for testing file_paths = [create_temp_file(f"content {i}") for i in range(3)] @@ -195,7 +195,7 @@ def test_default_pathset_hasher_nested_paths(): def test_default_pathset_hasher_with_nonexistent_files(): """Test DefaultPathsetHasher with both existent and non-existent files.""" file_hasher = MockFileHasher() - pathset_hasher = DefaultPathsetHasher(file_hasher) + pathset_hasher = LegacyDefaultPathsetHasher(file_hasher) # Reset the file_hasher's call list file_hasher.file_hash_calls = [] @@ -249,14 +249,14 @@ def test_default_pathset_hasher_with_char_count(): try: # Test with default char_count (32) - default_hasher = DefaultPathsetHasher(file_hasher) + default_hasher = LegacyDefaultPathsetHasher(file_hasher) default_result = default_hasher.hash_pathset(file_path) # Reset call list file_hasher.file_hash_calls = [] # Test with custom char_count - custom_hasher = DefaultPathsetHasher(file_hasher, char_count=16) + custom_hasher = LegacyDefaultPathsetHasher(file_hasher, char_count=16) custom_result = custom_hasher.hash_pathset(file_path) # Both should have called the file_hasher once diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index 32d8618..d6cc106 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -8,15 +8,15 @@ import pytest from orcapod.hashing.types import ( - CompositeFileHasher, - FileHasher, - PacketHasher, - PathSetHasher, + LegacyCompositeFileHasher, + LegacyFileHasher, + LegacyPacketHasher, + LegacyPathSetHasher, ) -from orcapod.store.dict_data_stores import DirDataStore +from orcapod.stores.dict_data_stores import DirDataStore -class MockFileHasher(FileHasher): +class MockFileHasher(LegacyFileHasher): """Mock FileHasher for testing.""" def __init__(self, hash_value="mock_hash"): @@ -28,19 +28,19 @@ def hash_file(self, file_path): 
return f"{self.hash_value}_file" -class MockPathSetHasher(PathSetHasher): +class MockPathSetHasher(LegacyPathSetHasher): """Mock PathSetHasher for testing.""" def __init__(self, hash_value="mock_hash"): self.hash_value = hash_value self.pathset_hash_calls = [] - def hash_pathset(self, pathset): + def hash_pathset(self, pathset) -> str: self.pathset_hash_calls.append(pathset) return f"{self.hash_value}_pathset" -class MockPacketHasher(PacketHasher): +class MockPacketHasher(LegacyPacketHasher): """Mock PacketHasher for testing.""" def __init__(self, hash_value="mock_hash"): @@ -52,7 +52,7 @@ def hash_packet(self, packet): return f"{self.hash_value}_packet" -class MockCompositeHasher(CompositeFileHasher): +class MockCompositeHasher(LegacyCompositeFileHasher): """Mock CompositeHasher that implements all three hash protocols.""" def __init__(self, hash_value="mock_hash"): @@ -61,15 +61,15 @@ def __init__(self, hash_value="mock_hash"): self.pathset_hash_calls = [] self.packet_hash_calls = [] - def hash_file(self, file_path): + def hash_file_content(self, file_path): self.file_hash_calls.append(file_path) return f"{self.hash_value}_file" - def hash_pathset(self, pathset): + def hash_pathset(self, pathset) -> str: self.pathset_hash_calls.append(pathset) return f"{self.hash_value}_pathset" - def hash_packet(self, packet): + def hash_packet(self, packet) -> str: self.packet_hash_calls.append(packet) return f"{self.hash_value}_packet" @@ -86,7 +86,7 @@ def test_dir_data_store_init_default_hasher(temp_dir): assert store_dir.is_dir() # Verify the default PacketHasher is used - assert isinstance(store.packet_hasher, PacketHasher) + assert isinstance(store.packet_hasher, LegacyPacketHasher) # Check default parameters assert store.copy_files is True diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index 48e0703..00d3b99 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -9,10 +9,10 @@ from orcapod.hashing.file_hashers import ( BasicFileHasher, CachedFileHasher, - DefaultCompositeFileHasher, + LegacyCompositeFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.store.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore def test_integration_with_cached_file_hasher(temp_dir, sample_files): @@ -28,7 +28,7 @@ def test_integration_with_cached_file_hasher(temp_dir, sample_files): ) # Create a CompositeFileHasher that will use the CachedFileHasher - composite_hasher = DefaultCompositeFileHasher(file_hasher) + composite_hasher = LegacyCompositeFileHasher(file_hasher) # Create the store with CompositeFileHasher store = DirDataStore(store_dir=store_dir, packet_hasher=composite_hasher) diff --git a/tests/test_store/test_noop_data_store.py b/tests/test_store/test_noop_data_store.py index ab0eecd..4ff838f 100644 --- a/tests/test_store/test_noop_data_store.py +++ b/tests/test_store/test_noop_data_store.py @@ -3,7 +3,7 @@ import pytest -from orcapod.store.dict_data_stores import NoOpDataStore +from orcapod.stores.dict_data_stores import NoOpDataStore def test_noop_data_store_memoize(): @@ -43,7 +43,7 @@ def test_noop_data_store_retrieve_memoized(): def test_noop_data_store_is_data_store_subclass(): """Test that NoOpDataStore is a subclass of DataStore.""" - from orcapod.store import DataStore + from orcapod.stores import DataStore store = NoOpDataStore() assert isinstance(store, DataStore) diff --git 
a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index 191da89..21ed4c9 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -5,12 +5,12 @@ import pytest -from orcapod.hashing.types import PacketHasher -from orcapod.store.dict_data_stores import DirDataStore, NoOpDataStore -from orcapod.store.transfer_data_store import TransferDataStore +from orcapod.hashing.types import LegacyPacketHasher +from orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.stores.transfer_data_store import TransferDataStore -class MockPacketHasher(PacketHasher): +class MockPacketHasher(LegacyPacketHasher): """Mock PacketHasher for testing.""" def __init__(self, hash_value="mock_hash"): From 41f1b63061247d4a88e4063e4a72c0b906207fd3 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 1 Jul 2025 03:58:57 +0000 Subject: [PATCH 030/224] test: fix legacy tests --- tests/test_hashing/test_legacy_composite_hasher.py | 10 +++++----- tests/test_store/test_dir_data_store.py | 2 +- tests/test_store/test_integration.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_hashing/test_legacy_composite_hasher.py b/tests/test_hashing/test_legacy_composite_hasher.py index c9a3ee2..f3a8de4 100644 --- a/tests/test_hashing/test_legacy_composite_hasher.py +++ b/tests/test_hashing/test_legacy_composite_hasher.py @@ -6,7 +6,7 @@ import pytest from orcapod.hashing.legacy_core import hash_to_hex -from orcapod.hashing.file_hashers import BasicFileHasher, LegacyDefaultCompositeFileHasher +from orcapod.hashing.file_hashers import LegacyDefaultFileHasher, LegacyDefaultCompositeFileHasher from orcapod.hashing.types import LegacyFileHasher, LegacyPacketHasher, LegacyPathSetHasher @@ -89,9 +89,9 @@ def mock_hash_packet( def patch_hash_functions(): """Patch the hash functions in the core module for all tests.""" with ( - patch("orcapod.hashing.core.hash_file", side_effect=mock_hash_file), - patch("orcapod.hashing.core.hash_pathset", side_effect=mock_hash_pathset), - patch("orcapod.hashing.core.hash_packet", side_effect=mock_hash_packet), + patch("orcapod.hashing.legacy_core.hash_file", side_effect=mock_hash_file), + patch("orcapod.hashing.legacy_core.hash_pathset", side_effect=mock_hash_pathset), + patch("orcapod.hashing.legacy_core.hash_packet", side_effect=mock_hash_packet), ): yield @@ -99,7 +99,7 @@ def patch_hash_functions(): def test_default_composite_hasher_implements_all_protocols(): """Test that CompositeFileHasher implements all three protocols.""" # Create a basic file hasher to be used within the composite hasher - file_hasher = BasicFileHasher() + file_hasher = LegacyDefaultFileHasher() # Create the composite hasher composite_hasher = LegacyDefaultCompositeFileHasher(file_hasher) diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index d6cc106..eae39eb 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -461,7 +461,7 @@ def test_dir_data_store_with_default_packet_hasher(temp_dir, sample_files): store = DirDataStore(store_dir=store_dir) # Verify that default PacketHasher was created - assert isinstance(store.packet_hasher, PacketHasher) + assert isinstance(store.packet_hasher, LegacyPacketHasher) # Test memoization and retrieval packet = {"input_file": sample_files["input"]["file1"]} diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index 
00d3b99..2a6e253 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -9,7 +9,7 @@ from orcapod.hashing.file_hashers import ( BasicFileHasher, CachedFileHasher, - LegacyCompositeFileHasher, + LegacyDefaultCompositeFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher from orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore @@ -28,7 +28,7 @@ def test_integration_with_cached_file_hasher(temp_dir, sample_files): ) # Create a CompositeFileHasher that will use the CachedFileHasher - composite_hasher = LegacyCompositeFileHasher(file_hasher) + composite_hasher = LegacyDefaultCompositeFileHasher(file_hasher) # Create the store with CompositeFileHasher store = DirDataStore(store_dir=store_dir, packet_hasher=composite_hasher) From fe423f7abb429d6e5fd78609a2c9fa77457542a2 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 1 Jul 2025 05:39:30 +0000 Subject: [PATCH 031/224] fix: make all tests functional --- tests/test_hashing/test_cached_file_hasher.py | 24 +++++++++---------- tests/test_hashing/test_hasher_factory.py | 4 ++-- tests/test_hashing/test_path_set_hasher.py | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/test_hashing/test_cached_file_hasher.py b/tests/test_hashing/test_cached_file_hasher.py index d7514a5..8b9ce30 100644 --- a/tests/test_hashing/test_cached_file_hasher.py +++ b/tests/test_hashing/test_cached_file_hasher.py @@ -10,8 +10,8 @@ import pytest from orcapod.hashing.file_hashers import ( - BasicFileHasher, - CachedFileHasher, + LegacyDefaultFileHasher, + LegacyCachedFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher from orcapod.hashing.types import LegacyFileHasher, StringCacher @@ -73,10 +73,10 @@ def load_packet_hash_lut(): def test_cached_file_hasher_construction(): """Test that CachedFileHasher can be constructed with various parameters.""" # Test with default parameters - file_hasher = BasicFileHasher() + file_hasher = LegacyDefaultFileHasher() string_cacher = InMemoryCacher() - cached_hasher1 = CachedFileHasher(file_hasher, string_cacher) + cached_hasher1 = LegacyCachedFileHasher(file_hasher, string_cacher) assert cached_hasher1.file_hasher == file_hasher assert cached_hasher1.string_cacher == string_cacher @@ -99,8 +99,8 @@ def test_cached_file_hasher_file_caching(): mock_string_cacher = MagicMock(spec=StringCacher) mock_string_cacher.get_cached.return_value = None # Initially no cached value - file_hasher = BasicFileHasher() - cached_hasher = CachedFileHasher(file_hasher, mock_string_cacher) + file_hasher = LegacyDefaultFileHasher() + cached_hasher = LegacyCachedFileHasher(file_hasher, mock_string_cacher) # First call should compute the hash and cache it result1 = cached_hasher.hash_file(file_path) @@ -143,7 +143,7 @@ def test_cached_file_hasher_call_counts(): string_cacher = InMemoryCacher() # Create the cached file hasher with all caching enabled - cached_hasher = CachedFileHasher( + cached_hasher = LegacyCachedFileHasher( mock_file_hasher, string_cacher, ) @@ -181,11 +181,11 @@ def test_cached_file_hasher_performance(): file_path = verify_path_exists(info["file"]) # Setup non-cached hasher - file_hasher = BasicFileHasher() + file_hasher = LegacyDefaultFileHasher() # Setup cached hasher string_cacher = InMemoryCacher() - cached_hasher = CachedFileHasher(file_hasher, string_cacher) + cached_hasher = LegacyCachedFileHasher(file_hasher, string_cacher) # Measure time for multiple hash operations with non-cached hasher start_time = 
time.time() @@ -221,11 +221,11 @@ def test_cached_file_hasher_with_different_cachers(): try: file_path = temp_file.name - file_hasher = BasicFileHasher() + file_hasher = LegacyDefaultFileHasher() # Test with InMemoryCacher mem_cacher = InMemoryCacher(max_size=10) - cached_hasher1 = CachedFileHasher(file_hasher, mem_cacher) + cached_hasher1 = LegacyCachedFileHasher(file_hasher, mem_cacher) # First hash call hash1 = cached_hasher1.hash_file(file_path) @@ -249,7 +249,7 @@ def clear_cache(self) -> None: self.storage.clear() custom_cacher = CustomCacher() - cached_hasher2 = CachedFileHasher(file_hasher, custom_cacher) + cached_hasher2 = LegacyCachedFileHasher(file_hasher, custom_cacher) # Get hash with custom cacher hash2 = cached_hasher2.hash_file(file_path) diff --git a/tests/test_hashing/test_hasher_factory.py b/tests/test_hashing/test_hasher_factory.py index 5776a2d..69804a3 100644 --- a/tests/test_hashing/test_hasher_factory.py +++ b/tests/test_hashing/test_hasher_factory.py @@ -80,7 +80,7 @@ def test_create_file_hasher_custom_buffer_size(self): def test_create_file_hasher_all_custom_parameters(self): """Test creating file hasher with all custom parameters.""" cacher = InMemoryCacher(max_size=500) - hasher = LegacyPathLikeHasherFactory.create_file_hasher( + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( string_cacher=cacher, algorithm="blake2b", buffer_size=16384 ) @@ -94,7 +94,7 @@ def test_create_file_hasher_different_cacher_types(self): """Test creating file hasher with different types of string cachers.""" # InMemoryCacher memory_cacher = InMemoryCacher() - hasher1 = LegacyPathLikeHasherFactory.create_file_hasher(string_cacher=memory_cacher) + hasher1 = LegacyPathLikeHasherFactory.create_legacy_file_hasher(string_cacher=memory_cacher) assert isinstance(hasher1, LegacyCachedFileHasher) assert hasher1.string_cacher is memory_cacher diff --git a/tests/test_hashing/test_path_set_hasher.py b/tests/test_hashing/test_path_set_hasher.py index 9286f82..0a48acb 100644 --- a/tests/test_hashing/test_path_set_hasher.py +++ b/tests/test_hashing/test_path_set_hasher.py @@ -86,7 +86,7 @@ def mock_hash_pathset( @pytest.fixture(autouse=True) def patch_hash_pathset(): """Patch the hash_pathset function in the hashing module for all tests.""" - with patch("orcapod.hashing.core.hash_pathset", side_effect=mock_hash_pathset): + with patch("orcapod.hashing.legacy_core.hash_pathset", side_effect=mock_hash_pathset): yield @@ -225,7 +225,7 @@ def custom_hash_nonexistent(pathset, **kwargs): # Patch hash_pathset just for this test with patch( - "orcapod.hashing.core.hash_pathset", side_effect=custom_hash_nonexistent + "orcapod.hashing.legacy_core.hash_pathset", side_effect=custom_hash_nonexistent ): result = pathset_hasher.hash_pathset(pathset) From ba1f45d6096a504096b97c5d5673cb2e851133d9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 1 Jul 2025 06:53:23 +0000 Subject: [PATCH 032/224] refactor: cleanup imports and use versioned object hasher --- src/orcapod/core/base.py | 4 -- src/orcapod/core/operators.py | 1 - src/orcapod/core/pod.py | 2 +- src/orcapod/core/sources.py | 2 +- src/orcapod/hashing/__init__.py | 13 ----- src/orcapod/hashing/arrow_hashers.py | 5 +- src/orcapod/hashing/defaults.py | 37 ++++++------- src/orcapod/hashing/object_hashers.py | 7 ++- src/orcapod/hashing/types.py | 16 ++++-- src/orcapod/hashing/versioned_hashers.py | 53 ++++++++++++++++--- .../pipeline/{wrappers.py => nodes.py} | 22 ++++---- src/orcapod/pipeline/pipeline.py | 3 +- src/orcapod/stores/dict_data_stores.py | 2 +- src/orcapod/utils/object_spec.py | 19 +++++++ 14 files changed, 120 insertions(+), 66 deletions(-) rename src/orcapod/pipeline/{wrappers.py => nodes.py} (97%) create mode 100644 src/orcapod/utils/object_spec.py diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 7c9a299..f0d5362 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -4,10 +4,6 @@ from collections.abc import Callable, Collection, Iterator from typing import Any - -from orcapod.hashing import HashableMixin, ObjectHasher -from orcapod.hashing import get_default_object_hasher - from orcapod.hashing import ContentIdentifiableBase from orcapod.types import Packet, Tag, TypeSpec from orcapod.types.typespec_utils import get_typespec_from_dict diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index c26dc2d..598f2e3 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -5,7 +5,6 @@ from orcapod.types import Packet, Tag, TypeSpec from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs -from orcapod.hashing import function_content_hash, hash_function from orcapod.core.base import Kernel, SyncStream, Operator from orcapod.core.streams import SyncStreamFromGenerator from orcapod.utils.stream_utils import ( diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index 4271887..ae6778d 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -13,8 +13,8 @@ from orcapod.hashing import ( FunctionInfoExtractor, - get_function_signature, ) +from orcapod.hashing.legacy_core import get_function_signature from orcapod.core import Kernel from orcapod.core.operators import Join from orcapod.core.streams import ( diff --git a/src/orcapod/core/sources.py b/src/orcapod/core/sources.py index 21adae9..3d79e7a 100644 --- a/src/orcapod/core/sources.py +++ b/src/orcapod/core/sources.py @@ -4,7 +4,7 @@ from typing import Any, Literal from orcapod.core.base import Source -from orcapod.hashing import hash_function +from orcapod.hashing.legacy_core import hash_function from orcapod.core.streams import SyncStream, SyncStreamFromGenerator from orcapod.types import Packet, Tag diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index 7aaf11b..b1e5849 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -1,17 +1,4 @@ -from .legacy_core import ( - HashableMixin, - function_content_hash, - get_function_signature, - hash_file, - hash_function, - hash_packet, - hash_pathset, - hash_to_hex, - hash_to_int, - hash_to_uuid, -) from .defaults import ( - get_default_composite_file_hasher, get_default_object_hasher, get_default_arrow_hasher, ) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index c50ebfc..3545911 100644 --- 
a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -73,7 +73,6 @@ def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: This is a no-op for SemanticArrowHasher since it hashes column contents directly. """ - # SemanticArrowHasher does not use string caching, so this is a no-op if semantic_type in self.semantic_type_hashers: self.semantic_type_hashers[semantic_type].set_cacher(cacher) else: @@ -179,7 +178,7 @@ def _serialize_table_ipc(self, table: pa.Table) -> bytes: return buffer.getvalue() - def hash_table(self, table: pa.Table, add_prefix: bool = True) -> str: + def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: """ Compute stable hash of Arrow table. @@ -208,7 +207,7 @@ def hash_table(self, table: pa.Table, add_prefix: bool = True) -> str: hasher.update(serialized_bytes) hash_str = hasher.hexdigest() - if add_prefix: + if prefix_hasher_id: hash_str = f"{self.get_hasher_id()}:{hash_str}" return hash_str diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 8ba7c0b..f9dee37 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -9,11 +9,11 @@ from orcapod.hashing.file_hashers import BasicFileHasher, LegacyPathLikeHasherFactory from orcapod.hashing.string_cachers import InMemoryCacher from orcapod.hashing.object_hashers import ObjectHasher -from orcapod.hashing.object_hashers import DefaultObjectHasher, LegacyObjectHasher +from orcapod.hashing.object_hashers import LegacyObjectHasher from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory from orcapod.hashing.arrow_hashers import SemanticArrowHasher from orcapod.hashing.semantic_type_hashers import PathHasher -from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher +from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher, get_versioned_object_hasher def get_default_arrow_hasher( @@ -36,6 +36,21 @@ def get_default_arrow_hasher( return arrow_hasher +def get_default_object_hasher() -> ObjectHasher: + object_hasher = get_versioned_object_hasher() + return object_hasher + + + +def get_legacy_object_hasher() -> ObjectHasher: + function_info_extractor = ( + FunctionInfoExtractorFactory.create_function_info_extractor( + strategy="signature" + ) + ) + return LegacyObjectHasher(function_info_extractor=function_info_extractor) + + def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHasher: if with_cache: # use unlimited caching @@ -48,21 +63,3 @@ def get_default_composite_file_hasher_with_cacher(cacher=None) -> LegacyComposit if cacher is None: cacher = InMemoryCacher(max_size=None) return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) - - -def get_default_object_hasher() -> ObjectHasher: - function_info_extractor = ( - FunctionInfoExtractorFactory.create_function_info_extractor( - strategy="signature" - ) - ) - return DefaultObjectHasher(function_info_extractor=function_info_extractor) - - -def get_legacy_object_hasher() -> ObjectHasher: - function_info_extractor = ( - FunctionInfoExtractorFactory.create_function_info_extractor( - strategy="signature" - ) - ) - return LegacyObjectHasher(function_info_extractor=function_info_extractor) diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 7e35ccb..bdd0169 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -4,17 +4,22 @@ from .hash_utils import 
hash_object -class DefaultObjectHasher(ObjectHasher): +class BasicObjectHasher(ObjectHasher): """ Default object hasher used throughout the codebase. """ def __init__( self, + hasher_id: str, function_info_extractor: FunctionInfoExtractor | None = None, ): + self._hasher_id = hasher_id self.function_info_extractor = function_info_extractor + def get_hasher_id(self) -> str: + return self._hasher_id + def hash(self, obj: object) -> bytes: """ Hash an object to a byte representation. diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index c7d79da..24afdbc 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -43,7 +43,13 @@ def hash(self, obj: Any) -> bytes: """ ... - def hash_to_hex(self, obj: Any, char_count: int | None = None) -> str: + @abstractmethod + def get_hasher_id(self) -> str: + """ + Returns a unique identifier/name assigned to the hasher + """ + + def hash_to_hex(self, obj: Any, char_count: int | None = None, prefix_hasher_id:bool=False) -> str: hash_bytes = self.hash(obj) hex_str = hash_bytes.hex() @@ -53,7 +59,9 @@ def hash_to_hex(self, obj: Any, char_count: int | None = None) -> str: raise ValueError( f"Cannot truncate to {char_count} chars, hash only has {len(hex_str)}" ) - return hex_str[:char_count] + hex_str = hex_str[:char_count] + if prefix_hasher_id: + hex_str = self.get_hasher_id() + ":" + hex_str return hex_str def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: @@ -74,7 +82,6 @@ def hash_to_uuid( self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID ) -> uuid.UUID: """Convert hash to proper UUID5.""" - # Use the hex representation as input to UUID5 return uuid.uuid5(namespace, self.hash(obj)) @@ -88,8 +95,9 @@ def hash_file(self, file_path: PathLike) -> bytes: ... @runtime_checkable class ArrowHasher(Protocol): """Protocol for hashing arrow packets.""" + def get_hasher_id(self) -> str: ... - def hash_table(self, table: pa.Table, add_prefix: bool = True) -> str: ... + def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... @runtime_checkable diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 22c715e..18c8680 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -1,15 +1,16 @@ # A collection of versioned hashers that provide a "default" implementation of hashers. from .arrow_hashers import SemanticArrowHasher +from .types import ObjectHasher, ArrowHasher import importlib from typing import Any CURRENT_VERSION = "v0.1" -versioned_hashers = { +versioned_semantic_arrow_hashers = { "v0.1": { "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", "config": { - "hasher_id": "default_v0.1", + "hasher_id": "arrow_v0.1", "hash_algorithm": "sha256", "chunk_size": 8192, "semantic_type_hashers": { @@ -29,6 +30,23 @@ } } +versioned_object_hashers = { + "v0.1": { + "_class": "orcapod.hashing.object_hashers.BasicObjectHasher", + "config": { + "hasher_id": "object_v0.1", + "function_info_extractor" : { + "_class": "orcapod.hashing.function_info_extractors.FunctionSignatureExtractor", + "config": { + "include_module": True, + "include_defaults": True + } + + } + } + } +} + def parse_objectspec(obj_spec: dict) -> Any: if "_class" in obj_spec: @@ -51,7 +69,7 @@ def parse_objectspec(obj_spec: dict) -> Any: def get_versioned_semantic_arrow_hasher( version: str | None = None, -) -> SemanticArrowHasher: +) -> ArrowHasher: """ Get the versioned hasher for the specified version. 
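# Usage sketch for the versioned hasher registry above. It assumes the orcapod
# layout introduced by this patch; the hashed object and the "v9.9" version are
# made up for illustration.
from orcapod.hashing.versioned_hashers import get_versioned_object_hasher

hasher = get_versioned_object_hasher()        # resolves CURRENT_VERSION ("v0.1")
print(hasher.get_hasher_id())                 # "object_v0.1"
print(hasher.hash_to_hex({"kernel": "join"}, prefix_hasher_id=True))

try:
    get_versioned_object_hasher("v9.9")
except ValueError as err:                     # unknown versions are rejected
    print(err)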
@@ -59,13 +77,36 @@ def get_versioned_semantic_arrow_hasher( version (str): The version of the hasher to retrieve. Returns: - SemanticArrowHasher: An instance of the hasher for the specified version. + ArrowHasher: An instance of the arrow hasher of the specified version. + """ + if version is None: + version = CURRENT_VERSION + + if version not in versioned_semantic_arrow_hashers: + raise ValueError(f"Unsupported hasher version: {version}") + + hasher_spec = versioned_semantic_arrow_hashers[version] + return parse_objectspec(hasher_spec) + + +def get_versioned_object_hasher( + version: str | None = None, +) -> ObjectHasher: + """ + Get an object hasher for the specified version. + + Args: + version (str): The version of the hasher to retrieve. + + Returns: + Object: An instance of the object hasher of the specified version. """ if version is None: version = CURRENT_VERSION - if version not in versioned_hashers: + if version not in versioned_object_hashers: raise ValueError(f"Unsupported hasher version: {version}") - hasher_spec = versioned_hashers[version] + hasher_spec = versioned_object_hashers[version] return parse_objectspec(hasher_spec) + diff --git a/src/orcapod/pipeline/wrappers.py b/src/orcapod/pipeline/nodes.py similarity index 97% rename from src/orcapod/pipeline/wrappers.py rename to src/orcapod/pipeline/nodes.py index 720609e..2ac3763 100644 --- a/src/orcapod/pipeline/wrappers.py +++ b/src/orcapod/pipeline/nodes.py @@ -149,7 +149,6 @@ def post_call(self, tag: Tag, packet: Packet) -> None: ... def output_iterator_completion_hook(self) -> None: ... - class CachedKernelWrapper(KernelInvocationWrapper, Source): """ A Kernel wrapper that wraps a kernel and stores the outputs of the kernel. @@ -216,7 +215,7 @@ def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): self.update_cached_values() def update_cached_values(self): - self.source_info = self.store_path_prefix + (self.label, self.kernel_hasher.hash_to_hex(self.kernel)) + self.source_info = self.store_path_prefix + (self.label, self.kernel_hasher.hash_to_hex(self.kernel, prefix_hasher_id=True)) self.tag_keys, self.packet_keys = self.keys(trigger_run=False) self.tag_typespec, self.packet_typespec = self.types(trigger_run=False) if self.tag_typespec is None or self.packet_typespec is None: @@ -271,7 +270,7 @@ def post_call(self, tag: Tag, packet: Packet) -> None: merged_info = {**tag, **packet.get_composite()} output_table = self.output_converter.from_python_packet_to_arrow_table(merged_info) # TODO: revisit this logic - output_id = self.arrow_hasher.hash_table(output_table) + output_id = self.arrow_hasher.hash_table(output_table, prefix_hasher_id=True) if not self.output_store.get_record(self.source_info, output_id): self.output_store.add_record( self.source_info, @@ -425,13 +424,18 @@ def registry(self, registry: SemanticTypeRegistry | None = None): self.update_cached_values() def update_cached_values(self) -> None: - self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod) + self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod, prefix_hasher_id=True) + self.input_typespec, self.output_typespec = self.function_pod.get_function_typespecs() self.tag_keys, self.output_keys = self.keys(trigger_run=False) + + + if self.tag_keys is None or self.output_keys is None: raise ValueError( "Currently, cached function pod wrapper can only work with function pods that have keys defined." 
) - self.all_keys = tuple(self.tag_keys) + tuple(self.output_keys) + self.tag_keys = tuple(self.tag_keys) + self.output_keys = tuple(self.output_keys) self.tag_typespec, self.output_typespec = self.types(trigger_run=False) if self.tag_typespec is None or self.output_typespec is None: raise ValueError( @@ -475,7 +479,7 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: return super().forward(*streams, **kwargs) def get_packet_key(self, packet: Packet) -> str: - return self.arrow_hasher.hash_table(self.input_converter.from_python_packet_to_arrow_table(packet)) + return self.arrow_hasher.hash_table(self.input_converter.from_python_packet_to_arrow_table(packet), prefix_hasher_id=True) @property def source_info(self): @@ -502,7 +506,7 @@ def _add_pipeline_record_with_packet_key(self, tag: Tag, packet_key: str, packet table = self.tag_record_converter.from_python_packet_to_arrow_table(combined_info) - entry_hash = self.arrow_hasher.hash_table(table) + entry_hash = self.arrow_hasher.hash_table(table, prefix_hasher_id=True) # TODO: add error handling # check if record already exists: @@ -658,8 +662,8 @@ def get_all_entries_with_tags(self, keep_hidden_fields: bool = False) -> pl.Lazy ["__packet_key"] ) if not keep_hidden_fields: - pl_df = pl_df.select(self.all_keys) - return pl_df + pl_df = pl_df.select(self.tag_keys + self.output_keys) + return pl_df.lazy() @property def df(self) -> pl.DataFrame | None: diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index 864f649..394a454 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -10,9 +10,8 @@ from orcapod.core import Invocation, Kernel, SyncStream from orcapod.core.pod import FunctionPod -from orcapod.pipeline.wrappers import KernelNode, FunctionPodNode, Node +from orcapod.pipeline.nodes import KernelNode, FunctionPodNode, Node -from orcapod.hashing import hash_to_hex from orcapod.core.tracker import GraphTracker from orcapod.stores import ArrowDataStore diff --git a/src/orcapod/stores/dict_data_stores.py b/src/orcapod/stores/dict_data_stores.py index edb44e5..c4eff60 100644 --- a/src/orcapod/stores/dict_data_stores.py +++ b/src/orcapod/stores/dict_data_stores.py @@ -4,7 +4,7 @@ from os import PathLike from pathlib import Path -from orcapod.hashing import hash_packet +from orcapod.hashing.legacy_core import hash_packet from orcapod.hashing.types import LegacyPacketHasher from orcapod.hashing.defaults import get_default_composite_file_hasher from orcapod.stores.types import DataStore diff --git a/src/orcapod/utils/object_spec.py b/src/orcapod/utils/object_spec.py new file mode 100644 index 0000000..f359a8c --- /dev/null +++ b/src/orcapod/utils/object_spec.py @@ -0,0 +1,19 @@ +import importlib + +def parse_objectspec(obj_spec: dict) -> Any: + if "_class" in obj_spec: + # if _class is specified, treat the dict as an object specification + module_name, class_name = obj_spec["_class"].rsplit(".", 1) + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + configs = parse_objectspec(obj_spec.get("config", {})) + return cls(**configs) + else: + # otherwise, parse through the dictionary recursively + parsed_object = obj_spec + for k, v in obj_spec.items(): + if isinstance(v, dict): + parsed_object[k] = parse_objectspec(v) + else: + parsed_object[k] = v + return parsed_object \ No newline at end of file From e689d0dad0a27e505755cb80d816e1536002d7ee Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
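# The new src/orcapod/utils/object_spec.py above uses `Any` without importing it;
# a self-contained sketch of that module with the missing typing import added:
import importlib
from typing import Any


def parse_objectspec(obj_spec: dict) -> Any:
    if "_class" in obj_spec:
        # treat the dict as an object specification: resolve the class and build it
        module_name, class_name = obj_spec["_class"].rsplit(".", 1)
        cls = getattr(importlib.import_module(module_name), class_name)
        configs = parse_objectspec(obj_spec.get("config", {}))
        return cls(**configs)
    # otherwise, walk the dictionary recursively so nested specs are resolved too
    for k, v in obj_spec.items():
        if isinstance(v, dict):
            obj_spec[k] = parse_objectspec(v)
    return obj_spec


# e.g. a spec pointing at a stdlib class (illustrative target, not orcapod's):
print(parse_objectspec({"_class": "datetime.timedelta", "config": {"hours": 1}}))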
Walker" Date: Tue, 1 Jul 2025 07:09:12 +0000 Subject: [PATCH 033/224] fix: failure to reset cache due to mro mixup --- src/orcapod/pipeline/nodes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 2ac3763..ddc38b5 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -747,14 +747,14 @@ def __init__(self, kernel: Kernel, input_nodes: Collection["Node"], **kwargs): def reset_cache(self) -> None: ... -class KernelNode(Node, CachedKernelWrapper): +class KernelNode(CachedKernelWrapper, Node): """ A node that wraps a Kernel and provides a Node interface. This is useful for creating nodes in a pipeline that can be executed. """ -class FunctionPodNode(Node, CachedFunctionPodWrapper): +class FunctionPodNode(CachedFunctionPodWrapper, Node): """ A node that wraps a FunctionPod and provides a Node interface. This is useful for creating nodes in a pipeline that can be executed. From 62220649833f7c9b7543e928e08ae9ec0426ba12 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 1 Jul 2025 16:22:30 +0000 Subject: [PATCH 034/224] style: apply ruff format --- src/orcapod/core/base.py | 4 +- src/orcapod/core/operators.py | 21 +- src/orcapod/core/pod.py | 18 +- src/orcapod/core/streams.py | 1 - src/orcapod/core/tracker.py | 29 ++- src/orcapod/hashing/defaults.py | 10 +- src/orcapod/hashing/file_hashers.py | 14 +- src/orcapod/hashing/types.py | 15 +- src/orcapod/hashing/versioned_hashers.py | 15 +- src/orcapod/pipeline/__init__.py | 2 +- src/orcapod/pipeline/nodes.py | 140 ++++++++--- src/orcapod/pipeline/pipeline.py | 7 +- src/orcapod/stores/arrow_data_stores.py | 14 +- .../stores/delta_table_arrow_data_store.py | 228 +++++++++--------- src/orcapod/stores/optimized_memory_store.py | 120 ++++----- src/orcapod/stores/transfer_data_store.py | 4 +- src/orcapod/stores/types.py | 6 +- src/orcapod/types/core.py | 6 +- src/orcapod/types/packet_converter.py | 13 +- src/orcapod/types/packets.py | 141 +++++++---- src/orcapod/types/schemas.py | 115 +++++---- src/orcapod/types/semantic_type_registry.py | 48 ++-- src/orcapod/types/typespec_utils.py | 14 +- src/orcapod/utils/object_spec.py | 3 +- src/orcapod/utils/stream_utils.py | 9 +- .../test_basic_composite_hasher.py | 24 +- tests/test_hashing/test_hasher_factory.py | 24 +- .../test_legacy_composite_hasher.py | 15 +- tests/test_hashing/test_path_set_hasher.py | 7 +- .../test_string_cacher/test_redis_cacher.py | 4 +- 30 files changed, 643 insertions(+), 428 deletions(-) diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index f0d5362..2144c34 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -386,7 +386,9 @@ def types(self, *, trigger_run=False) -> tuple[TypeSpec | None, TypeSpec | None] # otherwise, use the keys from the first packet in the stream # note that this may be computationally expensive tag, packet = next(iter(self)) - return tag_types or get_typespec_from_dict(tag), packet_types or get_typespec_from_dict(packet) + return tag_types or get_typespec_from_dict( + tag + ), packet_types or get_typespec_from_dict(packet) def claims_unique_tags(self, *, trigger_run=False) -> bool | None: """ diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index 598f2e3..bcf63e3 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -13,11 +13,10 @@ check_packet_compatibility, join_tags, semijoin_tags, - fill_missing + fill_missing, ) - class Repeat(Operator): """ A Mapper 
that repeats the packets in the stream a specified number of times. @@ -185,6 +184,7 @@ def claims_unique_tags( return True + def union_lists(left, right): if left is None or right is None: return None @@ -193,7 +193,7 @@ def union_lists(left, right): if item not in output: output.append(item) return output - + class Join(Operator): def identity_structure(self, *streams): @@ -423,7 +423,7 @@ def keys( stream = streams[0] tag_keys, packet_keys = stream.keys(trigger_run=trigger_run) if tag_keys is None or packet_keys is None: - super_tag_keys, super_packet_keys = super().keys(trigger_run=trigger_run) + super_tag_keys, super_packet_keys = super().keys(trigger_run=trigger_run) tag_keys = tag_keys or super_tag_keys packet_keys = packet_keys or super_packet_keys @@ -583,10 +583,12 @@ def keys( return mapped_tag_keys, packet_keys + class SemiJoin(Operator): """ Perform semi-join on the left stream tags with the tags of the right stream """ + def identity_structure(self, *streams): # Restrict DOES depend on the order of the streams -- maintain as a tuple return (self.__class__.__name__,) + streams @@ -625,7 +627,9 @@ def forward(self, *streams: SyncStream) -> SyncStream: left_tag_typespec, left_packet_typespec = left_stream.types() right_tag_typespec, right_packet_typespec = right_stream.types() - common_tag_typespec = intersection_typespecs(left_tag_typespec, right_tag_typespec) + common_tag_typespec = intersection_typespecs( + left_tag_typespec, right_tag_typespec + ) common_tag_keys = None if common_tag_typespec is not None: common_tag_keys = list(common_tag_typespec.keys()) @@ -646,6 +650,7 @@ def generator() -> Iterator[tuple[Tag, Packet]]: def __repr__(self) -> str: return "SemiJoin()" + class Filter(Operator): """ A Mapper that filters the packets in the stream based on a predicate function. @@ -848,9 +853,9 @@ def generator() -> Iterator[tuple[Tag, Packet]]: if k not in new_tag: new_tag[k] = [t.get(k, None) for t, _ in packets] # combine all packets into a single packet - combined_packet: Packet = Packet({ - k: [p.get(k, None) for _, p in packets] for k in packet_keys - }) + combined_packet: Packet = Packet( + {k: [p.get(k, None) for _, p in packets] for k in packet_keys} + ) yield new_tag, combined_packet return SyncStreamFromGenerator(generator) diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index ae6778d..d64bafa 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -52,7 +52,6 @@ def set_active(self, active: bool) -> None: """ self._active = active - def process_stream(self, *streams: SyncStream) -> tuple[SyncStream, ...]: """ Prepare the incoming streams for execution in the pod. This default implementation @@ -72,7 +71,7 @@ def pre_forward_hook( self, *streams: SyncStream, **kwargs ) -> tuple[SyncStream, ...]: return self.process_stream(*streams) - + def generator_completion_hook(self, n_computed: int) -> None: """ Hook that is called when the generator is completed. 
This can be used to @@ -215,7 +214,9 @@ def __init__( ) ) - self.input_converter = PacketConverter(self.function_input_typespec, self.registry) + self.input_converter = PacketConverter( + self.function_input_typespec, self.registry + ) self.output_converter = PacketConverter( self.function_output_typespec, self.registry ) @@ -223,13 +224,16 @@ def __init__( def get_function_typespecs(self) -> tuple[TypeSpec, TypeSpec]: return self.function_input_typespec, self.function_output_typespec - def __repr__(self) -> str: return f"FunctionPod:{self.function!r}" def __str__(self) -> str: include_module = self.function.__module__ != "__main__" - func_sig = get_function_signature(self.function, name_override=self.function_name, include_module=include_module) + func_sig = get_function_signature( + self.function, + name_override=self.function_name, + include_module=include_module, + ) return f"FunctionPod:{func_sig}" def call(self, tag, packet) -> tuple[Tag, Packet | None]: @@ -258,7 +262,9 @@ def call(self, tag, packet) -> tuple[Tag, Packet | None]: f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" ) - output_packet: Packet = Packet({k: v for k, v in zip(self.output_keys, output_values)}) + output_packet: Packet = Packet( + {k: v for k, v in zip(self.output_keys, output_values)} + ) return tag, output_packet def identity_structure(self, *streams) -> Any: diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index 21060b1..243a1f4 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -104,4 +104,3 @@ def keys( return super().keys(trigger_run=trigger_run) # If the keys are already set, return them return self.tag_keys.copy(), self.packet_keys.copy() - \ No newline at end of file diff --git a/src/orcapod/core/tracker.py b/src/orcapod/core/tracker.py index 337c027..8f07ae3 100644 --- a/src/orcapod/core/tracker.py +++ b/src/orcapod/core/tracker.py @@ -3,6 +3,7 @@ from collections.abc import Collection, Iterator from typing import Any + class StreamWrapper(SyncStream): """ A wrapper for a SyncStream that allows it to be used as a Source. @@ -14,12 +15,16 @@ def __init__(self, stream: SyncStream, **kwargs): super().__init__(**kwargs) self.stream = stream - def keys(self, *streams: SyncStream, **kwargs) -> tuple[Collection[str]|None, Collection[str]|None]: + def keys( + self, *streams: SyncStream, **kwargs + ) -> tuple[Collection[str] | None, Collection[str] | None]: return self.stream.keys(*streams, **kwargs) - def types(self, *streams: SyncStream, **kwargs) -> tuple[TypeSpec|None, TypeSpec|None]: + def types( + self, *streams: SyncStream, **kwargs + ) -> tuple[TypeSpec | None, TypeSpec | None]: return self.stream.types(*streams, **kwargs) - + def computed_label(self) -> str | None: return self.stream.label @@ -28,8 +33,7 @@ def __iter__(self) -> Iterator[tuple[Tag, Packet]]: Iterate over the stream, yielding tuples of (tags, packets). """ yield from self.stream - - + class StreamSource(Source): def __init__(self, stream: SyncStream, **kwargs): @@ -43,25 +47,28 @@ def forward(self, *streams: SyncStream) -> SyncStream: "It generates its own stream from the file system." ) return StreamWrapper(self.stream) - + def identity_structure(self, *streams) -> Any: if len(streams) != 0: raise ValueError( "StreamSource does not support forwarding streams. " "It generates its own stream from the file system." 
) - + return (self.__class__.__name__, self.stream) - def types(self, *streams: SyncStream, **kwargs) -> tuple[TypeSpec|None, TypeSpec|None]: + def types( + self, *streams: SyncStream, **kwargs + ) -> tuple[TypeSpec | None, TypeSpec | None]: return self.stream.types() - - def keys(self, *streams: SyncStream, **kwargs) -> tuple[Collection[str]|None, Collection[str]|None]: + + def keys( + self, *streams: SyncStream, **kwargs + ) -> tuple[Collection[str] | None, Collection[str] | None]: return self.stream.keys() def computed_label(self) -> str | None: return self.stream.label - class GraphTracker(Tracker): diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index f9dee37..a9aebcd 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -13,7 +13,10 @@ from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory from orcapod.hashing.arrow_hashers import SemanticArrowHasher from orcapod.hashing.semantic_type_hashers import PathHasher -from orcapod.hashing.versioned_hashers import get_versioned_semantic_arrow_hasher, get_versioned_object_hasher +from orcapod.hashing.versioned_hashers import ( + get_versioned_semantic_arrow_hasher, + get_versioned_object_hasher, +) def get_default_arrow_hasher( @@ -39,7 +42,6 @@ def get_default_arrow_hasher( def get_default_object_hasher() -> ObjectHasher: object_hasher = get_versioned_object_hasher() return object_hasher - def get_legacy_object_hasher() -> ObjectHasher: @@ -59,7 +61,9 @@ def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHas return LegacyPathLikeHasherFactory.create_basic_legacy_composite() -def get_default_composite_file_hasher_with_cacher(cacher=None) -> LegacyCompositeFileHasher: +def get_default_composite_file_hasher_with_cacher( + cacher=None, +) -> LegacyCompositeFileHasher: if cacher is None: cacher = InMemoryCacher(max_size=None) return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index 64f48f8..f0ca8d1 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -67,7 +67,6 @@ def hash_file(self, file_path: PathLike) -> str: ) - class LegacyCachedFileHasher: """File hasher with caching.""" @@ -90,7 +89,6 @@ def hash_file(self, file_path: PathLike) -> str: return value - class LegacyDefaultPathsetHasher: """Default pathset hasher that composes file hashing.""" @@ -107,11 +105,11 @@ def _hash_file_to_hex(self, file_path: PathLike) -> str: def hash_pathset(self, pathset: PathSet) -> str: """Hash a pathset using the injected file hasher.""" - return legacy_core.hash_pathset( - pathset, - char_count=self.char_count, - file_hasher=self.file_hasher.hash_file, # Inject the method - ) + return legacy_core.hash_pathset( + pathset, + char_count=self.char_count, + file_hasher=self.file_hasher.hash_file, # Inject the method + ) class LegacyDefaultPacketHasher: @@ -197,7 +195,7 @@ def create_cached_legacy_composite( return LegacyDefaultCompositeFileHasher( cached_file_hasher, char_count, packet_prefix=algorithm ) - + @staticmethod def create_legacy_file_hasher( string_cacher: StringCacher | None = None, diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index 24afdbc..fabf812 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -44,12 +44,14 @@ def hash(self, obj: Any) -> bytes: ... 
@abstractmethod - def get_hasher_id(self) -> str: + def get_hasher_id(self) -> str: """ Returns a unique identifier/name assigned to the hasher """ - def hash_to_hex(self, obj: Any, char_count: int | None = None, prefix_hasher_id:bool=False) -> str: + def hash_to_hex( + self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False + ) -> str: hash_bytes = self.hash(obj) hex_str = hash_bytes.hex() @@ -95,6 +97,7 @@ def hash_file(self, file_path: PathLike) -> bytes: ... @runtime_checkable class ArrowHasher(Protocol): """Protocol for hashing arrow packets.""" + def get_hasher_id(self) -> str: ... def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... @@ -140,7 +143,7 @@ def set_cacher(self, cacher: StringCacher) -> None: pass -#---------------Legacy implementations and protocols to be deprecated--------------------- +# ---------------Legacy implementations and protocols to be deprecated--------------------- @runtime_checkable @@ -167,9 +170,9 @@ def hash_packet(self, packet: PacketLike) -> str: ... # Combined interface for convenience (optional) @runtime_checkable -class LegacyCompositeFileHasher(LegacyFileHasher, LegacyPathSetHasher, LegacyPacketHasher, Protocol): +class LegacyCompositeFileHasher( + LegacyFileHasher, LegacyPathSetHasher, LegacyPacketHasher, Protocol +): """Combined interface for all file-related hashing operations.""" pass - - diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 18c8680..e6095a0 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -35,15 +35,11 @@ "_class": "orcapod.hashing.object_hashers.BasicObjectHasher", "config": { "hasher_id": "object_v0.1", - "function_info_extractor" : { + "function_info_extractor": { "_class": "orcapod.hashing.function_info_extractors.FunctionSignatureExtractor", - "config": { - "include_module": True, - "include_defaults": True - } - - } - } + "config": {"include_module": True, "include_defaults": True}, + }, + }, } } @@ -91,7 +87,7 @@ def get_versioned_semantic_arrow_hasher( def get_versioned_object_hasher( version: str | None = None, -) -> ObjectHasher: +) -> ObjectHasher: """ Get an object hasher for the specified version. 
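# Standalone sketch of the hash_to_hex contract above (truncate, then optionally
# prefix with the hasher id). hashlib/sha256 stands in for orcapod's real hash().
import hashlib


class TinyObjectHasher:
    def get_hasher_id(self) -> str:
        return "object_v0.1"

    def hash(self, obj: object) -> bytes:
        return hashlib.sha256(repr(obj).encode()).digest()

    def hash_to_hex(self, obj, char_count=None, prefix_hasher_id=False) -> str:
        hex_str = self.hash(obj).hex()
        if char_count is not None:
            if char_count > len(hex_str):
                raise ValueError(
                    f"Cannot truncate to {char_count} chars, hash only has {len(hex_str)}"
                )
            hex_str = hex_str[:char_count]
        if prefix_hasher_id:
            hex_str = self.get_hasher_id() + ":" + hex_str
        return hex_str


print(TinyObjectHasher().hash_to_hex({"a": 1}, char_count=12, prefix_hasher_id=True))
# -> "object_v0.1:" followed by the first 12 hex characters of the digest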
@@ -109,4 +105,3 @@ def get_versioned_object_hasher( hasher_spec = versioned_object_hashers[version] return parse_objectspec(hasher_spec) - diff --git a/src/orcapod/pipeline/__init__.py b/src/orcapod/pipeline/__init__.py index 2bba49b..9a99f89 100644 --- a/src/orcapod/pipeline/__init__.py +++ b/src/orcapod/pipeline/__init__.py @@ -2,4 +2,4 @@ __all__ = [ "Pipeline", -] \ No newline at end of file +] diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index ddc38b5..b5bd54e 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -2,7 +2,11 @@ from orcapod.core import SyncStream, Source, Kernel from orcapod.stores import ArrowDataStore from orcapod.types import Tag, Packet, PacketLike, TypeSpec, default_registry -from orcapod.types.typespec_utils import get_typespec_from_dict, union_typespecs, extract_function_typespecs +from orcapod.types.typespec_utils import ( + get_typespec_from_dict, + union_typespecs, + extract_function_typespecs, +) from orcapod.types.semantic_type_registry import SemanticTypeRegistry from orcapod.types import packets, schemas from orcapod.hashing import ObjectHasher, ArrowHasher @@ -17,12 +21,18 @@ logger = logging.getLogger(__name__) + def get_tag_typespec(tag: Tag) -> dict[str, type]: return {k: str for k in tag} class PolarsSource(Source): - def __init__(self, df: pl.DataFrame, tag_keys: Collection[str], packet_keys: Collection[str]|None = None): + def __init__( + self, + df: pl.DataFrame, + tag_keys: Collection[str], + packet_keys: Collection[str] | None = None, + ): self.df = df self.tag_keys = tag_keys self.packet_keys = packet_keys @@ -37,7 +47,12 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: class PolarsStream(SyncStream): - def __init__(self, df: pl.DataFrame, tag_keys: Collection[str], packet_keys: Collection[str] | None = None): + def __init__( + self, + df: pl.DataFrame, + tag_keys: Collection[str], + packet_keys: Collection[str] | None = None, + ): self.df = df self.tag_keys = tuple(tag_keys) self.packet_keys = tuple(packet_keys) if packet_keys is not None else None @@ -48,9 +63,17 @@ def __iter__(self) -> Iterator[tuple[Tag, Packet]]: # df = df.select(self.tag_keys + self.packet_keys) for row in df.iter_rows(named=True): tag = {key: row[key] for key in self.tag_keys} - packet = {key: val for key, val in row.items() if key not in self.tag_keys and not key.startswith("_source_info_")} + packet = { + key: val + for key, val in row.items() + if key not in self.tag_keys and not key.startswith("_source_info_") + } # TODO: revisit and fix this rather hacky implementation - source_info = {key.removeprefix("_source_info_"):val for key, val in row.items() if key.startswith("_source_info_")} + source_info = { + key.removeprefix("_source_info_"): val + for key, val in row.items() + if key.startswith("_source_info_") + } yield tag, Packet(packet, source_info=source_info) @@ -142,8 +165,6 @@ def claims_unique_tags( *resolved_streams, trigger_run=trigger_run ) - - def post_call(self, tag: Tag, packet: Packet) -> None: ... def output_iterator_completion_hook(self) -> None: ... 
@@ -215,23 +236,31 @@ def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): self.update_cached_values() def update_cached_values(self): - self.source_info = self.store_path_prefix + (self.label, self.kernel_hasher.hash_to_hex(self.kernel, prefix_hasher_id=True)) + self.source_info = self.store_path_prefix + ( + self.label, + self.kernel_hasher.hash_to_hex(self.kernel, prefix_hasher_id=True), + ) self.tag_keys, self.packet_keys = self.keys(trigger_run=False) self.tag_typespec, self.packet_typespec = self.types(trigger_run=False) if self.tag_typespec is None or self.packet_typespec is None: - raise ValueError("Currently, cached kernel wrapper can only work with kernels that have typespecs defined.") + raise ValueError( + "Currently, cached kernel wrapper can only work with kernels that have typespecs defined." + ) # TODO: clean up and make it unnecessary to convert packet typespec packet_schema = schemas.PythonSchema(self.packet_typespec) - joined_typespec = union_typespecs(self.tag_typespec, packet_schema.with_source_info) + joined_typespec = union_typespecs( + self.tag_typespec, packet_schema.with_source_info + ) if joined_typespec is None: raise ValueError( "Joined typespec should not be None. " "This may happen if the tag typespec and packet typespec are incompatible." ) # Add any additional fields to the output converter here - self.output_converter = packets.PacketConverter(joined_typespec, registry=self.registry, include_source_info=False) + self.output_converter = packets.PacketConverter( + joined_typespec, registry=self.registry, include_source_info=False + ) - def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: logger.info(f"Returning cached outputs for {self}") @@ -240,8 +269,10 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: raise ValueError( "CachedKernelWrapper has no tag keys defined, cannot return PolarsStream" ) - source_info_sig = ':'.join(self.source_info) - return PolarsStream(self.df, tag_keys=self.tag_keys, packet_keys=self.packet_keys) + source_info_sig = ":".join(self.source_info) + return PolarsStream( + self.df, tag_keys=self.tag_keys, packet_keys=self.packet_keys + ) else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.packet_keys) @@ -268,7 +299,9 @@ def post_call(self, tag: Tag, packet: Packet) -> None: # If an entry with same tag and packet already exists in the output store, # it will not be added again, thus avoiding duplicates. 
merged_info = {**tag, **packet.get_composite()} - output_table = self.output_converter.from_python_packet_to_arrow_table(merged_info) + output_table = self.output_converter.from_python_packet_to_arrow_table( + merged_info + ) # TODO: revisit this logic output_id = self.arrow_hasher.hash_table(output_table, prefix_hasher_id=True) if not self.output_store.get_record(self.source_info, output_id): @@ -285,7 +318,6 @@ def output_iterator_completion_hook(self) -> None: logger.info(f"Results cached for {self}") self._cache_computed = True - @property def lazy_df(self) -> pl.LazyFrame | None: return self.output_store.get_all_records_as_polars(self.source_info) @@ -424,11 +456,13 @@ def registry(self, registry: SemanticTypeRegistry | None = None): self.update_cached_values() def update_cached_values(self) -> None: - self.function_pod_hash = self.object_hasher.hash_to_hex(self.function_pod, prefix_hasher_id=True) - self.input_typespec, self.output_typespec = self.function_pod.get_function_typespecs() + self.function_pod_hash = self.object_hasher.hash_to_hex( + self.function_pod, prefix_hasher_id=True + ) + self.input_typespec, self.output_typespec = ( + self.function_pod.get_function_typespecs() + ) self.tag_keys, self.output_keys = self.keys(trigger_run=False) - - if self.tag_keys is None or self.output_keys is None: raise ValueError( @@ -445,15 +479,26 @@ def update_cached_values(self) -> None: self.function_pod.get_function_typespecs() ) - self.input_converter = packets.PacketConverter(self.input_typespec, self.registry, include_source_info=False) - self.output_converter = packets.PacketConverter(self.output_typespec, self.registry, include_source_info=True) + self.input_converter = packets.PacketConverter( + self.input_typespec, self.registry, include_source_info=False + ) + self.output_converter = packets.PacketConverter( + self.output_typespec, self.registry, include_source_info=True + ) - input_packet_source_typespec = {f'_source_info_{k}': str for k in self.input_typespec} + input_packet_source_typespec = { + f"_source_info_{k}": str for k in self.input_typespec + } # prepare typespec for tag record: __packet_key, tag, input packet source_info, - tag_record_typespec = {"__packet_key": str, **self.tag_typespec, **input_packet_source_typespec} - self.tag_record_converter = packets.PacketConverter(tag_record_typespec, self.registry, include_source_info=False) - + tag_record_typespec = { + "__packet_key": str, + **self.tag_typespec, + **input_packet_source_typespec, + } + self.tag_record_converter = packets.PacketConverter( + tag_record_typespec, self.registry, include_source_info=False + ) def reset_cache(self): self._cache_computed = False @@ -472,14 +517,19 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if lazy_df is not None: if self.tag_keys is None: raise ValueError("Tag keys are not set, cannot return PolarsStream") - return PolarsStream(lazy_df.collect(), self.tag_keys, packet_keys=self.output_keys) + return PolarsStream( + lazy_df.collect(), self.tag_keys, packet_keys=self.output_keys + ) else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.output_keys) logger.info(f"Computing and caching outputs for {self}") return super().forward(*streams, **kwargs) def get_packet_key(self, packet: Packet) -> str: - return self.arrow_hasher.hash_table(self.input_converter.from_python_packet_to_arrow_table(packet), prefix_hasher_id=True) + return self.arrow_hasher.hash_table( + self.input_converter.from_python_packet_to_arrow_table(packet), + prefix_hasher_id=True, + ) 
@property def source_info(self): @@ -493,18 +543,24 @@ def add_pipeline_record(self, tag: Tag, packet: Packet) -> Tag: Record the tag for the packet in the record store. This is used to keep track of the tags associated with memoized packets. """ - return self._add_pipeline_record_with_packet_key(tag, self.get_packet_key(packet), packet.source_info) + return self._add_pipeline_record_with_packet_key( + tag, self.get_packet_key(packet), packet.source_info + ) - def _add_pipeline_record_with_packet_key(self, tag: Tag, packet_key: str, packet_source_info: dict[str, str | None]) -> Tag: + def _add_pipeline_record_with_packet_key( + self, tag: Tag, packet_key: str, packet_source_info: dict[str, str | None] + ) -> Tag: if self.tag_store is None: raise ValueError("Recording of tag requires tag_store but none provided") combined_info = dict(tag) # ensure we don't modify the original tag combined_info["__packet_key"] = packet_key for k, v in packet_source_info.items(): - combined_info[f'_source_info_{k}'] = v + combined_info[f"_source_info_{k}"] = v - table = self.tag_record_converter.from_python_packet_to_arrow_table(combined_info) + table = self.tag_record_converter.from_python_packet_to_arrow_table( + combined_info + ) entry_hash = self.arrow_hasher.hash_table(table, prefix_hasher_id=True) @@ -553,7 +609,9 @@ def memoize( Returns the memoized packet. """ logger.debug("Memoizing packet") - return self._memoize_with_packet_key(self.get_packet_key(packet), output_packet.get_composite()) + return self._memoize_with_packet_key( + self.get_packet_key(packet), output_packet.get_composite() + ) def _memoize_with_packet_key( self, packet_key: str, output_packet: PacketLike @@ -581,7 +639,6 @@ def _memoize_with_packet_key( # attach provenance information return Packet(packet) - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: packet_key = "" if ( @@ -609,8 +666,11 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: if output_packet is not None and not self.skip_memoization: # output packet may be modified by the memoization process # e.g. if the output is a file, the path may be changed - # add source info to the output packet - source_info = {k: '-'.join(self.source_info) + "-" + packet_key for k in output_packet.source_info} + # add source info to the output packet + source_info = { + k: "-".join(self.source_info) + "-" + packet_key + for k in output_packet.source_info + } # TODO: fix and make this not access protected field directly output_packet.source_info = source_info output_packet = self._memoize_with_packet_key(packet_key, output_packet) # type: ignore @@ -624,7 +684,9 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: # result was successfully computed -- save the tag if not self.skip_tag_record and self.tag_store is not None: - self._add_pipeline_record_with_packet_key(tag, packet_key, packet.source_info) + self._add_pipeline_record_with_packet_key( + tag, packet_key, packet.source_info + ) return tag, output_packet @@ -639,7 +701,9 @@ def get_all_tags(self, with_packet_id: bool = False) -> pl.LazyFrame | None: return data.drop("__packet_key") if data is not None else None return data - def get_all_entries_with_tags(self, keep_hidden_fields: bool = False) -> pl.LazyFrame | None: + def get_all_entries_with_tags( + self, keep_hidden_fields: bool = False + ) -> pl.LazyFrame | None: """ Retrieve all entries from the tag store with their associated tags. Returns a DataFrame with columns for tag and packet key. 
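# Generic sketch of the content-addressed memoization flow used by the cached
# function-pod wrapper above: key each output by a hash of the input packet and
# reuse it on repeat calls. A plain dict and json/sha256 stand in for the real
# data store and Arrow hasher.
import hashlib
import json


def packet_key(packet: dict) -> str:
    return hashlib.sha256(json.dumps(packet, sort_keys=True).encode()).hexdigest()


memo_store: dict[str, dict] = {}


def call_with_memoization(func, packet: dict) -> dict:
    key = packet_key(packet)
    if key in memo_store:            # memoization lookup
        return memo_store[key]
    output = func(**packet)          # actual computation
    memo_store[key] = output         # memoize under the input-derived key
    return output


result = call_with_memoization(lambda x, y: {"total": x + y}, {"x": 1, "y": 2})
print(result, len(memo_store))       # {'total': 3} 1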
diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index 394a454..7e04d96 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -95,7 +95,12 @@ def wrap_invocation(self, kernel: Kernel, input_nodes: Collection[Node]) -> Node tag_store=self.pipeline_store, store_path_prefix=self.name, ) - return KernelNode(kernel, input_nodes, output_store=self.pipeline_store, store_path_prefix=self.name) + return KernelNode( + kernel, + input_nodes, + output_store=self.pipeline_store, + store_path_prefix=self.name, + ) def compile(self): import networkx as nx diff --git a/src/orcapod/stores/arrow_data_stores.py b/src/orcapod/stores/arrow_data_stores.py index 2608cbc..2897ead 100644 --- a/src/orcapod/stores/arrow_data_stores.py +++ b/src/orcapod/stores/arrow_data_stores.py @@ -24,7 +24,11 @@ def __init__(self): logger.info("Initialized MockArrowDataStore") def add_record( - self, source_pathh: tuple[str, ...], source_id: str, entry_id: str, arrow_data: pa.Table + self, + source_pathh: tuple[str, ...], + source_id: str, + entry_id: str, + arrow_data: pa.Table, ) -> pa.Table: """Add a record to the mock store.""" return arrow_data @@ -35,7 +39,9 @@ def get_record( """Get a specific record.""" return None - def get_all_records(self, source_path: tuple[str, ...], source_id: str) -> pa.Table | None: + def get_all_records( + self, source_path: tuple[str, ...], source_id: str + ) -> pa.Table | None: """Retrieve all records for a given source as a single table.""" return None @@ -47,7 +53,7 @@ def get_all_records_as_polars( def get_records_by_ids( self, - source_path: tuple[str,...], + source_path: tuple[str, ...], source_id: str, entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, @@ -77,7 +83,7 @@ def get_records_by_ids( def get_records_by_ids_as_polars( self, - source_path: tuple[str,...], + source_path: tuple[str, ...], source_id: str, entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, diff --git a/src/orcapod/stores/delta_table_arrow_data_store.py b/src/orcapod/stores/delta_table_arrow_data_store.py index d4fcaf3..c05dea9 100644 --- a/src/orcapod/stores/delta_table_arrow_data_store.py +++ b/src/orcapod/stores/delta_table_arrow_data_store.py @@ -13,7 +13,7 @@ class DeltaTableArrowDataStore: """ Delta Table-based Arrow data store with flexible hierarchical path support. - + Uses tuple-based source paths for robust parameter handling: - ("source_name", "source_id") -> source_name/source_id/ - ("org", "project", "dataset") -> org/project/dataset/ @@ -21,11 +21,11 @@ class DeltaTableArrowDataStore: """ def __init__( - self, - base_path: str | Path, + self, + base_path: str | Path, duplicate_entry_behavior: str = "error", create_base_path: bool = True, - max_hierarchy_depth: int = 10 + max_hierarchy_depth: int = 10, ): """ Initialize the DeltaTableArrowDataStore. 
@@ -41,19 +41,21 @@ def __init__( # Validate duplicate behavior if duplicate_entry_behavior not in ["error", "overwrite"]: raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") - + self.duplicate_entry_behavior = duplicate_entry_behavior self.base_path = Path(base_path) self.max_hierarchy_depth = max_hierarchy_depth - + if create_base_path: self.base_path.mkdir(parents=True, exist_ok=True) elif not self.base_path.exists(): - raise ValueError(f"Base path {self.base_path} does not exist and create_base_path=False") - + raise ValueError( + f"Base path {self.base_path} does not exist and create_base_path=False" + ) + # Cache for Delta tables to avoid repeated initialization self._delta_table_cache: dict[str, DeltaTable] = {} - + logger.info( f"Initialized DeltaTableArrowDataStore at {self.base_path} " f"with duplicate_entry_behavior='{duplicate_entry_behavior}'" @@ -62,28 +64,34 @@ def __init__( def _validate_source_path(self, source_path: tuple[str, ...]) -> None: """ Validate source path components. - + Args: source_path: Tuple of path components - + Raises: ValueError: If path is invalid """ if not source_path: raise ValueError("Source path cannot be empty") - + if len(source_path) > self.max_hierarchy_depth: - raise ValueError(f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}") - + raise ValueError( + f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" + ) + # Validate path components for i, component in enumerate(source_path): if not component or not isinstance(component, str): - raise ValueError(f"Source path component {i} is invalid: {repr(component)}") - + raise ValueError( + f"Source path component {i} is invalid: {repr(component)}" + ) + # Check for filesystem-unsafe characters - unsafe_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\0'] + unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] if any(char in component for char in unsafe_chars): - raise ValueError(f"Source path component contains invalid characters: {repr(component)}") + raise ValueError( + f"Source path component contains invalid characters: {repr(component)}" + ) def _get_source_key(self, source_path: tuple[str, ...]) -> str: """Generate cache key for source storage.""" @@ -115,13 +123,11 @@ def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: return arrow_data def _handle_entry_id_column( - self, - arrow_data: pa.Table, - add_entry_id_column: bool | str = False + self, arrow_data: pa.Table, add_entry_id_column: bool | str = False ) -> pa.Table: """ Handle entry_id column based on add_entry_id_column parameter. - + Args: arrow_data: Arrow table with __entry_id column add_entry_id_column: Control entry ID column inclusion: @@ -167,16 +173,16 @@ def add_record( ValueError: If entry_id already exists and duplicate_entry_behavior is 'error' """ self._validate_source_path(source_path) - + table_path = self._get_table_path(source_path) source_key = self._get_source_key(source_path) - + # Ensure directory exists table_path.mkdir(parents=True, exist_ok=True) - + # Add entry_id column to the data data_with_entry_id = self._ensure_entry_id_column(arrow_data, entry_id) - + # Check for existing entry if needed if not ignore_duplicate and self.duplicate_entry_behavior == "error": existing_record = self.get_record(source_path, entry_id) @@ -185,41 +191,36 @@ def add_record( f"Entry '{entry_id}' already exists in {'/'.join(source_path)}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." 
) - + try: # Try to load existing table delta_table = DeltaTable(str(table_path)) - + if self.duplicate_entry_behavior == "overwrite": # Delete existing record if it exists, then append new one try: # First, delete existing record with this entry_id delta_table.delete(f"__entry_id = '{entry_id}'") - logger.debug(f"Deleted existing record {entry_id} from {source_key}") + logger.debug( + f"Deleted existing record {entry_id} from {source_key}" + ) except Exception as e: # If delete fails (e.g., record doesn't exist), that's fine logger.debug(f"No existing record to delete for {entry_id}: {e}") - + # Append new record write_deltalake( - str(table_path), - data_with_entry_id, - mode="append", - schema_mode="merge" + str(table_path), data_with_entry_id, mode="append", schema_mode="merge" ) - + except TableNotFoundError: # Table doesn't exist, create it - write_deltalake( - str(table_path), - data_with_entry_id, - mode="overwrite" - ) + write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") logger.debug(f"Created new Delta table for {source_key}") - + # Update cache self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - + logger.debug(f"Added record {entry_id} to {source_key}") return arrow_data @@ -228,36 +229,36 @@ def get_record( ) -> pa.Table | None: """ Get a specific record by entry_id. - + Args: source_path: Tuple of path components entry_id: Unique identifier for the record - + Returns: Arrow table for the record, or None if not found """ self._validate_source_path(source_path) - + table_path = self._get_table_path(source_path) - + try: delta_table = DeltaTable(str(table_path)) - + # Query for the specific entry_id - result = delta_table.to_pyarrow_table( - filter=f"__entry_id = '{entry_id}'" - ) - + result = delta_table.to_pyarrow_table(filter=f"__entry_id = '{entry_id}'") + if len(result) == 0: return None - + # Remove the __entry_id column before returning return self._remove_entry_id_column(result) - + except TableNotFoundError: return None except Exception as e: - logger.error(f"Error getting record {entry_id} from {'/'.join(source_path)}: {e}") + logger.error( + f"Error getting record {entry_id} from {'/'.join(source_path)}: {e}" + ) return None def get_all_records( @@ -265,31 +266,31 @@ def get_all_records( ) -> pa.Table | None: """ Retrieve all records for a given source path as a single table. - + Args: source_path: Tuple of path components add_entry_id_column: Control entry ID column inclusion: - False: Don't include entry ID column (default) - True: Include entry ID column as "__entry_id" - str: Include entry ID column with custom name - + Returns: Arrow table containing all records, or None if no records found """ self._validate_source_path(source_path) - + table_path = self._get_table_path(source_path) - + try: delta_table = DeltaTable(str(table_path)) result = delta_table.to_pyarrow_table() - + if len(result) == 0: return None - + # Handle entry_id column based on parameter return self._handle_entry_id_column(result, add_entry_id_column) - + except TableNotFoundError: return None except Exception as e: @@ -301,10 +302,10 @@ def get_all_records_as_polars( ) -> pl.LazyFrame | None: """ Retrieve all records for a given source path as a single Polars LazyFrame. 
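# Usage sketch for DeltaTableArrowDataStore. Keyword arguments are used because
# the diff only shows parts of add_record's signature; the path, entry id, and
# table contents are made up.
import pyarrow as pa
from orcapod.stores.delta_table_arrow_data_store import DeltaTableArrowDataStore

store = DeltaTableArrowDataStore(
    "/tmp/orcapod_delta_store", duplicate_entry_behavior="overwrite"
)

source = ("org", "project", "dataset")   # hierarchical tuple path -> org/project/dataset/
batch = pa.table({"sample": ["s1", "s2"], "score": [0.91, 0.47]})

store.add_record(source_path=source, entry_id="run-001", arrow_data=batch)
roundtrip = store.get_record(source, "run-001")            # __entry_id column stripped
everything = store.get_all_records(source, add_entry_id_column=True)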
- + Args: source_path: Tuple of path components - + Returns: Polars LazyFrame containing all records, or None if no records found """ @@ -333,7 +334,7 @@ def get_records_by_ids( Arrow table containing all found records, or None if no records found """ self._validate_source_path(source_path) - + # Convert input to list of strings for consistency if isinstance(entry_ids, list): if not entry_ids: @@ -353,39 +354,41 @@ def get_records_by_ids( ) table_path = self._get_table_path(source_path) - + try: delta_table = DeltaTable(str(table_path)) - + # Create filter for the entry IDs - escape single quotes in IDs escaped_ids = [id_.replace("'", "''") for id_ in entry_ids_list] id_filter = " OR ".join([f"__entry_id = '{id_}'" for id_ in escaped_ids]) - + result = delta_table.to_pyarrow_table(filter=id_filter) - + if len(result) == 0: return None - + if preserve_input_order: # Need to reorder results and add nulls for missing entries import pandas as pd - + df = result.to_pandas() - df = df.set_index('__entry_id') - + df = df.set_index("__entry_id") + # Create a DataFrame with the desired order, filling missing with NaN ordered_df = df.reindex(entry_ids_list) - + # Convert back to Arrow result = pa.Table.from_pandas(ordered_df.reset_index()) - + # Handle entry_id column based on parameter return self._handle_entry_id_column(result, add_entry_id_column) - + except TableNotFoundError: return None except Exception as e: - logger.error(f"Error getting records by IDs from {'/'.join(source_path)}: {e}") + logger.error( + f"Error getting records by IDs from {'/'.join(source_path)}: {e}" + ) return None def get_records_by_ids_as_polars( @@ -397,13 +400,13 @@ def get_records_by_ids_as_polars( ) -> pl.LazyFrame | None: """ Retrieve records by entry IDs as a single Polars LazyFrame. - + Args: source_path: Tuple of path components entry_ids: Entry IDs to retrieve add_entry_id_column: Control entry ID column inclusion preserve_input_order: If True, return results in input order with nulls for missing - + Returns: Polars LazyFrame containing all found records, or None if no records found """ @@ -421,20 +424,20 @@ def get_records_by_ids_as_polars( def list_sources(self) -> list[tuple[str, ...]]: """ List all available source paths. - + Returns: List of source path tuples """ sources = [] - + def _scan_directory(current_path: Path, path_components: tuple[str, ...]): """Recursively scan for Delta tables.""" for item in current_path.iterdir(): if not item.is_dir(): continue - + new_path_components = path_components + (item.name,) - + # Check if this directory contains a Delta table try: DeltaTable(str(item)) @@ -443,40 +446,41 @@ def _scan_directory(current_path: Path, path_components: tuple[str, ...]): # Not a Delta table, continue scanning subdirectories if len(new_path_components) < self.max_hierarchy_depth: _scan_directory(item, new_path_components) - + _scan_directory(self.base_path, ()) return sources def delete_source(self, source_path: tuple[str, ...]) -> bool: """ Delete an entire source (all records for a source path). 
- + Args: source_path: Tuple of path components - + Returns: True if source was deleted, False if it didn't exist """ self._validate_source_path(source_path) - + table_path = self._get_table_path(source_path) source_key = self._get_source_key(source_path) - + if not table_path.exists(): return False - + try: # Remove from cache if source_key in self._delta_table_cache: del self._delta_table_cache[source_key] - + # Remove directory import shutil + shutil.rmtree(table_path) - + logger.info(f"Deleted source {source_key}") return True - + except Exception as e: logger.error(f"Error deleting source {source_key}: {e}") return False @@ -484,64 +488,68 @@ def delete_source(self, source_path: tuple[str, ...]) -> bool: def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: """ Delete a specific record. - + Args: source_path: Tuple of path components entry_id: ID of the record to delete - + Returns: True if record was deleted, False if it didn't exist """ self._validate_source_path(source_path) - + table_path = self._get_table_path(source_path) - + try: delta_table = DeltaTable(str(table_path)) - + # Check if record exists escaped_entry_id = entry_id.replace("'", "''") - existing = delta_table.to_pyarrow_table(filter=f"__entry_id = '{escaped_entry_id}'") + existing = delta_table.to_pyarrow_table( + filter=f"__entry_id = '{escaped_entry_id}'" + ) if len(existing) == 0: return False - + # Delete the record delta_table.delete(f"__entry_id = '{escaped_entry_id}'") - + # Update cache source_key = self._get_source_key(source_path) self._delta_table_cache[source_key] = delta_table - + logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") return True - + except TableNotFoundError: return False except Exception as e: - logger.error(f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}") + logger.error( + f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" + ) return False def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: """ Get metadata information about a Delta table. - + Args: source_path: Tuple of path components - + Returns: Dictionary with table metadata, or None if table doesn't exist """ self._validate_source_path(source_path) - + table_path = self._get_table_path(source_path) - + try: delta_table = DeltaTable(str(table_path)) - + # Get basic info schema = delta_table.schema() history = delta_table.history() - + return { "path": str(table_path), "source_path": source_path, @@ -551,9 +559,9 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: "history_length": len(history), "latest_commit": history[0] if history else None, } - + except TableNotFoundError: return None except Exception as e: logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") - return None \ No newline at end of file + return None diff --git a/src/orcapod/stores/optimized_memory_store.py b/src/orcapod/stores/optimized_memory_store.py index ff962e9..1859113 100644 --- a/src/orcapod/stores/optimized_memory_store.py +++ b/src/orcapod/stores/optimized_memory_store.py @@ -11,7 +11,7 @@ class ArrowBatchedPolarsDataStore: """ Arrow-batched Polars data store that minimizes Arrow<->Polars conversions. - + Key optimizations: 1. Keep data in Arrow format during batching 2. 
Only convert to Polars when consolidating or querying @@ -32,22 +32,22 @@ def __init__(self, duplicate_entry_behavior: str = "error", batch_size: int = 10 """ if duplicate_entry_behavior not in ["error", "overwrite"]: raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") - + self.duplicate_entry_behavior = duplicate_entry_behavior self.batch_size = batch_size # Arrow batch buffer: {source_key: [(entry_id, arrow_table), ...]} self._arrow_batches: Dict[str, List[Tuple[str, pa.Table]]] = defaultdict(list) - + # Consolidated Polars store: {source_key: polars_dataframe} self._polars_store: Dict[str, pl.DataFrame] = {} - + # Entry ID index for fast lookups: {source_key: set[entry_ids]} self._entry_index: Dict[str, set] = defaultdict(set) - + # Schema cache self._schema_cache: Dict[str, pa.Schema] = {} - + logger.info( f"Initialized ArrowBatchedPolarsDataStore with " f"duplicate_entry_behavior='{duplicate_entry_behavior}', batch_size={batch_size}" @@ -61,7 +61,7 @@ def _add_entry_id_to_arrow_table(self, table: pa.Table, entry_id: str) -> pa.Tab """Add entry_id column to Arrow table efficiently.""" # Create entry_id array with the same length as the table entry_id_array = pa.array([entry_id] * len(table), type=pa.string()) - + # Add column at the beginning for consistent ordering return table.add_column(0, "__entry_id", entry_id_array) @@ -69,36 +69,40 @@ def _consolidate_arrow_batch(self, source_key: str) -> None: """Consolidate Arrow batch into Polars DataFrame.""" if source_key not in self._arrow_batches or not self._arrow_batches[source_key]: return - - logger.debug(f"Consolidating {len(self._arrow_batches[source_key])} Arrow tables for {source_key}") - + + logger.debug( + f"Consolidating {len(self._arrow_batches[source_key])} Arrow tables for {source_key}" + ) + # Prepare all Arrow tables with entry_id columns arrow_tables_with_id = [] - + for entry_id, arrow_table in self._arrow_batches[source_key]: table_with_id = self._add_entry_id_to_arrow_table(arrow_table, entry_id) arrow_tables_with_id.append(table_with_id) - + # Concatenate all Arrow tables at once (very fast) if len(arrow_tables_with_id) == 1: consolidated_arrow = arrow_tables_with_id[0] else: consolidated_arrow = pa.concat_tables(arrow_tables_with_id) - + # Single conversion to Polars new_polars_df = cast(pl.DataFrame, pl.from_arrow(consolidated_arrow)) - + # Combine with existing Polars DataFrame if it exists if source_key in self._polars_store: existing_df = self._polars_store[source_key] self._polars_store[source_key] = pl.concat([existing_df, new_polars_df]) else: self._polars_store[source_key] = new_polars_df - + # Clear the Arrow batch self._arrow_batches[source_key].clear() - - logger.debug(f"Consolidated to Polars DataFrame with {len(self._polars_store[source_key])} total rows") + + logger.debug( + f"Consolidated to Polars DataFrame with {len(self._polars_store[source_key])} total rows" + ) def _force_consolidation(self, source_key: str) -> None: """Force consolidation of Arrow batches.""" @@ -119,7 +123,7 @@ def add_record( ) -> pa.Table: """ Add a record to the store using Arrow batching. - + This is the fastest path - no conversions, just Arrow table storage. 
""" source_key = self._get_source_key(source_name, source_id) @@ -135,15 +139,16 @@ def add_record( # Handle overwrite: remove from both Arrow batch and Polars store # Remove from Arrow batch self._arrow_batches[source_key] = [ - (eid, table) for eid, table in self._arrow_batches[source_key] + (eid, table) + for eid, table in self._arrow_batches[source_key] if eid != entry_id ] - + # Remove from Polars store if it exists if source_key in self._polars_store: - self._polars_store[source_key] = self._polars_store[source_key].filter( - pl.col("__entry_id") != entry_id - ) + self._polars_store[source_key] = self._polars_store[ + source_key + ].filter(pl.col("__entry_id") != entry_id) # Schema validation (cached) if source_key in self._schema_cache: @@ -159,7 +164,7 @@ def add_record( # Add to Arrow batch (no conversion yet!) self._arrow_batches[source_key].append((entry_id, arrow_data)) self._entry_index[source_key].add(entry_id) - + # Consolidate if batch is full if len(self._arrow_batches[source_key]) >= self.batch_size: self._consolidate_arrow_batch(source_key) @@ -172,16 +177,16 @@ def get_record( ) -> pa.Table | None: """Get a specific record with optimized lookup.""" source_key = self._get_source_key(source_name, source_id) - + # Quick existence check if entry_id not in self._entry_index[source_key]: return None - + # Check Arrow batch first (most recent data) for batch_entry_id, arrow_table in self._arrow_batches[source_key]: if batch_entry_id == entry_id: return arrow_table - + # Check consolidated Polars store df = self._get_consolidated_dataframe(source_key) if df is None: @@ -189,7 +194,7 @@ def get_record( # Filter and convert back to Arrow filtered_df = df.filter(pl.col("__entry_id") == entry_id).drop("__entry_id") - + if filtered_df.height == 0: return None @@ -200,7 +205,7 @@ def get_all_records( ) -> pa.Table | None: """Retrieve all records as a single Arrow table.""" source_key = self._get_source_key(source_name, source_id) - + # Force consolidation to include all data df = self._get_consolidated_dataframe(source_key) if df is None or df.height == 0: @@ -223,7 +228,7 @@ def get_all_records_as_polars( ) -> pl.LazyFrame | None: """Retrieve all records as a Polars LazyFrame.""" source_key = self._get_source_key(source_name, source_id) - + df = self._get_consolidated_dataframe(source_key) if df is None or df.height == 0: return None @@ -256,20 +261,21 @@ def get_records_by_ids( raise TypeError(f"entry_ids must be list[str], pl.Series, or pa.Array") source_key = self._get_source_key(source_name, source_id) - + # Quick filter using index existing_entries = [ - entry_id for entry_id in entry_ids_list + entry_id + for entry_id in entry_ids_list if entry_id in self._entry_index[source_key] ] - + if not existing_entries and not preserve_input_order: return None # Collect from Arrow batch first batch_tables = [] found_in_batch = set() - + for entry_id, arrow_table in self._arrow_batches[source_key]: if entry_id in entry_ids_list: table_with_id = self._add_entry_id_to_arrow_table(arrow_table, entry_id) @@ -278,7 +284,7 @@ def get_records_by_ids( # Get remaining from consolidated store remaining_ids = [eid for eid in existing_entries if eid not in found_in_batch] - + consolidated_tables = [] if remaining_ids: df = self._get_consolidated_dataframe(source_key) @@ -288,13 +294,13 @@ def get_records_by_ids( result_df = ordered_df.join(df, on="__entry_id", how="left") else: result_df = df.filter(pl.col("__entry_id").is_in(remaining_ids)) - + if result_df.height > 0: 
consolidated_tables.append(result_df.to_arrow()) # Combine all results all_tables = batch_tables + consolidated_tables - + if not all_tables: return None @@ -309,7 +315,9 @@ def get_records_by_ids( # Remove __entry_id column column_names = result_table.column_names if "__entry_id" in column_names: - indices = [i for i, name in enumerate(column_names) if name != "__entry_id"] + indices = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] result_table = result_table.select(indices) elif isinstance(add_entry_id_column, str): # Rename __entry_id column @@ -337,7 +345,7 @@ def get_records_by_ids_as_polars( if arrow_result is None: return None - + pl_result = cast(pl.DataFrame, pl.from_arrow(arrow_result)) return pl_result.lazy() @@ -370,7 +378,7 @@ def force_consolidation(self) -> None: def clear_source(self, source_name: str, source_id: str) -> None: """Clear all data for a source.""" source_key = self._get_source_key(source_name, source_id) - + if source_key in self._arrow_batches: del self._arrow_batches[source_key] if source_key in self._polars_store: @@ -379,7 +387,7 @@ def clear_source(self, source_name: str, source_id: str) -> None: del self._entry_index[source_key] if source_key in self._schema_cache: del self._schema_cache[source_key] - + logger.debug(f"Cleared source {source_key}") def clear_all(self) -> None: @@ -394,25 +402,29 @@ def get_stats(self) -> dict[str, Any]: """Get comprehensive statistics.""" total_records = sum(len(entries) for entries in self._entry_index.values()) total_batched = sum(len(batch) for batch in self._arrow_batches.values()) - total_consolidated = sum( - len(df) for df in self._polars_store.values() - ) if self._polars_store else 0 - + total_consolidated = ( + sum(len(df) for df in self._polars_store.values()) + if self._polars_store + else 0 + ) + source_stats = [] for source_key in self._entry_index.keys(): record_count = len(self._entry_index[source_key]) batched_count = len(self._arrow_batches.get(source_key, [])) consolidated_count = 0 - + if source_key in self._polars_store: consolidated_count = len(self._polars_store[source_key]) - - source_stats.append({ - "source_key": source_key, - "total_records": record_count, - "batched_records": batched_count, - "consolidated_records": consolidated_count, - }) + + source_stats.append( + { + "source_key": source_key, + "total_records": record_count, + "batched_records": batched_count, + "consolidated_records": consolidated_count, + } + ) return { "total_records": total_records, @@ -430,4 +442,4 @@ def optimize_for_reads(self) -> None: self.force_consolidation() # Clear Arrow batches to save memory self._arrow_batches.clear() - logger.info("Optimization complete") \ No newline at end of file + logger.info("Optimization complete") diff --git a/src/orcapod/stores/transfer_data_store.py b/src/orcapod/stores/transfer_data_store.py index 0c8e215..9e393e0 100644 --- a/src/orcapod/stores/transfer_data_store.py +++ b/src/orcapod/stores/transfer_data_store.py @@ -14,7 +14,9 @@ def __init__(self, source_store: DataStore, target_store: DataStore) -> None: self.source_store = source_store self.target_store = target_store - def transfer(self, function_name: str, content_hash: str, packet: PacketLike) -> PacketLike: + def transfer( + self, function_name: str, content_hash: str, packet: PacketLike + ) -> PacketLike: """ Transfer a memoized packet from the source store to the target store. 
""" diff --git a/src/orcapod/stores/types.py b/src/orcapod/stores/types.py index c588856..da7e492 100644 --- a/src/orcapod/stores/types.py +++ b/src/orcapod/stores/types.py @@ -48,15 +48,15 @@ def add_record( ) -> pa.Table: ... def get_record( - self, source_path: tuple[str,...], entry_id: str + self, source_path: tuple[str, ...], entry_id: str ) -> pa.Table | None: ... - def get_all_records(self, source_path: tuple[str,...]) -> pa.Table | None: + def get_all_records(self, source_path: tuple[str, ...]) -> pa.Table | None: """Retrieve all records for a given source as a single table.""" ... def get_all_records_as_polars( - self, source_path: tuple[str,...] + self, source_path: tuple[str, ...] ) -> pl.LazyFrame | None: """Retrieve all records for a given source as a single Polars DataFrame.""" ... diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index dd02141..12448f8 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -5,7 +5,6 @@ from collections.abc import Collection, Mapping - DataType: TypeAlias = type TypeSpec: TypeAlias = Mapping[ @@ -34,8 +33,9 @@ # Extended data values that can be stored in packets # Either the original PathSet or one of our supported simple data types -DataValue: TypeAlias = PathSet | SupportedNativePythonData | None | Collection["DataValue"] - +DataValue: TypeAlias = ( + PathSet | SupportedNativePythonData | None | Collection["DataValue"] +) class PodFunction(Protocol): diff --git a/src/orcapod/types/packet_converter.py b/src/orcapod/types/packet_converter.py index 0a8389d..e486222 100644 --- a/src/orcapod/types/packet_converter.py +++ b/src/orcapod/types/packet_converter.py @@ -1,6 +1,11 @@ from orcapod.types.core import TypeSpec, TypeHandler from orcapod.types.packets import Packet, PacketLike -from orcapod.types.semantic_type_registry import SemanticTypeRegistry, TypeInfo, get_metadata_from_schema, arrow_to_dicts +from orcapod.types.semantic_type_registry import ( + SemanticTypeRegistry, + TypeInfo, + get_metadata_from_schema, + arrow_to_dicts, +) from typing import Any from collections.abc import Mapping, Sequence import pyarrow as pa @@ -10,7 +15,9 @@ def is_packet_supported( - python_type_info: TypeSpec, registry: SemanticTypeRegistry, type_lut: dict | None = None + python_type_info: TypeSpec, + registry: SemanticTypeRegistry, + type_lut: dict | None = None, ) -> bool: """Check if all types in the packet are supported by the registry or known to the default lut.""" if type_lut is None: @@ -21,7 +28,6 @@ def is_packet_supported( ) - class PacketConverter: def __init__(self, python_type_spec: TypeSpec, registry: SemanticTypeRegistry): self.python_type_spec = python_type_spec @@ -174,4 +180,3 @@ def from_arrow_table( return storage_packets return [Packet(self._from_storage_packet(packet)) for packet in storage_packets] - diff --git a/src/orcapod/types/packets.py b/src/orcapod/types/packets.py index a6621ee..47df081 100644 --- a/src/orcapod/types/packets.py +++ b/src/orcapod/types/packets.py @@ -12,16 +12,17 @@ class Packet(dict[str, DataValue]): def __init__( - self, + self, obj: PacketLike | None = None, - typespec: TypeSpec | None = None, - source_info: dict[str, str|None] | None = None + typespec: TypeSpec | None = None, + source_info: dict[str, str | None] | None = None, ): if obj is None: obj = {} super().__init__(obj) if typespec is None: from orcapod.types.typespec_utils import get_typespec_from_dict + typespec = get_typespec_from_dict(self) self._typespec = typespec if source_info is None: @@ -36,18 +37,22 @@ def 
typespec(self) -> TypeSpec: @property def source_info(self) -> dict[str, str | None]: return {key: self._source_info.get(key, None) for key in self.keys()} - + @source_info.setter def source_info(self, source_info: Mapping[str, str | None]): - self._source_info = {key: value for key, value in source_info.items() if value is not None} + self._source_info = { + key: value for key, value in source_info.items() if value is not None + } def get_composite(self) -> PacketLike: composite = self.copy() for k, v in self.source_info.items(): composite[f"_source_info_{k}"] = v return composite - - def map_keys(self, mapping: Mapping[str, str], drop_unmapped: bool=False) -> 'Packet': + + def map_keys( + self, mapping: Mapping[str, str], drop_unmapped: bool = False + ) -> "Packet": """ Map the keys of the packet using the provided mapping. @@ -58,23 +63,25 @@ def map_keys(self, mapping: Mapping[str, str], drop_unmapped: bool=False) -> 'Pa A new Packet with keys mapped according to the provided mapping. """ if drop_unmapped: - new_content = { - v: self[k] for k, v in mapping.items() if k in self - } + new_content = {v: self[k] for k, v in mapping.items() if k in self} new_typespec = { v: self.typespec[k] for k, v in mapping.items() if k in self.typespec } new_source_info = { - v: self.source_info[k] for k, v in mapping.items() if k in self.source_info + v: self.source_info[k] + for k, v in mapping.items() + if k in self.source_info } else: new_content = {mapping.get(k, k): v for k, v in self.items()} new_typespec = {mapping.get(k, k): v for k, v in self.typespec.items()} - new_source_info = {mapping.get(k, k): v for k, v in self.source_info.items()} + new_source_info = { + mapping.get(k, k): v for k, v in self.source_info.items() + } return Packet(new_content, typespec=new_typespec, source_info=new_source_info) - - def join(self, other: 'Packet') -> 'Packet': + + def join(self, other: "Packet") -> "Packet": """ Join another packet to this one, merging their keys and values. @@ -86,13 +93,15 @@ def join(self, other: 'Packet') -> 'Packet': """ # make sure there is no key collision if not set(self.keys()).isdisjoint(other.keys()): - raise ValueError(f"Key collision detected: packets {self} and {other} have overlapping keys" - " and cannot be joined without losing information.") + raise ValueError( + f"Key collision detected: packets {self} and {other} have overlapping keys" + " and cannot be joined without losing information." + ) new_content = {**self, **other} new_typespec = {**self.typespec, **other.typespec} new_source_info = {**self.source_info, **other.source_info} - + return Packet(new_content, typespec=new_typespec, source_info=new_source_info) @@ -103,23 +112,30 @@ def join(self, other: 'Packet') -> 'Packet': class SemanticPacket(dict[str, Any]): """ A packet that conforms to a semantic schema, mapping string keys to values. - + This is used to represent data packets in OrcaPod with semantic types. - + Attributes ---------- keys : str The keys of the packet. values : Any The values corresponding to each key. 
- + Examples -------- >>> packet = SemanticPacket(name='Alice', age=30) >>> print(packet) {'name': 'Alice', 'age': 30} """ - def __init__(self, *args, semantic_schema: schemas.SemanticSchema | None = None, source_info: dict[str, str|None] | None = None, **kwargs): + + def __init__( + self, + *args, + semantic_schema: schemas.SemanticSchema | None = None, + source_info: dict[str, str | None] | None = None, + **kwargs, + ): super().__init__(*args, **kwargs) self.schema = semantic_schema if source_info is None: @@ -134,7 +150,12 @@ def get_composite(self) -> dict[str, Any]: class PacketConverter: - def __init__(self, typespec: TypeSpec, registry: SemanticTypeRegistry, include_source_info: bool = True): + def __init__( + self, + typespec: TypeSpec, + registry: SemanticTypeRegistry, + include_source_info: bool = True, + ): self.typespec = typespec self.registry = registry @@ -148,8 +169,6 @@ def __init__(self, typespec: TypeSpec, registry: SemanticTypeRegistry, include_s self.semantic_schema, include_source_info=self.include_source_info ) - - self.key_handlers: dict[str, TypeHandler] = {} self.expected_key_set = set(self.typespec.keys()) @@ -178,7 +197,9 @@ def _check_key_consistency(self, keys): raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") - def from_python_packet_to_semantic_packet(self, python_packet: PacketLike) -> SemanticPacket: + def from_python_packet_to_semantic_packet( + self, python_packet: PacketLike + ) -> SemanticPacket: """Convert a Python packet to a semantic packet. Args: @@ -193,22 +214,22 @@ def from_python_packet_to_semantic_packet(self, python_packet: PacketLike) -> Se ValueError: If conversion fails """ # Validate packet keys - semantic_packet = SemanticPacket(python_packet, semantic_schema=self.semantic_schema, source_info=getattr(python_packet, "source_info", None)) + semantic_packet = SemanticPacket( + python_packet, + semantic_schema=self.semantic_schema, + source_info=getattr(python_packet, "source_info", None), + ) self._check_key_consistency(set(semantic_packet.keys())) # convert from storage to Python types for semantic types for key, handler in self.key_handlers.items(): try: - semantic_packet[key] = handler.python_to_storage( - semantic_packet[key] - ) + semantic_packet[key] = handler.python_to_storage(semantic_packet[key]) except Exception as e: raise ValueError(f"Failed to convert value for '{key}': {e}") from e return semantic_packet - - def from_python_packet_to_arrow_table(self, python_packet: PacketLike) -> pa.Table: """Convert a Python packet to an Arrow table. @@ -221,7 +242,9 @@ def from_python_packet_to_arrow_table(self, python_packet: PacketLike) -> pa.Tab semantic_packet = self.from_python_packet_to_semantic_packet(python_packet) return self.from_semantic_packet_to_arrow_table(semantic_packet) - def from_semantic_packet_to_arrow_table(self, semantic_packet: SemanticPacket) -> pa.Table: + def from_semantic_packet_to_arrow_table( + self, semantic_packet: SemanticPacket + ) -> pa.Table: """Convert a semantic packet to an Arrow table. 
Args: @@ -231,12 +254,15 @@ def from_semantic_packet_to_arrow_table(self, semantic_packet: SemanticPacket) - Arrow table representation of the packet """ if self.include_source_info: - return pa.Table.from_pylist([semantic_packet.get_composite()], schema=self.arrow_schema) + return pa.Table.from_pylist( + [semantic_packet.get_composite()], schema=self.arrow_schema + ) else: return pa.Table.from_pylist([semantic_packet], schema=self.arrow_schema) - - def from_arrow_table_to_semantic_packets(self, arrow_table: pa.Table) -> Collection[SemanticPacket]: + def from_arrow_table_to_semantic_packets( + self, arrow_table: pa.Table + ) -> Collection[SemanticPacket]: """Convert an Arrow table to a semantic packet. Args: @@ -249,18 +275,34 @@ def from_arrow_table_to_semantic_packets(self, arrow_table: pa.Table) -> Collect # schema matches what's expected if not arrow_table.schema.equals(self.arrow_schema): raise ValueError("Arrow table schema does not match expected schema") - + semantic_packets_contents = arrow_table.to_pylist() - + semantic_packets = [] for all_packet_content in semantic_packets_contents: - packet_content = {k: v for k, v in all_packet_content.items() if k in self.expected_key_set} - source_info = {k.removeprefix('_source_info_'): v for k, v in all_packet_content.items() if k.startswith('_source_info_')} - semantic_packets.append(SemanticPacket(packet_content, semantic_schema=self.semantic_schema, source_info=source_info)) + packet_content = { + k: v + for k, v in all_packet_content.items() + if k in self.expected_key_set + } + source_info = { + k.removeprefix("_source_info_"): v + for k, v in all_packet_content.items() + if k.startswith("_source_info_") + } + semantic_packets.append( + SemanticPacket( + packet_content, + semantic_schema=self.semantic_schema, + source_info=source_info, + ) + ) return semantic_packets - def from_semantic_packet_to_python_packet(self, semantic_packet: SemanticPacket) -> Packet: + def from_semantic_packet_to_python_packet( + self, semantic_packet: SemanticPacket + ) -> Packet: """Convert a semantic packet to a Python packet. 
Args: @@ -270,18 +312,20 @@ def from_semantic_packet_to_python_packet(self, semantic_packet: SemanticPacket) Python packet representation of the semantic packet """ # Validate packet keys - python_packet = Packet(semantic_packet, typespec=self.typespec, source_info=semantic_packet.source_info) + python_packet = Packet( + semantic_packet, + typespec=self.typespec, + source_info=semantic_packet.source_info, + ) packet_keys = set(python_packet.keys()) self._check_key_consistency(packet_keys) for key, handler in self.key_handlers.items(): try: - python_packet[key] = handler.storage_to_python( - python_packet[key] - ) + python_packet[key] = handler.storage_to_python(python_packet[key]) except Exception as e: raise ValueError(f"Failed to convert value for '{key}': {e}") from e - + return python_packet def from_arrow_table_to_python_packets(self, arrow_table: pa.Table) -> list[Packet]: @@ -294,5 +338,6 @@ def from_arrow_table_to_python_packets(self, arrow_table: pa.Table) -> list[Pack List of Python packets converted from the Arrow table """ semantic_packets = self.from_arrow_table_to_semantic_packets(arrow_table) - return [self.from_semantic_packet_to_python_packet(sp) for sp in semantic_packets] - + return [ + self.from_semantic_packet_to_python_packet(sp) for sp in semantic_packets + ] diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py index 19e8a3b..dc2112f 100644 --- a/src/orcapod/types/schemas.py +++ b/src/orcapod/types/schemas.py @@ -1,5 +1,4 @@ - -from orcapod.types import TypeSpec +from orcapod.types import TypeSpec from orcapod.types.semantic_type_registry import SemanticTypeRegistry from typing import Any import pyarrow as pa @@ -14,11 +13,13 @@ bool: pa.bool_(), } + def python_to_arrow_type(python_type: type) -> pa.DataType: if python_type in DEFAULT_ARROW_TYPE_LUT: return DEFAULT_ARROW_TYPE_LUT[python_type] raise TypeError(f"Converstion of python type {python_type} is not supported yet") + def arrow_to_python_type(arrow_type: pa.DataType) -> type: if pa.types.is_integer(arrow_type): return int @@ -38,68 +39,68 @@ def arrow_to_python_type(arrow_type: pa.DataType) -> type: raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") - class PythonSchema(dict[str, type]): """ A schema for Python data types, mapping string keys to Python types. - + This is used to define the expected structure of data packets in OrcaPod. - + Attributes ---------- keys : str The keys of the schema. values : type The types corresponding to each key. - + Examples -------- >>> schema = PythonSchema(name=str, age=int) >>> print(schema) {'name': , 'age': } """ + @property def with_source_info(self) -> dict[str, type]: """ Get the schema with source info fields included. - + Returns ------- dict[str, type|None] A new schema including source info fields. """ - return {**self, **{f'_source_info_{k}': str for k in self.keys()}} + return {**self, **{f"_source_info_{k}": str for k in self.keys()}} - -class SemanticSchema(dict[str, tuple[type, str|None]]): +class SemanticSchema(dict[str, tuple[type, str | None]]): """ A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. - + This is used to define the expected structure of data packets with semantic types in OrcaPod. - + Attributes ---------- keys : str The keys of the schema. values : tuple[type, str|None] The types and optional semantic type corresponding to each key. 
- + Examples -------- >>> schema = SemanticSchema(image=(str, 'path'), age=(int, None)) >>> print(schema) {'image': (, 'path'), 'age': (, None)} """ + def get_store_type(self, key: str) -> type | None: """ Get the storage type for a given key in the schema. - + Parameters ---------- key : str The key for which to retrieve the storage type. - + Returns ------- type | None @@ -110,24 +111,24 @@ def get_store_type(self, key: str) -> type | None: def get_semantic_type(self, key: str) -> str | None: """ Get the semantic type for a given key in the schema. - + Parameters ---------- key : str The key for which to retrieve the semantic type. - + Returns ------- str | None The semantic type associated with the key, or None if not found. """ return self.get(key, (None, None))[1] - + @property def storage_schema(self) -> PythonSchema: """ Get the storage schema, which is a PythonSchema representation of the semantic schema. - + Returns ------- PythonSchema @@ -135,17 +136,16 @@ def storage_schema(self) -> PythonSchema: """ return PythonSchema({k: v[0] for k, v in self.items()}) - @property def storage_schema_with_source_info(self) -> dict[str, type]: """ Get the storage schema with source info fields included. - + Returns ------- dict[str, type] A new schema including source info fields. - + Examples -------- >>> semantic_schema = SemanticSchema(name=(str, 'name'), age=(int, None)) @@ -162,19 +162,19 @@ def from_typespec_to_semantic_schema( ) -> SemanticSchema: """ Convert a Python schema to a semantic schema using the provided semantic type registry. - + Parameters ---------- typespec : TypeSpec The typespec to convert, mapping keys to Python types. semantic_type_registry : SemanticTypeRegistry The registry containing semantic type information. - + Returns ------- SemanticSchema A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - + Examples -------- >>> typespec: TypeSpec = dict(name=str, age=int) @@ -186,31 +186,34 @@ def from_typespec_to_semantic_schema( for key, python_type in typespec.items(): if python_type in semantic_type_registry: type_info = semantic_type_registry.get_type_info(python_type) - assert type_info is not None, f"Type {python_type} should be found in the registry as `in` returned True" + assert type_info is not None, ( + f"Type {python_type} should be found in the registry as `in` returned True" + ) semantic_schema[key] = (type_info.storage_type, type_info.semantic_type) else: semantic_schema[key] = (python_type, None) return SemanticSchema(semantic_schema) + def from_semantic_schema_to_python_schema( semantic_schema: SemanticSchema, semantic_type_registry: SemanticTypeRegistry, ) -> PythonSchema: """ Convert a semantic schema to a Python schema using the provided semantic type registry. - + Parameters ---------- semantic_schema : SemanticSchema The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. semantic_type_registry : SemanticTypeRegistry The registry containing semantic type information. - + Returns ------- PythonSchema A new schema mapping keys to Python types. - + Examples -------- >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) @@ -226,23 +229,24 @@ def from_semantic_schema_to_python_schema( python_schema_content[key] = python_type return PythonSchema(python_schema_content) + def from_semantic_schema_to_arrow_schema( semantic_schema: SemanticSchema, include_source_info: bool = True, ) -> pa.Schema: """ Convert a semantic schema to an Arrow schema. 
- + Parameters ---------- semantic_schema : SemanticSchema The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. - + Returns ------- dict[str, type] A new schema mapping keys to Arrow-compatible types. - + Examples -------- >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) @@ -253,32 +257,39 @@ def from_semantic_schema_to_arrow_schema( fields = [] for field_name, (python_type, semantic_type) in semantic_schema.items(): arrow_type = DEFAULT_ARROW_TYPE_LUT[python_type] - field_metadata = {b"semantic_type": semantic_type.encode('utf-8')} if semantic_type else {} + field_metadata = ( + {b"semantic_type": semantic_type.encode("utf-8")} if semantic_type else {} + ) fields.append(pa.field(field_name, arrow_type, metadata=field_metadata)) if include_source_info: for field in semantic_schema: - field_metadata = {b'field_type': b'source_info'} - fields.append(pa.field(f'_source_info_{field}', pa.large_string(), metadata=field_metadata)) - + field_metadata = {b"field_type": b"source_info"} + fields.append( + pa.field( + f"_source_info_{field}", pa.large_string(), metadata=field_metadata + ) + ) + return pa.schema(fields) + def from_arrow_schema_to_semantic_schema( arrow_schema: pa.Schema, ) -> SemanticSchema: """ Convert an Arrow schema to a semantic schema. - + Parameters ---------- arrow_schema : pa.Schema The schema to convert, containing fields with metadata. - + Returns ------- SemanticSchema A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - + Examples -------- >>> arrow_schema = pa.schema([pa.field('name', pa.string(), metadata={'semantic_type': 'name'}), @@ -289,19 +300,25 @@ def from_arrow_schema_to_semantic_schema( """ semantic_schema = {} for field in arrow_schema: - if field.metadata.get(b'field_type', b'') == b'source_info': + if field.metadata.get(b"field_type", b"") == b"source_info": # Skip source info fields continue - semantic_type = field.metadata.get(b'semantic_type', None) + semantic_type = field.metadata.get(b"semantic_type", None) semantic_type = semantic_type.decode() if semantic_type else None python_type = arrow_to_python_type(field.type) semantic_schema[field.name] = (python_type, semantic_type) return SemanticSchema(semantic_schema) -def from_typespec_to_arrow_schema(typespec: TypeSpec, - semantic_type_registry: SemanticTypeRegistry, include_source_info: bool = True) -> pa.Schema: + +def from_typespec_to_arrow_schema( + typespec: TypeSpec, + semantic_type_registry: SemanticTypeRegistry, + include_source_info: bool = True, +) -> pa.Schema: semantic_schema = from_typespec_to_semantic_schema(typespec, semantic_type_registry) - return from_semantic_schema_to_arrow_schema(semantic_schema, include_source_info=include_source_info) + return from_semantic_schema_to_arrow_schema( + semantic_schema, include_source_info=include_source_info + ) def from_arrow_schema_to_python_schema( @@ -310,17 +327,17 @@ def from_arrow_schema_to_python_schema( ) -> PythonSchema: """ Convert an Arrow schema to a Python schema. - + Parameters ---------- arrow_schema : pa.Schema The schema to convert, containing fields with metadata. - + Returns ------- PythonSchema A new schema mapping keys to Python types. 
- + Examples -------- >>> arrow_schema = pa.schema([pa.field('name', pa.string()), pa.field('age', pa.int64())]) @@ -329,4 +346,6 @@ def from_arrow_schema_to_python_schema( {'name': , 'age': } """ semantic_schema = from_arrow_schema_to_semantic_schema(arrow_schema) - return from_semantic_schema_to_python_schema(semantic_schema, semantic_type_registry) \ No newline at end of file + return from_semantic_schema_to_python_schema( + semantic_schema, semantic_type_registry + ) diff --git a/src/orcapod/types/semantic_type_registry.py b/src/orcapod/types/semantic_type_registry.py index d5a677f..2091904 100644 --- a/src/orcapod/types/semantic_type_registry.py +++ b/src/orcapod/types/semantic_type_registry.py @@ -33,7 +33,9 @@ def __init__(self): type, tuple[TypeHandler, str] ] = {} # PythonType -> (Handler, semantic_name) self._semantic_handlers: dict[str, TypeHandler] = {} # semantic_name -> Handler - self._semantic_to_python_lut: dict[str, type] = {} # semantic_name -> Python type + self._semantic_to_python_lut: dict[ + str, type + ] = {} # semantic_name -> Python type def register( self, @@ -49,7 +51,7 @@ def register( override: If True, allow overriding existing registration for the same semantic name and Python type(s) """ # Determine which types to register for - + python_type = handler.python_type() # Register handler for each type @@ -59,7 +61,7 @@ def register( raise ValueError( f"Type {python_type} already registered with semantic type '{existing_semantic}'" ) - + # Register by semantic name if semantic_type in self._semantic_handlers: raise ValueError(f"Semantic type '{semantic_type}' already registered") @@ -78,12 +80,12 @@ def lookup_handler_info(self, python_type: type) -> tuple[TypeHandler, str] | No if issubclass(python_type, registered_type): return (handler, semantic_type) return None - + def get_semantic_type(self, python_type: type) -> str | None: """Get semantic type for a Python type.""" handler_info = self.lookup_handler_info(python_type) return handler_info[1] if handler_info else None - + def get_handler(self, python_type: type) -> TypeHandler | None: """Get handler for a Python type.""" handler_info = self.lookup_handler_info(python_type) @@ -92,7 +94,6 @@ def get_handler(self, python_type: type) -> TypeHandler | None: def get_handler_by_semantic_type(self, semantic_type: str) -> TypeHandler | None: """Get handler by semantic type.""" return self._semantic_handlers.get(semantic_type) - def get_type_info(self, python_type: type) -> TypeInfo | None: """Get TypeInfo for a Python type.""" @@ -107,7 +108,6 @@ def get_type_info(self, python_type: type) -> TypeInfo | None: handler=handler, ) - def __contains__(self, python_type: type) -> bool: """Check if a Python type is registered.""" for registered_type in self._handlers: @@ -116,18 +116,14 @@ def __contains__(self, python_type: type) -> bool: return False - - - - # Below is a collection of functions that handles converting between various aspects of Python packets and Arrow tables. # Here for convenience, any Python dictionary with str keys and supported Python values are referred to as a packet. 
# Conversions are: -# python packet <-> storage packet <-> arrow table +# python packet <-> storage packet <-> arrow table # python typespec <-> storage typespec <-> arrow schema -# +# # python packet <-> storage packet requires the use of SemanticTypeRegistry # conversion between storage packet <-> arrow table requires info about semantic_type @@ -152,13 +148,13 @@ def __contains__(self, python_type: type) -> bool: # """Convert Arrow Schema to storage typespec and semantic type metadata.""" # typespec = {} # semantic_type_info = {} - + # for field in schema: # field_type = field.type # typespec[field.name] = field_type.to_pandas_dtype() # Convert Arrow type to Pandas dtype # if field.metadata and b"semantic_type" in field.metadata: # semantic_type_info[field.name] = field.metadata[b"semantic_type"].decode("utf-8") - + # return typespec, semantic_type_info @@ -168,14 +164,9 @@ def __contains__(self, python_type: type) -> bool: # semantic_type_info: dict[str, str] | None = None, - # # TypeSpec + TypeRegistry + ArrowLUT -> Arrow Schema (annotated with semantic_type) -# # - - - - +# # # # TypeSpec <-> Arrow Schema @@ -184,7 +175,7 @@ def __contains__(self, python_type: type) -> bool: # """Convert TypeSpec to PyArrow Schema.""" # if metadata_info is None: # metadata_info = {} - + # fields = [] # for field_name, field_type in typespec.items(): # type_info = registry.get_type_info(field_type) @@ -227,7 +218,6 @@ def __contains__(self, python_type: type) -> bool: # return keys_with_handlers, pa.schema(schema_fields) - # def arrow_table_to_packets( # table: pa.Table, # registry: SemanticTypeRegistry, @@ -347,14 +337,14 @@ def __contains__(self, python_type: type) -> bool: # bool: pa.bool_(), # bytes: pa.binary(), # } - + # if python_type in basic_mapping: # return basic_mapping[python_type] - + # # Handle generic types # origin = get_origin(python_type) # args = get_args(python_type) - + # if origin is list: # # Handle list[T] # if args: @@ -362,7 +352,7 @@ def __contains__(self, python_type: type) -> bool: # return pa.list_(element_type) # else: # return pa.list_(pa.large_string()) # default to list of strings - + # elif origin is dict: # # Handle dict[K, V] - PyArrow uses map type # if len(args) == 2: @@ -372,13 +362,13 @@ def __contains__(self, python_type: type) -> bool: # else: # # Otherwise default to using long string # return pa.map_(pa.large_string(), pa.large_string()) - + # elif origin is UnionType: # # Handle Optional[T] (Union[T, None]) # if len(args) == 2 and type(None) in args: # non_none_type = args[0] if args[1] is type(None) else args[1] # return python_to_pyarrow_type(non_none_type) - + # # Default fallback # if not strict: # logger.warning(f"Unsupported type {python_type}, defaulting to large_string") diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index 0786d10..4e48004 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -214,7 +214,6 @@ def extract_function_typespecs( return param_info, inferred_output_types - def get_typespec_from_dict(dict: Mapping) -> TypeSpec: """ Returns a TypeSpec for the given dictionary. 
@@ -248,7 +247,10 @@ def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | ) return merged -def intersection_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: + +def intersection_typespecs( + left: TypeSpec | None, right: TypeSpec | None +) -> TypeSpec | None: """ Returns the intersection of two TypeSpecs, only returning keys that are present in both. If a key is present in both TypeSpecs, the type must be the same. @@ -263,6 +265,8 @@ def intersection_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> Typ intersection[key] = get_compatible_type(left[key], right[key]) except TypeError: # If types are not compatible, raise an error - raise TypeError(f"Type conflict for key '{key}': {left[key]} vs {right[key]}") - - return intersection \ No newline at end of file + raise TypeError( + f"Type conflict for key '{key}': {left[key]} vs {right[key]}" + ) + + return intersection diff --git a/src/orcapod/utils/object_spec.py b/src/orcapod/utils/object_spec.py index f359a8c..dd09e1f 100644 --- a/src/orcapod/utils/object_spec.py +++ b/src/orcapod/utils/object_spec.py @@ -1,5 +1,6 @@ import importlib + def parse_objectspec(obj_spec: dict) -> Any: if "_class" in obj_spec: # if _class is specified, treat the dict as an object specification @@ -16,4 +17,4 @@ def parse_objectspec(obj_spec: dict) -> Any: parsed_object[k] = parse_objectspec(v) else: parsed_object[k] = v - return parsed_object \ No newline at end of file + return parsed_object diff --git a/src/orcapod/utils/stream_utils.py b/src/orcapod/utils/stream_utils.py index 5c5bb62..4246088 100644 --- a/src/orcapod/utils/stream_utils.py +++ b/src/orcapod/utils/stream_utils.py @@ -12,7 +12,6 @@ V = TypeVar("V") - def merge_dicts(left: dict[K, V], right: dict[K, V]) -> dict[K, V]: merged = left.copy() for key, right_value in right.items(): @@ -26,8 +25,6 @@ def merge_dicts(left: dict[K, V], right: dict[K, V]) -> dict[K, V]: return merged - - def common_elements(*values) -> Collection[str]: """ Returns the common keys between all lists of values. The identified common elements are @@ -57,7 +54,10 @@ def join_tags(tag1: Mapping[K, V], tag2: Mapping[K, V]) -> dict[K, V] | None: joined_tag[k] = v return joined_tag -def semijoin_tags(tag1: Mapping[K, V], tag2: Mapping[K, V], target_keys: Collection[K]|None = None) -> dict[K, V] | None: + +def semijoin_tags( + tag1: Mapping[K, V], tag2: Mapping[K, V], target_keys: Collection[K] | None = None +) -> dict[K, V] | None: """ Semijoin two tags. If the tags have the same key, the value must be the same or None will be returned. If all shared key's value match, tag1 would be returned @@ -72,6 +72,7 @@ def semijoin_tags(tag1: Mapping[K, V], tag2: Mapping[K, V], target_keys: Collect return None return dict(tag1) + def check_packet_compatibility(packet1: Packet, packet2: Packet) -> bool: """ Checks if two packets are compatible. If the packets have the same key, the value must be the same or False will be returned. 
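[Editor's illustration, not part of the patch series] The reformatted intersection_typespecs helper above keeps only keys present in both TypeSpecs and raises TypeError when a shared key's types conflict, per its docstring. A minimal usage sketch under that reading, assuming the module imports as orcapod.types.typespec_utils (the path shown in the diff header) and that identical types count as compatible:

# Sketch only: illustrates the documented behavior of intersection_typespecs.
from orcapod.types.typespec_utils import intersection_typespecs

left = {"image": str, "age": int}
right = {"age": int, "score": float}

# Only "age" appears in both specs and its types agree, so it is the
# expected sole key in the intersection.
print(intersection_typespecs(left, right))  # expected: {'age': <class 'int'>}

# A conflicting type for a shared key is documented to raise TypeError
# ("Type conflict for key 'age': ...").
try:
    intersection_typespecs({"age": int}, {"age": str})
except TypeError as e:
    print(e)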
diff --git a/tests/test_hashing/test_basic_composite_hasher.py b/tests/test_hashing/test_basic_composite_hasher.py index f2da406..a2d35a6 100644 --- a/tests/test_hashing/test_basic_composite_hasher.py +++ b/tests/test_hashing/test_basic_composite_hasher.py @@ -181,7 +181,9 @@ def test_default_file_hasher_file_hash_algorithm_parameters(): for algorithm in algorithms: try: - hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(algorithm=algorithm) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( + algorithm=algorithm + ) hash1 = hasher.hash_file(file_path) hash2 = hasher.hash_file(file_path) assert hash1 == hash2, f"Hash inconsistent for algorithm {algorithm}" @@ -193,7 +195,9 @@ def test_default_file_hasher_file_hash_algorithm_parameters(): buffer_sizes = [1024, 4096, 16384, 65536] for buffer_size in buffer_sizes: - hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(buffer_size=buffer_size) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( + buffer_size=buffer_size + ) hash1 = hasher.hash_file(file_path) hash2 = hasher.hash_file(file_path) assert hash1 == hash2, f"Hash inconsistent for buffer size {buffer_size}" @@ -222,7 +226,9 @@ def test_default_file_hasher_pathset_hash_algorithm_parameters(): for algorithm in algorithms: try: - hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(algorithm=algorithm) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( + algorithm=algorithm + ) hash1 = hasher.hash_pathset(pathset) hash2 = hasher.hash_pathset(pathset) assert hash1 == hash2, f"Hash inconsistent for algorithm {algorithm}" @@ -234,7 +240,9 @@ def test_default_file_hasher_pathset_hash_algorithm_parameters(): buffer_sizes = [1024, 4096, 16384, 65536] for buffer_size in buffer_sizes: - hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(buffer_size=buffer_size) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( + buffer_size=buffer_size + ) hash1 = hasher.hash_pathset(pathset) hash2 = hasher.hash_pathset(pathset) assert hash1 == hash2, f"Hash inconsistent for buffer size {buffer_size}" @@ -266,7 +274,9 @@ def test_default_file_hasher_packet_hash_algorithm_parameters(): for algorithm in algorithms: try: - hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(algorithm=algorithm) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( + algorithm=algorithm + ) hash1 = hasher.hash_packet(packet) hash2 = hasher.hash_packet(packet) @@ -285,7 +295,9 @@ def test_default_file_hasher_packet_hash_algorithm_parameters(): buffer_sizes = [1024, 4096, 16384, 65536] for buffer_size in buffer_sizes: - hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite(buffer_size=buffer_size) + hasher = LegacyPathLikeHasherFactory.create_basic_legacy_composite( + buffer_size=buffer_size + ) hash1 = hasher.hash_packet(packet) hash2 = hasher.hash_packet(packet) assert hash1 == hash2, f"Hash inconsistent for buffer size {buffer_size}" diff --git a/tests/test_hashing/test_hasher_factory.py b/tests/test_hashing/test_hasher_factory.py index 69804a3..68daa3a 100644 --- a/tests/test_hashing/test_hasher_factory.py +++ b/tests/test_hashing/test_hasher_factory.py @@ -30,7 +30,9 @@ def test_create_file_hasher_without_cacher(self): def test_create_file_hasher_with_cacher(self): """Test creating a file hasher with string cacher (returns CachedFileHasher).""" cacher = InMemoryCacher() - hasher = 
LegacyPathLikeHasherFactory.create_legacy_file_hasher(string_cacher=cacher) + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( + string_cacher=cacher + ) # Should return LegacyCachedFileHasher assert isinstance(hasher, LegacyCachedFileHasher) @@ -56,13 +58,15 @@ def test_create_file_hasher_custom_algorithm(self): ) assert isinstance(hasher, LegacyCachedFileHasher) assert isinstance(hasher.file_hasher, LegacyDefaultFileHasher) - assert hasher.file_hasher.algorithm == "sha512" - assert hasher.file_hasher.buffer_size == 65536 + assert hasher.file_hasher.algorithm == "sha512" + assert hasher.file_hasher.buffer_size == 65536 def test_create_file_hasher_custom_buffer_size(self): """Test creating file hasher with custom buffer size.""" # Without cacher - hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher(buffer_size=32768) + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( + buffer_size=32768 + ) assert isinstance(hasher, LegacyDefaultFileHasher) assert hasher.algorithm == "sha256" assert hasher.buffer_size == 32768 @@ -94,7 +98,9 @@ def test_create_file_hasher_different_cacher_types(self): """Test creating file hasher with different types of string cachers.""" # InMemoryCacher memory_cacher = InMemoryCacher() - hasher1 = LegacyPathLikeHasherFactory.create_legacy_file_hasher(string_cacher=memory_cacher) + hasher1 = LegacyPathLikeHasherFactory.create_legacy_file_hasher( + string_cacher=memory_cacher + ) assert isinstance(hasher1, LegacyCachedFileHasher) assert hasher1.string_cacher is memory_cacher @@ -184,13 +190,17 @@ def test_create_file_hasher_parameter_edge_cases(self): assert hasher1.buffer_size == 1 # Large buffer size - hasher2 = LegacyPathLikeHasherFactory.create_legacy_file_hasher(buffer_size=1024 * 1024) + hasher2 = LegacyPathLikeHasherFactory.create_legacy_file_hasher( + buffer_size=1024 * 1024 + ) assert isinstance(hasher2, LegacyDefaultFileHasher) assert hasher2.buffer_size == 1024 * 1024 # Different algorithms for algorithm in ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: - hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher(algorithm=algorithm) + hasher = LegacyPathLikeHasherFactory.create_legacy_file_hasher( + algorithm=algorithm + ) assert isinstance(hasher, LegacyDefaultFileHasher) assert hasher.algorithm == algorithm diff --git a/tests/test_hashing/test_legacy_composite_hasher.py b/tests/test_hashing/test_legacy_composite_hasher.py index f3a8de4..f234bb7 100644 --- a/tests/test_hashing/test_legacy_composite_hasher.py +++ b/tests/test_hashing/test_legacy_composite_hasher.py @@ -6,8 +6,15 @@ import pytest from orcapod.hashing.legacy_core import hash_to_hex -from orcapod.hashing.file_hashers import LegacyDefaultFileHasher, LegacyDefaultCompositeFileHasher -from orcapod.hashing.types import LegacyFileHasher, LegacyPacketHasher, LegacyPathSetHasher +from orcapod.hashing.file_hashers import ( + LegacyDefaultFileHasher, + LegacyDefaultCompositeFileHasher, +) +from orcapod.hashing.types import ( + LegacyFileHasher, + LegacyPacketHasher, + LegacyPathSetHasher, +) # Custom implementation of hash_file for tests that doesn't check for file existence @@ -90,7 +97,9 @@ def patch_hash_functions(): """Patch the hash functions in the core module for all tests.""" with ( patch("orcapod.hashing.legacy_core.hash_file", side_effect=mock_hash_file), - patch("orcapod.hashing.legacy_core.hash_pathset", side_effect=mock_hash_pathset), + patch( + "orcapod.hashing.legacy_core.hash_pathset", side_effect=mock_hash_pathset + ), 
patch("orcapod.hashing.legacy_core.hash_packet", side_effect=mock_hash_packet), ): yield diff --git a/tests/test_hashing/test_path_set_hasher.py b/tests/test_hashing/test_path_set_hasher.py index 0a48acb..c235eb0 100644 --- a/tests/test_hashing/test_path_set_hasher.py +++ b/tests/test_hashing/test_path_set_hasher.py @@ -86,7 +86,9 @@ def mock_hash_pathset( @pytest.fixture(autouse=True) def patch_hash_pathset(): """Patch the hash_pathset function in the hashing module for all tests.""" - with patch("orcapod.hashing.legacy_core.hash_pathset", side_effect=mock_hash_pathset): + with patch( + "orcapod.hashing.legacy_core.hash_pathset", side_effect=mock_hash_pathset + ): yield @@ -225,7 +227,8 @@ def custom_hash_nonexistent(pathset, **kwargs): # Patch hash_pathset just for this test with patch( - "orcapod.hashing.legacy_core.hash_pathset", side_effect=custom_hash_nonexistent + "orcapod.hashing.legacy_core.hash_pathset", + side_effect=custom_hash_nonexistent, ): result = pathset_hasher.hash_pathset(pathset) diff --git a/tests/test_hashing/test_string_cacher/test_redis_cacher.py b/tests/test_hashing/test_string_cacher/test_redis_cacher.py index 3ef49e1..eef7c43 100644 --- a/tests/test_hashing/test_string_cacher/test_redis_cacher.py +++ b/tests/test_hashing/test_string_cacher/test_redis_cacher.py @@ -68,21 +68,21 @@ def keys(self, pattern): return [key for key in self.data.keys() if key.startswith(prefix)] return [key for key in self.data.keys() if key == pattern] + class MockRedisModule: ConnectionError = MockConnectionError RedisError = MockRedisError Redis = MagicMock(return_value=MockRedis()) # Simple one-liner! - def mock_get_redis(): return MockRedisModule + def mock_no_redis(): return None - class TestRedisCacher: """Test cases for RedisCacher with mocked Redis.""" From cbe82aba171a35d21dc4c29aae7113c8c7c9f107 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 1 Jul 2025 16:31:50 +0000 Subject: [PATCH 035/224] fix: legacy_core imports --- tests/test_hashing/test_file_hashes.py | 2 +- tests/test_hashing/test_hash_samples.py | 2 +- tests/test_hashing/test_pathset_and_packet.py | 2 +- tests/test_hashing/test_pathset_packet_hashes.py | 2 +- tests/test_store/test_dir_data_store.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_hashing/test_file_hashes.py b/tests/test_hashing/test_file_hashes.py index 1de0716..afcaaad 100644 --- a/tests/test_hashing/test_file_hashes.py +++ b/tests/test_hashing/test_file_hashes.py @@ -12,7 +12,7 @@ import pytest # Add the parent directory to the path to import orcapod -from orcapod.hashing import hash_file +from orcapod.hashing.legacy_core import hash_file def load_hash_lut(): diff --git a/tests/test_hashing/test_hash_samples.py b/tests/test_hashing/test_hash_samples.py index cfb3e35..1e536cb 100644 --- a/tests/test_hashing/test_hash_samples.py +++ b/tests/test_hashing/test_hash_samples.py @@ -12,7 +12,7 @@ import pytest -from orcapod.hashing import hash_to_hex, hash_to_int, hash_to_uuid +from orcapod.hashing.legacy_core import hash_to_hex, hash_to_int, hash_to_uuid def get_latest_hash_samples(): diff --git a/tests/test_hashing/test_pathset_and_packet.py b/tests/test_hashing/test_pathset_and_packet.py index fc00b29..cde79da 100644 --- a/tests/test_hashing/test_pathset_and_packet.py +++ b/tests/test_hashing/test_pathset_and_packet.py @@ -13,7 +13,7 @@ import pytest -from orcapod.hashing import hash_file, hash_packet, hash_pathset +from orcapod.hashing.legacy_core import hash_file, hash_packet, hash_pathset logger = logging.getLogger(__name__) diff --git a/tests/test_hashing/test_pathset_packet_hashes.py b/tests/test_hashing/test_pathset_packet_hashes.py index 7745881..7df740d 100644 --- a/tests/test_hashing/test_pathset_packet_hashes.py +++ b/tests/test_hashing/test_pathset_packet_hashes.py @@ -12,7 +12,7 @@ import pytest # Add the parent directory to the path to import orcapod -from orcapod.hashing import hash_packet, hash_pathset +from orcapod.hashing.legacy_core import hash_packet, hash_pathset def load_pathset_hash_lut(): diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index eae39eb..09d84d7 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -499,7 +499,7 @@ def test_dir_data_store_legacy_mode_compatibility(temp_dir, sample_files): output_packet = {"output_file": sample_files["output"]["output1"]} # Get the hash values directly for comparison - from orcapod.hashing import hash_packet + from orcapod.hashing.legacy_core import hash_packet legacy_hash = hash_packet(packet, algorithm="sha256") assert store_default.packet_hasher is not None, ( @@ -610,7 +610,7 @@ def test_dir_data_store_hash_equivalence(temp_dir, sample_files): output_packet = {"output_file": sample_files["output"]["output1"]} # First compute hashes directly - from orcapod.hashing import hash_packet + from orcapod.hashing.legacy_core import hash_packet from orcapod.hashing.defaults import get_default_composite_file_hasher legacy_hash = hash_packet(packet, algorithm="sha256") From caca67b5ea0444cfcbfdaaf5db25e330dcb841dd Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 1 Jul 2025 19:39:51 +0000 Subject: [PATCH 036/224] wip: arrow logical serialization --- src/orcapod/hashing/arrow_serialization.py | 822 +++++++++++++++++++++ 1 file changed, 822 insertions(+) create mode 100644 src/orcapod/hashing/arrow_serialization.py diff --git a/src/orcapod/hashing/arrow_serialization.py b/src/orcapod/hashing/arrow_serialization.py new file mode 100644 index 0000000..e4926cb --- /dev/null +++ b/src/orcapod/hashing/arrow_serialization.py @@ -0,0 +1,822 @@ +import pyarrow as pa +from io import BytesIO +import pyarrow.ipc as ipc +import struct +from typing import Any +import hashlib + + +def serialize_table_ipc(table: pa.Table) -> bytes: + # TODO: fix and use logical table hashing instead + """Serialize table using Arrow IPC format for stable binary representation.""" + buffer = BytesIO() + + # Write format version + buffer.write(b"ARROW_IPC_V1") + + # Use IPC stream format for deterministic serialization + with ipc.new_stream(buffer, table.schema) as writer: + writer.write_table(table) + + return buffer.getvalue() + + +def serialize_table_logical(table: pa.Table) -> bytes: + """ + Serialize table using column-wise processing with direct binary data access. + + This implementation works directly with Arrow's underlying binary buffers + without converting to Python objects, making it much faster and more + memory efficient while maintaining high repeatability. + """ + buffer = BytesIO() + + # Write format version + buffer.write(b"ARROW_BINARY_V1") + + # Serialize schema deterministically + _serialize_schema_deterministic(buffer, table.schema) + + # Process each column using direct binary access + column_digests = [] + for i in range(table.num_columns): + column = table.column(i) + field = table.schema.field(i) + column_digest = _serialize_column_binary(column, field) + column_digests.append(column_digest) + + # Combine column digests + for digest in column_digests: + buffer.write(digest) + + return buffer.getvalue() + + +def _serialize_schema_deterministic(buffer: BytesIO, schema: pa.Schema) -> None: + """Serialize schema information deterministically.""" + buffer.write(struct.pack(" None: + """Serialize Arrow data type deterministically.""" + type_id = data_type.id + buffer.write(struct.pack(" bytes: + """ + Serialize column using direct binary buffer access. + + To ensure chunking independence, we combine chunks into a single array + before processing. This ensures identical output regardless of chunk boundaries. 
+ """ + buffer = BytesIO() + + # Combine all chunks into a single array for consistent processing + if column.num_chunks > 1: + # Multiple chunks - combine them + combined_array = pa.concat_arrays(column.chunks) + elif column.num_chunks == 1: + # Single chunk - use directly + combined_array = column.chunk(0) + else: + # No chunks - create empty array + combined_array = pa.array([], type=field.type) + + # Process the combined array + chunk_result = _serialize_array_binary(combined_array, field.type) + buffer.write(chunk_result) + + return buffer.getvalue() + + +def _serialize_array_binary(array: pa.Array, data_type: pa.DataType) -> bytes: + """Serialize array using direct access to Arrow's binary buffers.""" + buffer = BytesIO() + + # Get validity buffer (null bitmap) if it exists + validity_buffer = None + if array.buffers()[0] is not None: + validity_buffer = array.buffers()[0] + + # Process based on Arrow type, accessing buffers directly + if _is_primitive_type(data_type): + _serialize_primitive_array_binary(buffer, array, data_type, validity_buffer) + + elif pa.types.is_string(data_type) or pa.types.is_large_string(data_type): + _serialize_string_array_binary(buffer, array, data_type, validity_buffer) + + elif pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type): + _serialize_binary_array_binary(buffer, array, data_type, validity_buffer) + + elif pa.types.is_list(data_type) or pa.types.is_large_list(data_type): + _serialize_list_array_binary(buffer, array, data_type, validity_buffer) + + elif pa.types.is_struct(data_type): + _serialize_struct_array_binary(buffer, array, data_type, validity_buffer) + + elif pa.types.is_dictionary(data_type): + _serialize_dictionary_array_binary(buffer, array, data_type, validity_buffer) + + else: + # Fallback to element-wise processing for complex types + _serialize_array_fallback(buffer, array, data_type, validity_buffer) + + return buffer.getvalue() + + +def _is_primitive_type(data_type: pa.DataType) -> bool: + """Check if type can be processed as primitive (fixed-size) data.""" + return ( + pa.types.is_integer(data_type) + or pa.types.is_floating(data_type) + or pa.types.is_boolean(data_type) + or pa.types.is_date(data_type) + or pa.types.is_time(data_type) + or pa.types.is_timestamp(data_type) + ) + + +def _serialize_primitive_array_binary( + buffer: BytesIO, array: pa.Array, data_type: pa.DataType, validity_buffer +): + """Serialize primitive arrays by directly copying binary data.""" + # Write validity bitmap + _serialize_validity_buffer(buffer, validity_buffer) + + # Get data buffer (buffer[1] for primitive types) + data_buffer = array.buffers()[1] + if data_buffer is not None: + # For primitive types, copy the buffer directly + if pa.types.is_boolean(data_type): + # Boolean needs the length for bit interpretation + buffer.write(struct.pack(" 0: + child_array = array.children[0] + + # Recursively serialize child array + if child_array is not None: + child_data = _serialize_array_binary(child_array, data_type.value_type) + buffer.write(child_data) + + +def _serialize_struct_array_binary( + buffer: BytesIO, array: pa.Array, data_type: pa.DataType, validity_buffer +): + """Serialize struct arrays by processing child arrays.""" + # Write validity bitmap + _serialize_validity_buffer(buffer, validity_buffer) + + # Serialize each child field + for i, child_array in enumerate(array.children): + field_type = data_type[i].type + child_data = _serialize_array_binary(child_array, field_type) + buffer.write(child_data) + + +def 
_serialize_dictionary_array_binary( + buffer: BytesIO, array: pa.Array, data_type: pa.DataType, validity_buffer +): + """Serialize dictionary arrays using indices + dictionary.""" + # Write validity bitmap + _serialize_validity_buffer(buffer, validity_buffer) + + # Serialize indices array + indices_data = _serialize_array_binary(array.indices, data_type.index_type) + buffer.write(indices_data) + + # Serialize dictionary array + dict_data = _serialize_array_binary(array.dictionary, data_type.value_type) + buffer.write(dict_data) + + +def _serialize_validity_buffer(buffer: BytesIO, validity_buffer): + """Serialize validity (null) bitmap.""" + if validity_buffer is not None: + # Copy validity bitmap directly + buffer.write(validity_buffer.to_pybytes()) + # If no validity buffer, there are no nulls (implicit) + + +def _serialize_boolean_buffer(buffer: BytesIO, data_buffer, array_length: int): + """Serialize boolean buffer (bit-packed).""" + # Boolean data is bit-packed, copy directly + bool_bytes = data_buffer.to_pybytes() + buffer.write(struct.pack(" int: + """Get byte width of primitive types.""" + if pa.types.is_boolean(data_type): + return 1 # Bit-packed, but minimum 1 byte + elif pa.types.is_integer(data_type) or pa.types.is_floating(data_type): + return data_type.bit_width // 8 + elif pa.types.is_date(data_type): + return 4 if data_type == pa.date32() else 8 + elif pa.types.is_time(data_type) or pa.types.is_timestamp(data_type): + return data_type.bit_width // 8 + else: + return 8 # Default + + +def _serialize_array_fallback( + buffer: BytesIO, array: pa.Array, data_type: pa.DataType, validity_buffer +): + """Fallback to element-wise processing for complex types.""" + # Write validity bitmap + _serialize_validity_buffer(buffer, validity_buffer) + + # Process element by element (only for types that need it) + for i in range(len(array)): + if array.is_null(i): + buffer.write(b"\x00") + else: + buffer.write(b"\x01") + # For complex nested types, we might still need .as_py() + # But this should be rare with proper binary handling above + value = array[i].as_py() + _serialize_complex_value(buffer, value, data_type) + + +def _serialize_complex_value(buffer: BytesIO, value: Any, data_type: pa.DataType): + """Serialize complex values that can't be handled by direct buffer access.""" + # This handles edge cases like nested structs with mixed types + if pa.types.is_decimal(data_type): + decimal_str = str(value).encode("utf-8") + buffer.write(struct.pack(" str: + """Create deterministic hash using binary serialization.""" + serialized = serialize_table_logical(table) + + if algorithm == "sha256": + hasher = hashlib.sha256() + elif algorithm == "sha3_256": + hasher = hashlib.sha3_256() + elif algorithm == "blake2b": + hasher = hashlib.blake2b() + else: + raise ValueError(f"Unsupported hash algorithm: {algorithm}") + + hasher.update(serialized) + return hasher.hexdigest() + + +def serialize_table_logical_streaming(table: pa.Table) -> str: + """ + Memory-efficient streaming version that produces the same hash as serialize_table_logical_hash. + + This version processes data in streaming fashion but maintains the same logical structure + as the non-streaming version to ensure identical hashes and chunking independence. 
+ """ + hasher = hashlib.sha256() + + # Hash format version (same as non-streaming) + hasher.update(b"ARROW_BINARY_V1") + + # Hash schema (same as non-streaming) + schema_buffer = BytesIO() + _serialize_schema_deterministic(schema_buffer, table.schema) + hasher.update(schema_buffer.getvalue()) + + # Process each column using the same logic as non-streaming + for i in range(table.num_columns): + column = table.column(i) + field = table.schema.field(i) + + # Use the same column serialization logic for chunking independence + column_data = _serialize_column_binary(column, field) + + # Hash the column data + hasher.update(column_data) + + return hasher.hexdigest() + + +# Test utilities +def create_test_table_1(): + """Create a basic test table with various data types.""" + return pa.table( + { + "int32_col": pa.array([1, 2, None, 4, 5], type=pa.int32()), + "float64_col": pa.array([1.1, 2.2, 3.3, None, 5.5], type=pa.float64()), + "string_col": pa.array(["hello", "world", None, "arrow", "fast"]), + "bool_col": pa.array([True, False, None, True, False]), + "binary_col": pa.array([b"data1", b"data2", None, b"data4", b"data5"]), + } + ) + + +def create_test_table_reordered_columns(): + """Same data as test_table_1 but with different column order.""" + return pa.table( + { + "string_col": pa.array(["hello", "world", None, "arrow", "fast"]), + "bool_col": pa.array([True, False, None, True, False]), + "int32_col": pa.array([1, 2, None, 4, 5], type=pa.int32()), + "binary_col": pa.array([b"data1", b"data2", None, b"data4", b"data5"]), + "float64_col": pa.array([1.1, 2.2, 3.3, None, 5.5], type=pa.float64()), + } + ) + + +def create_test_table_reordered_rows(): + """Same data as test_table_1 but with different row order.""" + return pa.table( + { + "int32_col": pa.array([5, 4, None, 2, 1], type=pa.int32()), + "float64_col": pa.array([5.5, None, 3.3, 2.2, 1.1], type=pa.float64()), + "string_col": pa.array(["fast", "arrow", None, "world", "hello"]), + "bool_col": pa.array([False, True, None, False, True]), + "binary_col": pa.array([b"data5", b"data4", None, b"data2", b"data1"]), + } + ) + + +def create_test_table_different_types(): + """Same logical data but with different Arrow types where possible.""" + return pa.table( + { + "int32_col": pa.array( + [1, 2, None, 4, 5], type=pa.int64() + ), # int64 instead of int32 + "float64_col": pa.array( + [1.1, 2.2, 3.3, None, 5.5], type=pa.float32() + ), # float32 instead of float64 + "string_col": pa.array(["hello", "world", None, "arrow", "fast"]), + "bool_col": pa.array([True, False, None, True, False]), + "binary_col": pa.array([b"data1", b"data2", None, b"data4", b"data5"]), + } + ) + + +def create_test_table_different_chunking(): + """Same data as test_table_1 but with different chunking.""" + # Create arrays with explicit chunking + int_chunks = [ + pa.array([1, 2], type=pa.int32()), + pa.array([None, 4, 5], type=pa.int32()), + ] + float_chunks = [ + pa.array([1.1], type=pa.float64()), + pa.array([2.2, 3.3, None, 5.5], type=pa.float64()), + ] + string_chunks = [pa.array(["hello", "world"]), pa.array([None, "arrow", "fast"])] + bool_chunks = [pa.array([True, False, None]), pa.array([True, False])] + binary_chunks = [ + pa.array([b"data1"]), + pa.array([b"data2", None, b"data4", b"data5"]), + ] + + return pa.table( + { + "int32_col": pa.chunked_array(int_chunks), + "float64_col": pa.chunked_array(float_chunks), + "string_col": pa.chunked_array(string_chunks), + "bool_col": pa.chunked_array(bool_chunks), + "binary_col": pa.chunked_array(binary_chunks), + } + ) + 
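+
+# NOTE: illustrative sketch added for clarity; the helper below is not invoked by
+# run_comprehensive_tests(). It shows how the factory functions above can be used
+# to spot-check the chunking-independence property that serialize_table_logical
+# is designed to provide: the same logical data, split across different chunk
+# boundaries, is expected to serialize to identical bytes and identical hashes.
+# The helper name is hypothetical and only uses functions defined in this module.
+def _sketch_chunking_independence_check() -> None:
+    """Compare logical serialization of the same data under different chunking."""
+    combined = create_test_table_1().combine_chunks()
+    rechunked = create_test_table_different_chunking()
+    # Same logical content, different chunk boundaries: serialized bytes should match.
+    assert serialize_table_logical(combined) == serialize_table_logical(rechunked)
+    # Consequently the content hashes should match as well (sha256 by default).
+    assert serialize_table_logical_hash(combined) == serialize_table_logical_hash(
+        rechunked
+    )
+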
+ +def create_test_table_empty(): + """Create an empty table with same schema.""" + return pa.table( + { + "int32_col": pa.array([], type=pa.int32()), + "float64_col": pa.array([], type=pa.float64()), + "string_col": pa.array([], type=pa.string()), + "bool_col": pa.array([], type=pa.bool_()), + "binary_col": pa.array([], type=pa.binary()), + } + ) + + +def create_test_table_all_nulls(): + """Create a table with all null values.""" + return pa.table( + { + "int32_col": pa.array([None, None, None], type=pa.int32()), + "float64_col": pa.array([None, None, None], type=pa.float64()), + "string_col": pa.array([None, None, None], type=pa.string()), + "bool_col": pa.array([None, None, None], type=pa.bool_()), + "binary_col": pa.array([None, None, None], type=pa.binary()), + } + ) + + +def create_test_table_no_nulls(): + """Create a table with no null values.""" + return pa.table( + { + "int32_col": pa.array([1, 2, 3, 4, 5], type=pa.int32()), + "float64_col": pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()), + "string_col": pa.array(["hello", "world", "arrow", "fast", "data"]), + "bool_col": pa.array([True, False, True, False, True]), + "binary_col": pa.array([b"data1", b"data2", b"data3", b"data4", b"data5"]), + } + ) + + +def create_test_table_complex_types(): + """Create a table with complex nested types.""" + return pa.table( + { + "list_col": pa.array( + [[1, 2], [3, 4, 5], None, [], [6]], type=pa.list_(pa.int32()) + ), + "struct_col": pa.array( + [ + {"a": 1, "b": "x"}, + {"a": 2, "b": "y"}, + None, + {"a": 3, "b": "z"}, + {"a": 4, "b": "w"}, + ], + type=pa.struct([("a", pa.int32()), ("b", pa.string())]), + ), + "dict_col": pa.array( + ["apple", "banana", "apple", None, "cherry"] + ).dictionary_encode(), + } + ) + + +def create_test_table_single_column(): + """Create a table with just one column.""" + return pa.table({"single_col": pa.array([1, 2, 3, 4, 5], type=pa.int32())}) + + +def create_test_table_single_row(): + """Create a table with just one row.""" + return pa.table( + { + "int32_col": pa.array([42], type=pa.int32()), + "string_col": pa.array(["single"]), + "bool_col": pa.array([True]), + } + ) + + +def run_comprehensive_tests(): + """Run comprehensive test suite for serialization.""" + import time + + print("=" * 60) + print("COMPREHENSIVE ARROW SERIALIZATION TEST SUITE") + print("=" * 60) + + # Test cases + test_cases = [ + ("Basic table", create_test_table_1), + ("Reordered columns", create_test_table_reordered_columns), + ("Reordered rows", create_test_table_reordered_rows), + ("Different types", create_test_table_different_types), + ("Different chunking", create_test_table_different_chunking), + ("Empty table", create_test_table_empty), + ("All nulls", create_test_table_all_nulls), + ("No nulls", create_test_table_no_nulls), + ("Complex types", create_test_table_complex_types), + ("Single column", create_test_table_single_column), + ("Single row", create_test_table_single_row), + ] + + # Generate hashes for all test cases + results = {} + + print("\n1. 
GENERATING HASHES FOR ALL TEST CASES") + print("-" * 50) + + for name, create_func in test_cases: + try: + table = create_func() + + # Generate all hash types + logical_hash = serialize_table_logical_hash(table) + streaming_hash = serialize_table_logical_streaming(table) + ipc_hash = hashlib.sha256(serialize_table_ipc(table)).hexdigest() + + results[name] = { + "table": table, + "logical": logical_hash, + "streaming": streaming_hash, + "ipc": ipc_hash, + "rows": table.num_rows, + "cols": table.num_columns, + } + + print( + f"{name:20} | Rows: {table.num_rows:5} | Cols: {table.num_columns:2} | " + f"Logical: {logical_hash[:12]}... | IPC: {ipc_hash[:12]}..." + ) + + except Exception as e: + print(f"{name:20} | ERROR: {str(e)}") + results[name] = {"error": str(e)} + + print("\n2. DETERMINISM TESTS") + print("-" * 50) + + base_table = create_test_table_1() + + # Test multiple runs of same table + logical_hashes = [serialize_table_logical_hash(base_table) for _ in range(5)] + streaming_hashes = [serialize_table_logical_streaming(base_table) for _ in range(5)] + ipc_hashes = [ + hashlib.sha256(serialize_table_ipc(base_table)).hexdigest() for _ in range(5) + ] + + print( + f"Logical deterministic: {len(set(logical_hashes)) == 1} ({len(set(logical_hashes))}/5 unique)" + ) + print( + f"Streaming deterministic: {len(set(streaming_hashes)) == 1} ({len(set(streaming_hashes))}/5 unique)" + ) + print( + f"IPC deterministic: {len(set(ipc_hashes)) == 1} ({len(set(ipc_hashes))}/5 unique)" + ) + print(f"Streaming == Logical: {streaming_hashes[0] == logical_hashes[0]}") + + print("\n3. EQUIVALENCE TESTS") + print("-" * 50) + + base_logical = results["Basic table"]["logical"] + base_ipc = results["Basic table"]["ipc"] + + equivalence_tests = [ + ( + "Same table vs reordered columns", + "Reordered columns", + False, + "Different column order should produce different hash", + ), + ( + "Same table vs reordered rows", + "Reordered rows", + False, + "Different row order should produce different hash", + ), + ( + "Same table vs different types", + "Different types", + False, + "Different data types should produce different hash", + ), + ( + "Same table vs different chunking", + "Different chunking", + True, + "Same data with different chunking should produce same hash", + ), + ( + "Same table vs no nulls", + "No nulls", + False, + "Different null patterns should produce different hash", + ), + ( + "Same table vs all nulls", + "All nulls", + False, + "Different data should produce different hash", + ), + ] + + for test_name, compare_case, should_match, explanation in equivalence_tests: + if compare_case in results and "logical" in results[compare_case]: + compare_logical = results[compare_case]["logical"] + compare_ipc = results[compare_case]["ipc"] + + logical_match = base_logical == compare_logical + ipc_match = base_ipc == compare_ipc + + logical_status = "✓" if logical_match == should_match else "✗" + ipc_status = "✓" if ipc_match == should_match else "✗" + + print(f"{logical_status} {test_name}") + print(f" Logical: {logical_match} (expected: {should_match})") + print(f" IPC: {ipc_match} (expected: {should_match})") + print(f" Reason: {explanation}") + print() + + print("4. 
CHUNKING INDEPENDENCE DETAILED TEST") + print("-" * 50) + + # Test various chunking strategies + original_table = create_test_table_1() + combined_table = original_table.combine_chunks() + different_chunking = create_test_table_different_chunking() + + orig_logical = serialize_table_logical_hash(original_table) + comb_logical = serialize_table_logical_hash(combined_table) + diff_logical = serialize_table_logical_hash(different_chunking) + + orig_ipc = hashlib.sha256(serialize_table_ipc(original_table)).hexdigest() + comb_ipc = hashlib.sha256(serialize_table_ipc(combined_table)).hexdigest() + diff_ipc = hashlib.sha256(serialize_table_ipc(different_chunking)).hexdigest() + + print(f"Original chunking: {orig_logical[:16]}...") + print(f"Combined chunks: {comb_logical[:16]}...") + print(f"Different chunking: {diff_logical[:16]}...") + print( + f"Logical chunking-independent: {orig_logical == comb_logical == diff_logical}" + ) + print() + print(f"Original IPC: {orig_ipc[:16]}...") + print(f"Combined IPC: {comb_ipc[:16]}...") + print(f"Different IPC: {diff_ipc[:16]}...") + print(f"IPC chunking-independent: {orig_ipc == comb_ipc == diff_ipc}") + + print("\n5. PERFORMANCE COMPARISON") + print("-" * 50) + + # Create larger table for performance testing + large_size = 10000 + large_table = pa.table( + { + "int_col": pa.array(list(range(large_size)), type=pa.int32()), + "float_col": pa.array( + [i * 1.5 for i in range(large_size)], type=pa.float64() + ), + "string_col": pa.array([f"item_{i}" for i in range(large_size)]), + "bool_col": pa.array([i % 2 == 0 for i in range(large_size)]), + } + ) + + # Time each method + methods = [ + ("Logical", lambda t: serialize_table_logical_hash(t)), + ("Streaming", lambda t: serialize_table_logical_streaming(t)), + ("IPC", lambda t: hashlib.sha256(serialize_table_ipc(t)).hexdigest()), + ] + hash_result = "" + for method_name, method_func in methods: + times = [] + for _ in range(3): # Run 3 times for average + start = time.time() + hash_result = method_func(large_table) + end = time.time() + times.append(end - start) + + avg_time = sum(times) / len(times) + throughput = (large_size * 4) / avg_time # 4 columns + + print( + f"{method_name:10} | {avg_time * 1000:6.1f}ms | {throughput:8.0f} values/sec | {hash_result[:12]}..." + ) + + print("\n6. EDGE CASES") + print("-" * 50) + + edge_cases = ["Empty table", "All nulls", "Single column", "Single row"] + for case in edge_cases: + if case in results and "error" not in results[case]: + r = results[case] + print( + f"{case:15} | {r['rows']:3}r x {r['cols']:2}c | " + f"L:{r['logical'][:8]}... | I:{r['ipc'][:8]}... | " + f"Match: {r['logical'] == r['streaming']}" + ) + + print("\n7. COMPLEX TYPES TEST") + print("-" * 50) + + if "Complex types" in results and "error" not in results["Complex types"]: + complex_result = results["Complex types"] + print(f"Complex types serialization successful:") + print(f" Logical hash: {complex_result['logical']}") + print( + f" Streaming ==: {complex_result['logical'] == complex_result['streaming']}" + ) + print(f" Rows/Cols: {complex_result['rows']}r x {complex_result['cols']}c") + else: + print( + "Complex types test failed - this is expected for some complex nested types" + ) + + print(f"\n{'=' * 60}") + print("TEST SUITE COMPLETE") + print(f"{'=' * 60}") + + return results + + +# Main execution +if __name__ == "__main__": + # Run the comprehensive test suite + test_results = run_comprehensive_tests() From 7bc98e1255b043f9da8a151bd7e3053e27a526b5 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 1 Jul 2025 19:40:31 +0000 Subject: [PATCH 037/224] refactor: utils renaming and relocation --- .../stores/{file_ops.py => file_utils.py} | 6 +- src/orcapod/stores/safe_dir_data_store.py | 4 +- src/orcapod/types/utils.py | 62 ------------------ src/orcapod/utils/name.py | 64 +++++++++++++++++++ 4 files changed, 69 insertions(+), 67 deletions(-) rename src/orcapod/stores/{file_ops.py => file_utils.py} (99%) delete mode 100644 src/orcapod/types/utils.py diff --git a/src/orcapod/stores/file_ops.py b/src/orcapod/stores/file_utils.py similarity index 99% rename from src/orcapod/stores/file_ops.py rename to src/orcapod/stores/file_utils.py index 4fa6202..34380e0 100644 --- a/src/orcapod/stores/file_ops.py +++ b/src/orcapod/stores/file_utils.py @@ -7,7 +7,7 @@ import os from pathlib import Path -from orcapod.types import PathLike, PathSet, Packet +from orcapod.types import PathLike, PathSet, PacketLike from collections.abc import Collection, Callable @@ -369,8 +369,8 @@ def patched_open(file, *args, **kwargs): def virtual_mount( - packet: Packet, -) -> tuple[Packet, dict[str, str], dict[str, str]]: + packet: PacketLike, +) -> tuple[PacketLike, dict[str, str], dict[str, str]]: """ Visit all pathset within the packet, and convert them to alternative path representation. By default, full path is mapped to the file name. If two or diff --git a/src/orcapod/stores/safe_dir_data_store.py b/src/orcapod/stores/safe_dir_data_store.py index 7e16f63..e02e9cc 100644 --- a/src/orcapod/stores/safe_dir_data_store.py +++ b/src/orcapod/stores/safe_dir_data_store.py @@ -10,7 +10,7 @@ from pathlib import Path from typing import Optional, Union -from .file_ops import atomic_copy, atomic_write +from .file_utils import atomic_copy, atomic_write logger = logging.getLogger(__name__) @@ -23,7 +23,7 @@ class FileLockError(Exception): @contextmanager def file_lock( - lock_path: Union[str, Path], + lock_path: str | Path, shared: bool = False, timeout: float = 30.0, delay: float = 0.1, diff --git a/src/orcapod/types/utils.py b/src/orcapod/types/utils.py deleted file mode 100644 index 5393492..0000000 --- a/src/orcapod/types/utils.py +++ /dev/null @@ -1,62 +0,0 @@ -# TODO: move these functions to util -def escape_with_postfix(field: str, postfix=None, separator="_") -> str: - """ - Escape the field string by doubling separators and optionally append a postfix. - This function takes a field string and escapes any occurrences of the separator - by doubling them, then optionally appends a postfix with a separator prefix. - - Args: - field (str): The input string containing to be escaped. - postfix (str, optional): An optional postfix to append to the escaped string. - If None, no postfix is added. Defaults to None. - separator (str, optional): The separator character to escape and use for - prefixing the postfix. Defaults to "_". - Returns: - str: The escaped string with optional postfix. Returns empty string if - fields is provided but postfix is None. 
- Examples: - >>> escape_with_postfix("field1_field2", "suffix") - 'field1__field2_suffix' - >>> escape_with_postfix("name_age_city", "backup", "_") - 'name__age__city_backup' - >>> escape_with_postfix("data-info", "temp", "-") - 'data--info-temp' - >>> escape_with_postfix("simple", None) - 'simple' - >>> escape_with_postfix("no_separators", "end") - 'no__separators_end' - """ - - return field.replace(separator, separator * 2) + (f"_{postfix}" if postfix else "") - - -def unescape_with_postfix(field: str, separator="_") -> tuple[str, str | None]: - """ - Unescape a string by converting double separators back to single separators and extract postfix metadata. - This function reverses the escaping process where single separators were doubled to avoid - conflicts with metadata delimiters. It splits the input on double separators, then extracts - any postfix metadata from the last part. - - Args: - field (str): The escaped string containing doubled separators and optional postfix metadata - separator (str, optional): The separator character used for escaping. Defaults to "_" - Returns: - tuple[str, str | None]: A tuple containing: - - The unescaped string with single separators restored - - The postfix metadata if present, None otherwise - Examples: - >>> unescape_with_postfix("field1__field2__field3") - ('field1_field2_field3', None) - >>> unescape_with_postfix("field1__field2_metadata") - ('field1_field2', 'metadata') - >>> unescape_with_postfix("simple") - ('simple', None) - >>> unescape_with_postfix("field1--field2", separator="-") - ('field1-field2', None) - >>> unescape_with_postfix("field1--field2-meta", separator="-") - ('field1-field2', 'meta') - """ - - parts = field.split(separator * 2) - parts[-1], *meta = parts[-1].split("_", 1) - return separator.join(parts), meta[0] if meta else None diff --git a/src/orcapod/utils/name.py b/src/orcapod/utils/name.py index ba2c4f0..2211ef6 100644 --- a/src/orcapod/utils/name.py +++ b/src/orcapod/utils/name.py @@ -5,6 +5,70 @@ import re +# TODO: move these functions to util +def escape_with_postfix(field: str, postfix=None, separator="_") -> str: + """ + Escape the field string by doubling separators and optionally append a postfix. + This function takes a field string and escapes any occurrences of the separator + by doubling them, then optionally appends a postfix with a separator prefix. + + Args: + field (str): The input string containing to be escaped. + postfix (str, optional): An optional postfix to append to the escaped string. + If None, no postfix is added. Defaults to None. + separator (str, optional): The separator character to escape and use for + prefixing the postfix. Defaults to "_". + Returns: + str: The escaped string with optional postfix. Returns empty string if + fields is provided but postfix is None. + Examples: + >>> escape_with_postfix("field1_field2", "suffix") + 'field1__field2_suffix' + >>> escape_with_postfix("name_age_city", "backup", "_") + 'name__age__city_backup' + >>> escape_with_postfix("data-info", "temp", "-") + 'data--info-temp' + >>> escape_with_postfix("simple", None) + 'simple' + >>> escape_with_postfix("no_separators", "end") + 'no__separators_end' + """ + + return field.replace(separator, separator * 2) + (f"_{postfix}" if postfix else "") + + +def unescape_with_postfix(field: str, separator="_") -> tuple[str, str | None]: + """ + Unescape a string by converting double separators back to single separators and extract postfix metadata. 
+ This function reverses the escaping process where single separators were doubled to avoid + conflicts with metadata delimiters. It splits the input on double separators, then extracts + any postfix metadata from the last part. + + Args: + field (str): The escaped string containing doubled separators and optional postfix metadata + separator (str, optional): The separator character used for escaping. Defaults to "_" + Returns: + tuple[str, str | None]: A tuple containing: + - The unescaped string with single separators restored + - The postfix metadata if present, None otherwise + Examples: + >>> unescape_with_postfix("field1__field2__field3") + ('field1_field2_field3', None) + >>> unescape_with_postfix("field1__field2_metadata") + ('field1_field2', 'metadata') + >>> unescape_with_postfix("simple") + ('simple', None) + >>> unescape_with_postfix("field1--field2", separator="-") + ('field1-field2', None) + >>> unescape_with_postfix("field1--field2-meta", separator="-") + ('field1-field2', 'meta') + """ + + parts = field.split(separator * 2) + parts[-1], *meta = parts[-1].split("_", 1) + return separator.join(parts), meta[0] if meta else None + + def find_noncolliding_name(name: str, lut: dict) -> str: """ Generate a unique name that does not collide with existing keys in a lookup table (lut). From 51f3da283c2f7cd34fd2f2bb2abd12c43fd80901 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 2 Jul 2025 00:54:55 +0000 Subject: [PATCH 038/224] fix: cleanup imports and fix issue in recursive structure processing --- src/orcapod/core/sources.py | 67 +++++++++++++- src/orcapod/core/streams.py | 97 +++++++++++++++++++++ src/orcapod/core/tracker.py | 73 +--------------- src/orcapod/hashing/arrow_utils.py | 4 +- src/orcapod/hashing/content_identifiable.py | 14 ++- src/orcapod/hashing/defaults.py | 7 +- src/orcapod/hashing/hash_utils.py | 2 + src/orcapod/stores/file_utils.py | 2 +- src/orcapod/types/core.py | 10 ++- src/orcapod/types/packets.py | 12 ++- src/orcapod/types/typespec_utils.py | 2 +- 11 files changed, 199 insertions(+), 91 deletions(-) diff --git a/src/orcapod/core/sources.py b/src/orcapod/core/sources.py index 3d79e7a..b1dca7d 100644 --- a/src/orcapod/core/sources.py +++ b/src/orcapod/core/sources.py @@ -3,10 +3,17 @@ from pathlib import Path from typing import Any, Literal +import polars as pl + from orcapod.core.base import Source from orcapod.hashing.legacy_core import hash_function -from orcapod.core.streams import SyncStream, SyncStreamFromGenerator -from orcapod.types import Packet, Tag +from orcapod.core.streams import ( + PolarsStream, + SyncStream, + SyncStreamFromGenerator, + StreamWrapper, +) +from orcapod.types import Packet, Tag, TypeSpec class GlobSource(Source): @@ -139,3 +146,59 @@ def claims_unique_tags( return True # Otherwise, delegate to the base class return super().claims_unique_tags(trigger_run=trigger_run) + + +class PolarsSource(Source): + def __init__( + self, + df: pl.DataFrame, + tag_keys: Collection[str], + packet_keys: Collection[str] | None = None, + ): + self.df = df + self.tag_keys = tag_keys + self.packet_keys = packet_keys + + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: + if len(streams) != 0: + raise ValueError( + "PolarsSource does not support forwarding streams. " + "It generates its own stream from the DataFrame." 
+ ) + return PolarsStream(self.df, self.tag_keys, self.packet_keys) + + +class StreamSource(Source): + def __init__(self, stream: SyncStream, **kwargs): + super().__init__(skip_tracking=True, **kwargs) + self.stream = stream + + def forward(self, *streams: SyncStream) -> SyncStream: + if len(streams) != 0: + raise ValueError( + "StreamSource does not support forwarding streams. " + "It generates its own stream from the file system." + ) + return StreamWrapper(self.stream) + + def identity_structure(self, *streams) -> Any: + if len(streams) != 0: + raise ValueError( + "StreamSource does not support forwarding streams. " + "It generates its own stream from the file system." + ) + + return (self.__class__.__name__, self.stream) + + def types( + self, *streams: SyncStream, **kwargs + ) -> tuple[TypeSpec | None, TypeSpec | None]: + return self.stream.types() + + def keys( + self, *streams: SyncStream, **kwargs + ) -> tuple[Collection[str] | None, Collection[str] | None]: + return self.stream.keys() + + def computed_label(self) -> str | None: + return self.stream.label diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index 243a1f4..170c80d 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -1,5 +1,7 @@ from collections.abc import Callable, Collection, Iterator +import polars as pl + from orcapod.core.base import SyncStream from orcapod.types import Packet, PacketLike, Tag, TypeSpec from copy import copy @@ -104,3 +106,98 @@ def keys( return super().keys(trigger_run=trigger_run) # If the keys are already set, return them return self.tag_keys.copy(), self.packet_keys.copy() + + +class PolarsStream(SyncStream): + def __init__( + self, + df: pl.DataFrame, + tag_keys: Collection[str], + packet_keys: Collection[str] | None = None, + ): + self.df = df + self.tag_keys = tuple(tag_keys) + self.packet_keys = tuple(packet_keys) if packet_keys is not None else None + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + df = self.df + # if self.packet_keys is not None: + # df = df.select(self.tag_keys + self.packet_keys) + for row in df.iter_rows(named=True): + tag = {key: row[key] for key in self.tag_keys} + packet = { + key: val + for key, val in row.items() + if key not in self.tag_keys and not key.startswith("_source_info_") + } + # TODO: revisit and fix this rather hacky implementation + source_info = { + key.removeprefix("_source_info_"): val + for key, val in row.items() + if key.startswith("_source_info_") + } + yield tag, Packet(packet, source_info=source_info) + + +class EmptyStream(SyncStream): + def __init__( + self, + tag_keys: Collection[str] | None = None, + packet_keys: Collection[str] | None = None, + tag_typespec: TypeSpec | None = None, + packet_typespec: TypeSpec | None = None, + ): + if tag_keys is None and tag_typespec is not None: + tag_keys = tag_typespec.keys() + self.tag_keys = list(tag_keys) if tag_keys else [] + + if packet_keys is None and packet_typespec is not None: + packet_keys = packet_typespec.keys() + self.packet_keys = list(packet_keys) if packet_keys else [] + + self.tag_typespec = tag_typespec + self.packet_typespec = packet_typespec + + def keys( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[Collection[str] | None, Collection[str] | None]: + return self.tag_keys, self.packet_keys + + def types( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[TypeSpec | None, TypeSpec | None]: + return self.tag_typespec, self.packet_typespec + + def __iter__(self) -> Iterator[tuple[Tag, 
Packet]]: + # Empty stream, no data to yield + return iter([]) + + +class StreamWrapper(SyncStream): + """ + A wrapper for a SyncStream that allows the stream to be labeled and + associated with an invocation without modifying the original stream. + """ + + def __init__(self, stream: SyncStream, **kwargs): + super().__init__(**kwargs) + self.stream = stream + + def keys( + self, *streams: SyncStream, **kwargs + ) -> tuple[Collection[str] | None, Collection[str] | None]: + return self.stream.keys(*streams, **kwargs) + + def types( + self, *streams: SyncStream, **kwargs + ) -> tuple[TypeSpec | None, TypeSpec | None]: + return self.stream.types(*streams, **kwargs) + + def computed_label(self) -> str | None: + return self.stream.label + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + """ + Iterate over the stream, yielding tuples of (tags, packets). + """ + yield from self.stream diff --git a/src/orcapod/core/tracker.py b/src/orcapod/core/tracker.py index 8f07ae3..e0a2bd7 100644 --- a/src/orcapod/core/tracker.py +++ b/src/orcapod/core/tracker.py @@ -1,74 +1,5 @@ -from orcapod.core.base import Invocation, Kernel, Tracker, SyncStream, Source -from orcapod.types import Tag, Packet, TypeSpec -from collections.abc import Collection, Iterator -from typing import Any - - -class StreamWrapper(SyncStream): - """ - A wrapper for a SyncStream that allows it to be used as a Source. - This is useful for cases where you want to treat a stream as a source - without modifying the original stream. - """ - - def __init__(self, stream: SyncStream, **kwargs): - super().__init__(**kwargs) - self.stream = stream - - def keys( - self, *streams: SyncStream, **kwargs - ) -> tuple[Collection[str] | None, Collection[str] | None]: - return self.stream.keys(*streams, **kwargs) - - def types( - self, *streams: SyncStream, **kwargs - ) -> tuple[TypeSpec | None, TypeSpec | None]: - return self.stream.types(*streams, **kwargs) - - def computed_label(self) -> str | None: - return self.stream.label - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - """ - Iterate over the stream, yielding tuples of (tags, packets). - """ - yield from self.stream - - -class StreamSource(Source): - def __init__(self, stream: SyncStream, **kwargs): - super().__init__(skip_tracking=True, **kwargs) - self.stream = stream - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 0: - raise ValueError( - "StreamSource does not support forwarding streams. " - "It generates its own stream from the file system." - ) - return StreamWrapper(self.stream) - - def identity_structure(self, *streams) -> Any: - if len(streams) != 0: - raise ValueError( - "StreamSource does not support forwarding streams. " - "It generates its own stream from the file system." 
- ) - - return (self.__class__.__name__, self.stream) - - def types( - self, *streams: SyncStream, **kwargs - ) -> tuple[TypeSpec | None, TypeSpec | None]: - return self.stream.types() - - def keys( - self, *streams: SyncStream, **kwargs - ) -> tuple[Collection[str] | None, Collection[str] | None]: - return self.stream.keys() - - def computed_label(self) -> str | None: - return self.stream.label +from orcapod.core.base import Invocation, Kernel, Tracker +from orcapod.core.sources import StreamSource class GraphTracker(Tracker): diff --git a/src/orcapod/hashing/arrow_utils.py b/src/orcapod/hashing/arrow_utils.py index 168c53f..0d46cd7 100644 --- a/src/orcapod/hashing/arrow_utils.py +++ b/src/orcapod/hashing/arrow_utils.py @@ -1,7 +1,7 @@ import pyarrow as pa import json import hashlib -from typing import Dict, List, Any +from typing import Dict, Any from decimal import Decimal import base64 @@ -168,7 +168,7 @@ def _arrow_type_to_python_type(arrow_type: pa.DataType) -> str: return str(arrow_type).lower() -def _extract_semantic_metadata(field_metadata) -> Dict[str, str]: +def _extract_semantic_metadata(field_metadata) -> dict[str, str]: """ Extract only 'semantic_type' metadata from field metadata. diff --git a/src/orcapod/hashing/content_identifiable.py b/src/orcapod/hashing/content_identifiable.py index 1581e62..ce1b6c3 100644 --- a/src/orcapod/hashing/content_identifiable.py +++ b/src/orcapod/hashing/content_identifiable.py @@ -1,9 +1,19 @@ -from .types import ObjectHasher -from .defaults import get_default_object_hasher +from orcapod.hashing.types import ObjectHasher +from orcapod.hashing.defaults import get_default_object_hasher from typing import Any class ContentIdentifiableBase: + """ + Base class for content-identifiable objects. + This class provides a way to define objects that can be uniquely identified + based on their content rather than their identity in memory. Specifically, the identity of the + object is determined by the structure returned by the `identity_structure` method. + The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, + which defaults to the one returned by `get_default_object_hasher`. + Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. + """ + def __init__( self, identity_structure_hasher: ObjectHasher | None = None, diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index a9aebcd..3bae548 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -3,16 +3,13 @@ from orcapod.hashing.types import ( LegacyCompositeFileHasher, ArrowHasher, - FileContentHasher, StringCacher, ) -from orcapod.hashing.file_hashers import BasicFileHasher, LegacyPathLikeHasherFactory +from orcapod.hashing.file_hashers import LegacyPathLikeHasherFactory from orcapod.hashing.string_cachers import InMemoryCacher from orcapod.hashing.object_hashers import ObjectHasher from orcapod.hashing.object_hashers import LegacyObjectHasher from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory -from orcapod.hashing.arrow_hashers import SemanticArrowHasher -from orcapod.hashing.semantic_type_hashers import PathHasher from orcapod.hashing.versioned_hashers import ( get_versioned_semantic_arrow_hasher, get_versioned_object_hasher, @@ -24,7 +21,7 @@ def get_default_arrow_hasher( ) -> ArrowHasher: """ Get the default Arrow hasher with semantic type support. 
- If `with_cache` is True, it uses an in-memory cacher for caching hash values. + If `cache_file_hash` is True, it uses an in-memory cacher for caching hash values. If a `StringCacher` is provided, it uses that for caching file hashes. """ arrow_hasher = get_versioned_semantic_arrow_hasher() if cache_file_hash: diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 7fee36b..0dc0777 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -48,6 +48,8 @@ def process_structure( # Initialize the visited set if this is the top-level call if visited is None: visited = set() + else: + visited = visited.copy() # Copy to avoid modifying the original set # Check for circular references - use object's memory address # NOTE: While id() is not stable across sessions, we only use it within a session diff --git a/src/orcapod/stores/file_utils.py b/src/orcapod/stores/file_utils.py index 34380e0..712aada 100644 --- a/src/orcapod/stores/file_utils.py +++ b/src/orcapod/stores/file_utils.py @@ -384,7 +384,7 @@ def virtual_mount( new_packet = {} for key, value in packet.items(): - new_packet[key] = convert_pathset(value, forward_lut, reverse_lut) + new_packet[key] = convert_pathset(value, forward_lut, reverse_lut) # type: ignore return new_packet, forward_lut, reverse_lut diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 12448f8..62c100d 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -1,6 +1,4 @@ -from typing import Protocol, Any, TypeAlias, TypeVar, Generic -import pyarrow as pa -from dataclasses import dataclass +from typing import Protocol, Any, TypeAlias import os from collections.abc import Collection, Mapping @@ -34,7 +32,11 @@ # Extended data values that can be stored in packets # Either the original PathSet or one of our supported simple data types DataValue: TypeAlias = ( - PathSet | SupportedNativePythonData | None | Collection["DataValue"] + PathSet + | SupportedNativePythonData + | None + | Collection["DataValue"] + | Mapping[str, "DataValue"] ) diff --git a/src/orcapod/types/packets.py b/src/orcapod/types/packets.py index 47df081..a5836b1 100644 --- a/src/orcapod/types/packets.py +++ b/src/orcapod/types/packets.py @@ -4,9 +4,17 @@ from orcapod.types.core import TypeSpec, Tag, TypeHandler from orcapod.types.semantic_type_registry import SemanticTypeRegistry from orcapod.types import schemas +from orcapod.types.typespec_utils import get_typespec_from_dict import pyarrow as pa -# # a packet is a mapping from string keys to data values +# A conveniece packet-like type that defines a value that can be +# converted to a packet. It's broader than Packet and a simple mapping +# from string keys to DataValue (e.g., int, float, str) can be regarded +# as PacketLike, allowing for more flexible interfaces. +# Anything that requires Packet-like data but without the strict features +# of a Packet should accept PacketLike. +# One should be careful when using PacketLike as a return type as it does not +# enforce the typespec or source_info, which are important for packet integrity. 
PacketLike: TypeAlias = Mapping[str, DataValue] @@ -21,8 +29,6 @@ def __init__( obj = {} super().__init__(obj) if typespec is None: - from orcapod.types.typespec_utils import get_typespec_from_dict - typespec = get_typespec_from_dict(self) self._typespec = typespec if source_info is None: diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index 4e48004..a0a3c58 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -2,7 +2,7 @@ from collections.abc import Callable, Collection, Sequence, Mapping from typing import get_origin, get_args, Any -from .core import TypeSpec +from orcapod.types.core import TypeSpec import inspect import logging From 3d54067bb87f231431b5ae1f85e5d54ca5e3d612 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 2 Jul 2025 00:55:40 +0000 Subject: [PATCH 039/224] refactor: add more robust arrow serialization strategy and use @ for separator in hash --- src/orcapod/hashing/arrow_hashers.py | 36 ++++++++++++++++++---------- src/orcapod/hashing/types.py | 2 +- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 3545911..a7b5a01 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -1,11 +1,15 @@ import hashlib from typing import Any import pyarrow as pa -import pyarrow.ipc as ipc -from io import BytesIO import polars as pl import json from orcapod.hashing.types import SemanticTypeHasher, StringCacher +from orcapod.hashing import arrow_serialization_old +from collections.abc import Callable + +SERIALIZATION_METHOD_LUT: dict[str, Callable[[pa.Table], bytes]] = { + "logical": arrow_serialization_old.serialize_table_logical, +} def serialize_pyarrow_table(table: pa.Table) -> str: @@ -51,6 +55,7 @@ def __init__( chunk_size: int = 8192, handle_missing: str = "error", semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, + serialization_method: str = "logical", ): """ Initialize SemanticArrowHasher. @@ -66,6 +71,13 @@ def __init__( semantic_type_hashers or {} ) self.hash_algorithm = hash_algorithm + if serialization_method not in SERIALIZATION_METHOD_LUT: + raise ValueError( + f"Invalid serialization method '{serialization_method}'. 
" + f"Supported methods: {list(SERIALIZATION_METHOD_LUT.keys())}" + ) + self.serialization_method = serialization_method + self._serialize_arrow_table = SERIALIZATION_METHOD_LUT[serialization_method] def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: """ @@ -167,16 +179,16 @@ def _sort_table_columns(self, table: pa.Table) -> pa.Table: sorted_schema = pa.schema(sorted_fields) return pa.table(sorted_columns, schema=sorted_schema) - def _serialize_table_ipc(self, table: pa.Table) -> bytes: - # TODO: fix and use logical table hashing instead - """Serialize table using Arrow IPC format for stable binary representation.""" - buffer = BytesIO() + # def _serialize_table_ipc(self, table: pa.Table) -> bytes: + # # TODO: fix and use logical table hashing instead + # """Serialize table using Arrow IPC format for stable binary representation.""" + # buffer = BytesIO() - # Use IPC stream format for deterministic serialization - with ipc.new_stream(buffer, table.schema) as writer: - writer.write_table(table) + # # Use IPC stream format for deterministic serialization + # with ipc.new_stream(buffer, table.schema) as writer: + # writer.write_table(table) - return buffer.getvalue() + # return buffer.getvalue() def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: """ @@ -200,7 +212,7 @@ def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: sorted_table = pl.DataFrame(sorted_table).to_arrow() # Step 3: Serialize using Arrow IPC format - serialized_bytes = self._serialize_table_ipc(sorted_table) + serialized_bytes = self._serialize_arrow_table(sorted_table) # Step 4: Compute final hash hasher = hashlib.new(self.hash_algorithm) @@ -208,7 +220,7 @@ def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: hash_str = hasher.hexdigest() if prefix_hasher_id: - hash_str = f"{self.get_hasher_id()}:{hash_str}" + hash_str = f"{self.get_hasher_id()}@{hash_str}" return hash_str diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index fabf812..6306d94 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -63,7 +63,7 @@ def hash_to_hex( ) hex_str = hex_str[:char_count] if prefix_hasher_id: - hex_str = self.get_hasher_id() + ":" + hex_str + hex_str = self.get_hasher_id() + "@" + hex_str return hex_str def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: From 1ac2be69b5f7770ff7b1d89bacd872765b343694 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 2 Jul 2025 00:56:02 +0000 Subject: [PATCH 040/224] feat: logical serialization for arrow table --- src/orcapod/hashing/arrow_serialization.py | 1036 ++++++++++++++------ 1 file changed, 730 insertions(+), 306 deletions(-) diff --git a/src/orcapod/hashing/arrow_serialization.py b/src/orcapod/hashing/arrow_serialization.py index e4926cb..fa0500f 100644 --- a/src/orcapod/hashing/arrow_serialization.py +++ b/src/orcapod/hashing/arrow_serialization.py @@ -1,47 +1,396 @@ import pyarrow as pa +import pyarrow.compute as pc from io import BytesIO -import pyarrow.ipc as ipc import struct from typing import Any import hashlib -def serialize_table_ipc(table: pa.Table) -> bytes: - # TODO: fix and use logical table hashing instead - """Serialize table using Arrow IPC format for stable binary representation.""" - buffer = BytesIO() +def bool_sequence_to_byte(sequence: list[bool]) -> bytes: + """Convert a sequence of booleans to a byte array.""" + if len(sequence) > 8: + raise ValueError("Sequence length exceeds 8 bits, cannot fit in a byte.") + mask = 1 + flags = 0 + for value in sequence: + if value: + flags |= mask + mask <<= 1 + return struct.pack(" bytes: + """Serialize order options to bytes for inclusion in format.""" + flags = 0 + if self.ignore_column_order: + flags |= 1 + if self.ignore_row_order: + flags |= 2 + return struct.pack(" "OrderOptions": + """Deserialize order options from bytes.""" + flags = struct.unpack(" pa.Array: + """ + Convert any Arrow array to string representation for sorting purposes. + Handles all data types including complex ones. + """ + if pa.types.is_string(array.type) or pa.types.is_large_string(array.type): + # Already string + return array + + elif pa.types.is_binary(array.type) or pa.types.is_large_binary(array.type): + # Convert binary to base64 string representation for deterministic sorting + try: + # Use Arrow's base64 encoding if available + import base64 + + str_values = [] + # Get null mask + null_mask = pc.is_null(array) # type: ignore + for i in range(len(array)): + if null_mask[i].as_py(): + str_values.append(None) # Will be handled by fill_null later + else: + binary_val = array[i].as_py() + if binary_val is not None: + str_values.append(base64.b64encode(binary_val).decode("ascii")) + else: + str_values.append(None) + return pa.array(str_values, type=pa.string()) + except Exception: + # Fallback: convert to hex string + str_values = [] + try: + null_mask = pc.is_null(array) # type: ignore + for i in range(len(array)): + if null_mask[i].as_py(): + str_values.append(None) + else: + try: + binary_val = array[i].as_py() + if binary_val is not None: + str_values.append(binary_val.hex()) + else: + str_values.append(None) + except Exception: + str_values.append(f"BINARY_{i}") + except Exception: + # If null checking fails, just convert all values + for i in range(len(array)): + try: + binary_val = array[i].as_py() + if binary_val is not None: + str_values.append(binary_val.hex()) + else: + str_values.append(None) + except Exception: + str_values.append(f"BINARY_{i}") + return pa.array(str_values, type=pa.string()) + + elif _is_primitive_type(array.type): + # Convert primitive types to string + try: + return pc.cast(array, pa.string()) + except Exception: + # Manual conversion for types that don't cast well + str_values = [] + try: + null_mask = pc.is_null(array) # type: ignore + for i in range(len(array)): + if null_mask[i].as_py(): + str_values.append(None) + else: + try: + value = array[i].as_py() + str_values.append(str(value)) + except 
Exception: + str_values.append(f"PRIMITIVE_{i}") + except Exception: + # If null checking fails, just convert all values + for i in range(len(array)): + try: + value = array[i].as_py() + if value is not None: + str_values.append(str(value)) + else: + str_values.append(None) + except Exception: + str_values.append(f"PRIMITIVE_{i}") + return pa.array(str_values, type=pa.string()) + + elif pa.types.is_list(array.type) or pa.types.is_large_list(array.type): + # Convert list to string representation + str_values = [] + try: + null_mask = pc.is_null(array) # type: ignore + for i in range(len(array)): + if null_mask[i].as_py(): + str_values.append(None) + else: + try: + value = array[i].as_py() + # Sort list elements for consistent representation + if value is not None: + sorted_value = sorted( + value, key=lambda x: (x is None, str(x)) + ) + str_values.append(str(sorted_value)) + else: + str_values.append(None) + except Exception: + str_values.append(f"LIST_{i}") + except Exception: + # If null checking fails, just convert all values + for i in range(len(array)): + try: + value = array[i].as_py() + if value is not None: + sorted_value = sorted(value, key=lambda x: (x is None, str(x))) + str_values.append(str(sorted_value)) + else: + str_values.append(None) + except Exception: + str_values.append(f"LIST_{i}") + return pa.array(str_values, type=pa.string()) + + elif pa.types.is_struct(array.type): + # Convert struct to string representation + str_values = [] + try: + null_mask = pc.is_null(array) # type: ignore + for i in range(len(array)): + if null_mask[i].as_py(): + str_values.append(None) + else: + try: + value = array[i].as_py() + if value is not None: + # Sort dict keys for consistent representation + if isinstance(value, dict): + sorted_items = sorted( + value.items(), key=lambda x: str(x[0]) + ) + str_values.append(str(dict(sorted_items))) + else: + str_values.append(str(value)) + else: + str_values.append(None) + except Exception: + str_values.append(f"STRUCT_{i}") + except Exception: + # If null checking fails, just convert all values + for i in range(len(array)): + try: + value = array[i].as_py() + if value is not None: + if isinstance(value, dict): + sorted_items = sorted( + value.items(), key=lambda x: str(x[0]) + ) + str_values.append(str(dict(sorted_items))) + else: + str_values.append(str(value)) + else: + str_values.append(None) + except Exception: + str_values.append(f"STRUCT_{i}") + return pa.array(str_values, type=pa.string()) + + elif pa.types.is_dictionary(array.type): + # Convert dictionary to string representation using the decoded values + str_values = [] + try: + null_mask = pc.is_null(array) # type: ignore + for i in range(len(array)): + if null_mask[i].as_py(): + str_values.append(None) + else: + try: + value = array[i].as_py() + str_values.append(str(value)) + except Exception: + str_values.append(f"DICT_{i}") + except Exception: + # If null checking fails, just convert all values + for i in range(len(array)): + try: + value = array[i].as_py() + if value is not None: + str_values.append(str(value)) + else: + str_values.append(None) + except Exception: + str_values.append(f"DICT_{i}") + return pa.array(str_values, type=pa.string()) + + else: + # Generic fallback for any other types + try: + return pc.cast(array, pa.string()) + except Exception: + # Manual conversion as last resort + str_values = [] + try: + null_mask = pc.is_null(array) # type: ignore + for i in range(len(array)): + if null_mask[i].as_py(): + str_values.append(None) + else: + try: + value = 
array[i].as_py() + str_values.append(str(value)) + except Exception: + str_values.append(f"UNKNOWN_{array.type}_{i}") + except Exception: + # If null checking fails, just convert all values + for i in range(len(array)): + try: + value = array[i].as_py() + if value is not None: + str_values.append(str(value)) + else: + str_values.append(None) + except Exception: + str_values.append(f"UNKNOWN_{array.type}_{i}") + return pa.array(str_values, type=pa.string()) + + +def _create_row_sort_key(table: pa.Table) -> pa.Array: + """ + Create a deterministic sort key for rows by combining all column values. + This ensures consistent row ordering regardless of input order. + """ + if table.num_rows == 0: + return pa.array([], type=pa.string()) + + # Convert each column to string representation for sorting + sort_components = [] + + for i in range(table.num_columns): + column = table.column(i) + field = table.schema.field(i) + + # Combine all chunks into a single array + if column.num_chunks > 1: + combined_array = pa.concat_arrays(column.chunks) + elif column.num_chunks == 1: + combined_array = column.chunk(0) + else: + combined_array = pa.array([], type=field.type) + # Convert to string representation for sorting + str_array = _convert_array_to_string_for_sorting(combined_array) -def serialize_table_logical(table: pa.Table) -> bytes: + # Handle nulls by replacing with a consistent null representation + str_array = pc.fill_null(str_array, "NULL") + sort_components.append(str_array) + + # Combine all columns into a single sort key + if len(sort_components) == 1: + return sort_components[0] + else: + # Concatenate all string representations with separators + separator = pa.scalar("||") + combined = sort_components[0] + for component in sort_components[1:]: + combined = pc.binary_join_element_wise(combined, separator, component) # type: ignore + return combined + + +def _sort_table_by_content(table: pa.Table) -> pa.Table: + """Sort table rows based on content for deterministic ordering.""" + if table.num_rows <= 1: + return table + + # Create sort key + sort_key = _create_row_sort_key(table) + + # Get sort indices + sort_indices = pc.sort_indices(sort_key) # type: ignore + + # Apply sort to table + return pc.take(table, sort_indices) + + +def _sort_table_columns_by_name(table: pa.Table) -> pa.Table: + """Sort table columns alphabetically by name for deterministic ordering.""" + if table.num_columns <= 1: + return table + + # Get column names and sort them + column_names = [field.name for field in table.schema] + sorted_names = sorted(column_names) + + # If already sorted, return as-is + if column_names == sorted_names: + return table + + # Reorder columns + return table.select(sorted_names) + + +def serialize_table_logical( + table: pa.Table, order_options: OrderOptions | None = None +) -> bytes: """ Serialize table using column-wise processing with direct binary data access. This implementation works directly with Arrow's underlying binary buffers without converting to Python objects, making it much faster and more memory efficient while maintaining high repeatability. 
+ + Args: + table: PyArrow table to serialize + order_options: Options for handling column and row order independence """ + if order_options is None: + order_options = OrderOptions() + buffer = BytesIO() # Write format version - buffer.write(b"ARROW_BINARY_V1") + buffer.write(b"ARROW_BINARY_V1") # Updated version to include order options + + # Write order options + buffer.write(order_options.to_bytes()) + + # Apply ordering transformations if requested + processed_table = table + + if order_options.ignore_column_order: + processed_table = _sort_table_columns_by_name(processed_table) + + if order_options.ignore_row_order: + processed_table = _sort_table_by_content(processed_table) # Serialize schema deterministically - _serialize_schema_deterministic(buffer, table.schema) + _serialize_schema_deterministic(buffer, processed_table.schema) # Process each column using direct binary access column_digests = [] - for i in range(table.num_columns): - column = table.column(i) - field = table.schema.field(i) + for i in range(processed_table.num_columns): + column = processed_table.column(i) + field = processed_table.schema.field(i) column_digest = _serialize_column_binary(column, field) column_digests.append(column_digest) @@ -132,7 +481,7 @@ def _serialize_column_binary(column: pa.ChunkedArray, field: pa.Field) -> bytes: # Combine all chunks into a single array for consistent processing if column.num_chunks > 1: - # Multiple chunks - combine them + # Multiple chunks - combine them using pa.concat_arrays combined_array = pa.concat_arrays(column.chunks) elif column.num_chunks == 1: # Single chunk - use directly @@ -158,26 +507,37 @@ def _serialize_array_binary(array: pa.Array, data_type: pa.DataType) -> bytes: validity_buffer = array.buffers()[0] # Process based on Arrow type, accessing buffers directly - if _is_primitive_type(data_type): - _serialize_primitive_array_binary(buffer, array, data_type, validity_buffer) + try: + if _is_primitive_type(data_type): + _serialize_primitive_array_binary(buffer, array, data_type, validity_buffer) - elif pa.types.is_string(data_type) or pa.types.is_large_string(data_type): - _serialize_string_array_binary(buffer, array, data_type, validity_buffer) + elif pa.types.is_string(data_type) or pa.types.is_large_string(data_type): + _serialize_string_array_binary(buffer, array, data_type, validity_buffer) - elif pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type): - _serialize_binary_array_binary(buffer, array, data_type, validity_buffer) + elif pa.types.is_binary(data_type) or pa.types.is_large_binary(data_type): + _serialize_binary_array_binary(buffer, array, data_type, validity_buffer) - elif pa.types.is_list(data_type) or pa.types.is_large_list(data_type): - _serialize_list_array_binary(buffer, array, data_type, validity_buffer) + elif pa.types.is_list(data_type) or pa.types.is_large_list(data_type): + _serialize_list_array_binary(buffer, array, data_type, validity_buffer) - elif pa.types.is_struct(data_type): - _serialize_struct_array_binary(buffer, array, data_type, validity_buffer) + elif pa.types.is_struct(data_type): + _serialize_struct_array_binary(buffer, array, data_type, validity_buffer) - elif pa.types.is_dictionary(data_type): - _serialize_dictionary_array_binary(buffer, array, data_type, validity_buffer) + elif pa.types.is_dictionary(data_type): + _serialize_dictionary_array_binary( + buffer, array, data_type, validity_buffer + ) - else: - # Fallback to element-wise processing for complex types + else: + # Fallback to element-wise 
processing for complex types + _serialize_array_fallback(buffer, array, data_type, validity_buffer) + + except Exception as e: + # If binary serialization fails, fall back to element-wise processing + print( + f"Warning: Binary serialization failed for {data_type}, falling back to element-wise: {e}" + ) + buffer = BytesIO() # Reset buffer _serialize_array_fallback(buffer, array, data_type, validity_buffer) return buffer.getvalue() @@ -252,17 +612,31 @@ def _serialize_list_array_binary( if offset_buffer is not None: buffer.write(offset_buffer.to_pybytes()) - # Get child array - handle both .children and .values access patterns + # Get child array - handle different access patterns child_array = None + + # Method 1: Try .values (most common for ListArray) if hasattr(array, "values") and array.values is not None: child_array = array.values - elif hasattr(array, "children") and len(array.children) > 0: + + # Method 2: Try .children (some array types) + elif hasattr(array, "children") and array.children and len(array.children) > 0: child_array = array.children[0] + # Method 3: Try accessing via flatten() for some list types + elif hasattr(array, "flatten"): + try: + child_array = array.flatten() + except Exception: + pass + # Recursively serialize child array if child_array is not None: child_data = _serialize_array_binary(child_array, data_type.value_type) buffer.write(child_data) + else: + # If we can't access child arrays directly, fall back to element-wise processing + _serialize_array_fallback(buffer, array, data_type, validity_buffer) def _serialize_struct_array_binary( @@ -272,8 +646,27 @@ def _serialize_struct_array_binary( # Write validity bitmap _serialize_validity_buffer(buffer, validity_buffer) + # Get child arrays - handle different access patterns for StructArray + child_arrays = [] + if hasattr(array, "field"): + # StructArray uses .field(i) to access child arrays + for i in range(len(data_type)): + child_arrays.append(array.field(i)) + elif hasattr(array, "children") and array.children: + # Some array types use .children + child_arrays = array.children + else: + # Fallback: try to access fields by iterating + try: + for i in range(len(data_type)): + child_arrays.append(array.field(i)) + except (AttributeError, IndexError): + # If all else fails, use element-wise processing + _serialize_array_fallback(buffer, array, data_type, validity_buffer) + return + # Serialize each child field - for i, child_array in enumerate(array.children): + for i, child_array in enumerate(child_arrays): field_type = data_type[i].type child_data = _serialize_array_binary(child_array, field_type) buffer.write(child_data) @@ -303,30 +696,6 @@ def _serialize_validity_buffer(buffer: BytesIO, validity_buffer): # If no validity buffer, there are no nulls (implicit) -def _serialize_boolean_buffer(buffer: BytesIO, data_buffer, array_length: int): - """Serialize boolean buffer (bit-packed).""" - # Boolean data is bit-packed, copy directly - bool_bytes = data_buffer.to_pybytes() - buffer.write(struct.pack(" int: - """Get byte width of primitive types.""" - if pa.types.is_boolean(data_type): - return 1 # Bit-packed, but minimum 1 byte - elif pa.types.is_integer(data_type) or pa.types.is_floating(data_type): - return data_type.bit_width // 8 - elif pa.types.is_date(data_type): - return 4 if data_type == pa.date32() else 8 - elif pa.types.is_time(data_type) or pa.types.is_timestamp(data_type): - return data_type.bit_width // 8 - else: - return 8 # Default - - def _serialize_array_fallback( buffer: BytesIO, array: 
pa.Array, data_type: pa.DataType, validity_buffer ): @@ -334,35 +703,132 @@ def _serialize_array_fallback( # Write validity bitmap _serialize_validity_buffer(buffer, validity_buffer) - # Process element by element (only for types that need it) + # Process element by element for i in range(len(array)): - if array.is_null(i): + try: + null_mask = pc.is_null(array) # type: ignore + is_null = null_mask[i].as_py() + except: + # Fallback null check + try: + value = array[i].as_py() + is_null = value is None + except: + is_null = False + + if is_null: buffer.write(b"\x00") else: buffer.write(b"\x01") - # For complex nested types, we might still need .as_py() - # But this should be rare with proper binary handling above - value = array[i].as_py() - _serialize_complex_value(buffer, value, data_type) + + # For complex nested types, convert to Python and serialize + try: + value = array[i].as_py() + _serialize_complex_value(buffer, value, data_type) + except Exception as e: + # If .as_py() fails, try alternative approaches + try: + # For some array types, we can access scalar values differently + scalar = array[i] + if hasattr(scalar, "value"): + value = scalar.value + else: + value = str(scalar) # Convert to string as last resort + _serialize_complex_value(buffer, value, data_type) + except Exception: + # Absolute fallback - serialize type name and index + fallback_str = f"{data_type}[{i}]" + fallback_bytes = fallback_str.encode("utf-8") + buffer.write(struct.pack(" str: +def serialize_table_logical_hash( + table: pa.Table, + algorithm: str = "sha256", + order_options: OrderOptions | None = None, +) -> str: """Create deterministic hash using binary serialization.""" - serialized = serialize_table_logical(table) + serialized = serialize_table_logical(table, order_options) if algorithm == "sha256": hasher = hashlib.sha256() @@ -377,27 +843,44 @@ def serialize_table_logical_hash(table: pa.Table, algorithm: str = "sha256") -> return hasher.hexdigest() -def serialize_table_logical_streaming(table: pa.Table) -> str: +def serialize_table_logical_streaming( + table: pa.Table, order_options: OrderOptions | None = None +) -> str: """ Memory-efficient streaming version that produces the same hash as serialize_table_logical_hash. This version processes data in streaming fashion but maintains the same logical structure as the non-streaming version to ensure identical hashes and chunking independence. 
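    A short sketch of the equivalence this function is meant to guarantee, using only
    names defined in this module:

        opts = OrderOptions(True, False)  # column-order independent only
        full_hash = serialize_table_logical_hash(table, order_options=opts)
        stream_hash = serialize_table_logical_streaming(table, order_options=opts)
        assert full_hash == stream_hash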
""" + if order_options is None: + order_options = OrderOptions() + hasher = hashlib.sha256() # Hash format version (same as non-streaming) hasher.update(b"ARROW_BINARY_V1") + # Hash order options + hasher.update(order_options.to_bytes()) + + # Apply ordering transformations if requested + processed_table = table + + if order_options.ignore_column_order: + processed_table = _sort_table_columns_by_name(processed_table) + + if order_options.ignore_row_order: + processed_table = _sort_table_by_content(processed_table) + # Hash schema (same as non-streaming) schema_buffer = BytesIO() - _serialize_schema_deterministic(schema_buffer, table.schema) + _serialize_schema_deterministic(schema_buffer, processed_table.schema) hasher.update(schema_buffer.getvalue()) # Process each column using the same logic as non-streaming - for i in range(table.num_columns): - column = table.column(i) - field = table.schema.field(i) + for i in range(processed_table.num_columns): + column = processed_table.column(i) + field = processed_table.schema.field(i) # Use the same column serialization logic for chunking independence column_data = _serialize_column_binary(column, field) @@ -408,7 +891,46 @@ def serialize_table_logical_streaming(table: pa.Table) -> str: return hasher.hexdigest() -# Test utilities +# IPC serialization for comparison (updated to include order options for fair comparison) +def serialize_table_ipc( + table: pa.Table, order_options: OrderOptions | None = None +) -> bytes: + """Serialize table using Arrow IPC format for comparison.""" + from io import BytesIO + import pyarrow.ipc as ipc + + if order_options is None: + order_options = OrderOptions() + + buffer = BytesIO() + + # Add format version for consistency with logical serialization + buffer.write(b"ARROW_IPC_V2") + + # Add order options + buffer.write(order_options.to_bytes()) + + # Apply ordering transformations if requested + processed_table = table + + if order_options.ignore_column_order: + processed_table = _sort_table_columns_by_name(processed_table) + + if order_options.ignore_row_order: + processed_table = _sort_table_by_content(processed_table) + + # Standard IPC serialization + ipc_buffer = BytesIO() + with ipc.new_stream(ipc_buffer, processed_table.schema) as writer: + writer.write_table(processed_table) + + # Append IPC data + buffer.write(ipc_buffer.getvalue()) + + return buffer.getvalue() + + +# Test utilities (updated to test order independence) def create_test_table_1(): """Create a basic test table with various data types.""" return pa.table( @@ -494,45 +1016,6 @@ def create_test_table_different_chunking(): ) -def create_test_table_empty(): - """Create an empty table with same schema.""" - return pa.table( - { - "int32_col": pa.array([], type=pa.int32()), - "float64_col": pa.array([], type=pa.float64()), - "string_col": pa.array([], type=pa.string()), - "bool_col": pa.array([], type=pa.bool_()), - "binary_col": pa.array([], type=pa.binary()), - } - ) - - -def create_test_table_all_nulls(): - """Create a table with all null values.""" - return pa.table( - { - "int32_col": pa.array([None, None, None], type=pa.int32()), - "float64_col": pa.array([None, None, None], type=pa.float64()), - "string_col": pa.array([None, None, None], type=pa.string()), - "bool_col": pa.array([None, None, None], type=pa.bool_()), - "binary_col": pa.array([None, None, None], type=pa.binary()), - } - ) - - -def create_test_table_no_nulls(): - """Create a table with no null values.""" - return pa.table( - { - "int32_col": pa.array([1, 2, 3, 4, 5], 
type=pa.int32()), - "float64_col": pa.array([1.1, 2.2, 3.3, 4.4, 5.5], type=pa.float64()), - "string_col": pa.array(["hello", "world", "arrow", "fast", "data"]), - "bool_col": pa.array([True, False, True, False, True]), - "binary_col": pa.array([b"data1", b"data2", b"data3", b"data4", b"data5"]), - } - ) - - def create_test_table_complex_types(): """Create a table with complex nested types.""" return pa.table( @@ -557,29 +1040,13 @@ def create_test_table_complex_types(): ) -def create_test_table_single_column(): - """Create a table with just one column.""" - return pa.table({"single_col": pa.array([1, 2, 3, 4, 5], type=pa.int32())}) - - -def create_test_table_single_row(): - """Create a table with just one row.""" - return pa.table( - { - "int32_col": pa.array([42], type=pa.int32()), - "string_col": pa.array(["single"]), - "bool_col": pa.array([True]), - } - ) - - def run_comprehensive_tests(): - """Run comprehensive test suite for serialization.""" + """Run comprehensive test suite for serialization with order independence.""" import time - print("=" * 60) - print("COMPREHENSIVE ARROW SERIALIZATION TEST SUITE") - print("=" * 60) + print("=" * 70) + print("COMPREHENSIVE ARROW SERIALIZATION TEST SUITE (WITH ORDER OPTIONS)") + print("=" * 70) # Test cases test_cases = [ @@ -588,159 +1055,151 @@ def run_comprehensive_tests(): ("Reordered rows", create_test_table_reordered_rows), ("Different types", create_test_table_different_types), ("Different chunking", create_test_table_different_chunking), - ("Empty table", create_test_table_empty), - ("All nulls", create_test_table_all_nulls), - ("No nulls", create_test_table_no_nulls), ("Complex types", create_test_table_complex_types), - ("Single column", create_test_table_single_column), - ("Single row", create_test_table_single_row), ] - # Generate hashes for all test cases - results = {} + # Order option combinations to test + order_configs = [ + ("Default (order-sensitive)", OrderOptions(False, False)), + ("Column-order independent", OrderOptions(True, False)), + ("Row-order independent", OrderOptions(False, True)), + ("Fully order-independent", OrderOptions(True, True)), + ] - print("\n1. GENERATING HASHES FOR ALL TEST CASES") + print("\n1. ORDER INDEPENDENCE TESTS") print("-" * 50) - for name, create_func in test_cases: - try: - table = create_func() - - # Generate all hash types - logical_hash = serialize_table_logical_hash(table) - streaming_hash = serialize_table_logical_streaming(table) - ipc_hash = hashlib.sha256(serialize_table_ipc(table)).hexdigest() - - results[name] = { - "table": table, - "logical": logical_hash, - "streaming": streaming_hash, - "ipc": ipc_hash, - "rows": table.num_rows, - "cols": table.num_columns, - } - - print( - f"{name:20} | Rows: {table.num_rows:5} | Cols: {table.num_columns:2} | " - f"Logical: {logical_hash[:12]}... | IPC: {ipc_hash[:12]}..." - ) + base_table = create_test_table_1() + reordered_cols = create_test_table_reordered_columns() + reordered_rows = create_test_table_reordered_rows() - except Exception as e: - print(f"{name:20} | ERROR: {str(e)}") - results[name] = {"error": str(e)} + for config_name, order_opts in order_configs: + print(f"\n{config_name}:") + print(f" Config: {order_opts}") - print("\n2. 
DETERMINISM TESTS") - print("-" * 50) + # Test with base table + base_hash = serialize_table_logical_hash(base_table, order_options=order_opts) + cols_hash = serialize_table_logical_hash( + reordered_cols, order_options=order_opts + ) + rows_hash = serialize_table_logical_hash( + reordered_rows, order_options=order_opts + ) - base_table = create_test_table_1() + # Test streaming consistency + base_stream = serialize_table_logical_streaming( + base_table, order_options=order_opts + ) - # Test multiple runs of same table - logical_hashes = [serialize_table_logical_hash(base_table) for _ in range(5)] - streaming_hashes = [serialize_table_logical_streaming(base_table) for _ in range(5)] - ipc_hashes = [ - hashlib.sha256(serialize_table_ipc(base_table)).hexdigest() for _ in range(5) - ] + print(f" Base table: {base_hash[:12]}...") + print(f" Reordered columns: {cols_hash[:12]}...") + print(f" Reordered rows: {rows_hash[:12]}...") + print(f" Streaming matches: {base_hash == base_stream}") - print( - f"Logical deterministic: {len(set(logical_hashes)) == 1} ({len(set(logical_hashes))}/5 unique)" - ) - print( - f"Streaming deterministic: {len(set(streaming_hashes)) == 1} ({len(set(streaming_hashes))}/5 unique)" - ) - print( - f"IPC deterministic: {len(set(ipc_hashes)) == 1} ({len(set(ipc_hashes))}/5 unique)" - ) - print(f"Streaming == Logical: {streaming_hashes[0] == logical_hashes[0]}") + # Check expected behavior + cols_should_match = order_opts.ignore_column_order + rows_should_match = order_opts.ignore_row_order + + cols_match = base_hash == cols_hash + rows_match = base_hash == rows_hash + + cols_status = "✓" if cols_match == cols_should_match else "✗" + rows_status = "✓" if rows_match == rows_should_match else "✗" + + print( + f" {cols_status} Column order independence: {cols_match} (expected: {cols_should_match})" + ) + print( + f" {rows_status} Row order independence: {rows_match} (expected: {rows_should_match})" + ) - print("\n3. EQUIVALENCE TESTS") + print("\n2. 
CHUNKING INDEPENDENCE WITH ORDER OPTIONS") print("-" * 50) - base_logical = results["Basic table"]["logical"] - base_ipc = results["Basic table"]["ipc"] - - equivalence_tests = [ - ( - "Same table vs reordered columns", - "Reordered columns", - False, - "Different column order should produce different hash", - ), - ( - "Same table vs reordered rows", - "Reordered rows", - False, - "Different row order should produce different hash", - ), - ( - "Same table vs different types", - "Different types", - False, - "Different data types should produce different hash", - ), - ( - "Same table vs different chunking", - "Different chunking", - True, - "Same data with different chunking should produce same hash", - ), - ( - "Same table vs no nulls", - "No nulls", - False, - "Different null patterns should produce different hash", - ), - ( - "Same table vs all nulls", - "All nulls", - False, - "Different data should produce different hash", - ), - ] + original = create_test_table_1() + combined = original.combine_chunks() + different_chunking = create_test_table_different_chunking() + + for config_name, order_opts in order_configs: + orig_hash = serialize_table_logical_hash(original, order_options=order_opts) + comb_hash = serialize_table_logical_hash(combined, order_options=order_opts) + diff_hash = serialize_table_logical_hash( + different_chunking, order_options=order_opts + ) + + chunking_independent = orig_hash == comb_hash == diff_hash + status = "✓" if chunking_independent else "✗" + + print( + f"{status} {config_name:25} | Chunking independent: {chunking_independent}" + ) + + print("\n3. FORMAT VERSION COMPATIBILITY") + print("-" * 50) - for test_name, compare_case, should_match, explanation in equivalence_tests: - if compare_case in results and "logical" in results[compare_case]: - compare_logical = results[compare_case]["logical"] - compare_ipc = results[compare_case]["ipc"] + # Test that different order options produce different hashes when they should + test_table = create_test_table_1() - logical_match = base_logical == compare_logical - ipc_match = base_ipc == compare_ipc + hashes = {} + for config_name, order_opts in order_configs: + hash_value = serialize_table_logical_hash(test_table, order_options=order_opts) + hashes[config_name] = hash_value + print(f"{config_name:25} | {hash_value[:16]}...") - logical_status = "✓" if logical_match == should_match else "✗" - ipc_status = "✓" if ipc_match == should_match else "✗" + # Verify that order-sensitive vs order-independent produce different hashes + default_hash = hashes["Default (order-sensitive)"] + col_indep_hash = hashes["Column-order independent"] + row_indep_hash = hashes["Row-order independent"] + full_indep_hash = hashes["Fully order-independent"] - print(f"{logical_status} {test_name}") - print(f" Logical: {logical_match} (expected: {should_match})") - print(f" IPC: {ipc_match} (expected: {should_match})") - print(f" Reason: {explanation}") - print() + print(f"\nHash uniqueness:") + print(f" Default != Col-independent: {default_hash != col_indep_hash}") + print(f" Default != Row-independent: {default_hash != row_indep_hash}") + print(f" Default != Fully independent: {default_hash != full_indep_hash}") - print("4. CHUNKING INDEPENDENCE DETAILED TEST") + print("\n4. 
CONTENT EQUIVALENCE TEST") print("-" * 50) - # Test various chunking strategies - original_table = create_test_table_1() - combined_table = original_table.combine_chunks() - different_chunking = create_test_table_different_chunking() + # Create tables with same content but different presentation + table_a = pa.table({"col1": pa.array([1, 2, 3]), "col2": pa.array(["a", "b", "c"])}) - orig_logical = serialize_table_logical_hash(original_table) - comb_logical = serialize_table_logical_hash(combined_table) - diff_logical = serialize_table_logical_hash(different_chunking) + table_b = pa.table( + { + "col2": pa.array(["a", "b", "c"]), # Different column order + "col1": pa.array([1, 2, 3]), + } + ) - orig_ipc = hashlib.sha256(serialize_table_ipc(original_table)).hexdigest() - comb_ipc = hashlib.sha256(serialize_table_ipc(combined_table)).hexdigest() - diff_ipc = hashlib.sha256(serialize_table_ipc(different_chunking)).hexdigest() + table_c = pa.table( + { + "col1": pa.array([3, 1, 2]), # Different row order + "col2": pa.array(["c", "a", "b"]), + } + ) - print(f"Original chunking: {orig_logical[:16]}...") - print(f"Combined chunks: {comb_logical[:16]}...") - print(f"Different chunking: {diff_logical[:16]}...") - print( - f"Logical chunking-independent: {orig_logical == comb_logical == diff_logical}" + table_d = pa.table( + { + "col2": pa.array(["c", "a", "b"]), # Both different + "col1": pa.array([3, 1, 2]), + } ) - print() - print(f"Original IPC: {orig_ipc[:16]}...") - print(f"Combined IPC: {comb_ipc[:16]}...") - print(f"Different IPC: {diff_ipc[:16]}...") - print(f"IPC chunking-independent: {orig_ipc == comb_ipc == diff_ipc}") + + full_indep_opts = OrderOptions(True, True) + + hash_a = serialize_table_logical_hash(table_a, order_options=full_indep_opts) + hash_b = serialize_table_logical_hash(table_b, order_options=full_indep_opts) + hash_c = serialize_table_logical_hash(table_c, order_options=full_indep_opts) + hash_d = serialize_table_logical_hash(table_d, order_options=full_indep_opts) + + all_match = hash_a == hash_b == hash_c == hash_d + status = "✓" if all_match else "✗" + + print(f"{status} Content equivalence test:") + print(f" Table A (original): {hash_a[:12]}...") + print(f" Table B (reord cols): {hash_b[:12]}...") + print(f" Table C (reord rows): {hash_c[:12]}...") + print(f" Table D (both reord): {hash_d[:12]}...") + print(f" All hashes match: {all_match}") print("\n5. PERFORMANCE COMPARISON") print("-" * 50) @@ -758,18 +1217,14 @@ def run_comprehensive_tests(): } ) - # Time each method - methods = [ - ("Logical", lambda t: serialize_table_logical_hash(t)), - ("Streaming", lambda t: serialize_table_logical_streaming(t)), - ("IPC", lambda t: hashlib.sha256(serialize_table_ipc(t)).hexdigest()), - ] - hash_result = "" - for method_name, method_func in methods: + # Time each method with different order options + for config_name, order_opts in order_configs: times = [] for _ in range(3): # Run 3 times for average start = time.time() - hash_result = method_func(large_table) + hash_result = serialize_table_logical_hash( + large_table, order_options=order_opts + ) end = time.time() times.append(end - start) @@ -777,43 +1232,12 @@ def run_comprehensive_tests(): throughput = (large_size * 4) / avg_time # 4 columns print( - f"{method_name:10} | {avg_time * 1000:6.1f}ms | {throughput:8.0f} values/sec | {hash_result[:12]}..." + f"{config_name:25} | {avg_time * 1000:6.1f}ms | {throughput:8.0f} values/sec" ) - print("\n6. 
EDGE CASES") - print("-" * 50) - - edge_cases = ["Empty table", "All nulls", "Single column", "Single row"] - for case in edge_cases: - if case in results and "error" not in results[case]: - r = results[case] - print( - f"{case:15} | {r['rows']:3}r x {r['cols']:2}c | " - f"L:{r['logical'][:8]}... | I:{r['ipc'][:8]}... | " - f"Match: {r['logical'] == r['streaming']}" - ) - - print("\n7. COMPLEX TYPES TEST") - print("-" * 50) - - if "Complex types" in results and "error" not in results["Complex types"]: - complex_result = results["Complex types"] - print(f"Complex types serialization successful:") - print(f" Logical hash: {complex_result['logical']}") - print( - f" Streaming ==: {complex_result['logical'] == complex_result['streaming']}" - ) - print(f" Rows/Cols: {complex_result['rows']}r x {complex_result['cols']}c") - else: - print( - "Complex types test failed - this is expected for some complex nested types" - ) - - print(f"\n{'=' * 60}") - print("TEST SUITE COMPLETE") - print(f"{'=' * 60}") - - return results + print(f"\n{'=' * 70}") + print("ORDER-INDEPENDENT SERIALIZATION TEST SUITE COMPLETE") + print(f"{'=' * 70}") # Main execution From dab33787cdd0a88b382a89d462e8ae5d92d71dcb Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 2 Jul 2025 00:56:51 +0000 Subject: [PATCH 041/224] feat: update versioned arrow hasher to use new serialization --- src/orcapod/hashing/object_hashers.py | 21 ++- src/orcapod/hashing/versioned_hashers.py | 5 +- src/orcapod/pipeline/nodes.py | 213 ++++++++++------------- 3 files changed, 110 insertions(+), 129 deletions(-) diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index bdd0169..3401574 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -1,7 +1,6 @@ -from polars import Object -from .types import FunctionInfoExtractor, ObjectHasher -from .legacy_core import legacy_hash -from .hash_utils import hash_object +from orcapod.hashing.types import FunctionInfoExtractor, ObjectHasher +from orcapod.hashing import legacy_core +from orcapod.hashing import hash_utils class BasicObjectHasher(ObjectHasher): @@ -30,7 +29,9 @@ def hash(self, obj: object) -> bytes: Returns: bytes: The byte representation of the hash. """ - return hash_object(obj, function_info_extractor=self.function_info_extractor) + return hash_utils.hash_object( + obj, function_info_extractor=self.function_info_extractor + ) class LegacyObjectHasher(ObjectHasher): @@ -54,6 +55,12 @@ def __init__( """ self.function_info_extractor = function_info_extractor + def get_hasher_id(self) -> str: + """ + Returns a unique identifier/name assigned to the hasher + """ + return "legacy_object_hasher" + def hash(self, obj: object) -> bytes: """ Hash an object to a byte representation. @@ -64,4 +71,6 @@ def hash(self, obj: object) -> bytes: Returns: bytes: The byte representation of the hash. """ - return legacy_hash(obj, function_info_extractor=self.function_info_extractor) + return legacy_core.legacy_hash( + obj, function_info_extractor=self.function_info_extractor + ) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index e6095a0..d2fec4d 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -1,6 +1,6 @@ # A collection of versioned hashers that provide a "default" implementation of hashers. 
from .arrow_hashers import SemanticArrowHasher -from .types import ObjectHasher, ArrowHasher +from .types import ObjectHasher import importlib from typing import Any @@ -13,6 +13,7 @@ "hasher_id": "arrow_v0.1", "hash_algorithm": "sha256", "chunk_size": 8192, + "serialization_method": "logical", "semantic_type_hashers": { "path": { "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", @@ -65,7 +66,7 @@ def parse_objectspec(obj_spec: dict) -> Any: def get_versioned_semantic_arrow_hasher( version: str | None = None, -) -> ArrowHasher: +) -> SemanticArrowHasher: """ Get the versioned hasher for the specified version. diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index b5bd54e..fabb664 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,19 +1,16 @@ from orcapod.core.pod import Pod, FunctionPod from orcapod.core import SyncStream, Source, Kernel +from orcapod.core.streams import PolarsStream +from orcapod.core.streams import EmptyStream from orcapod.stores import ArrowDataStore from orcapod.types import Tag, Packet, PacketLike, TypeSpec, default_registry -from orcapod.types.typespec_utils import ( - get_typespec_from_dict, - union_typespecs, - extract_function_typespecs, -) +from orcapod.types.typespec_utils import union_typespecs from orcapod.types.semantic_type_registry import SemanticTypeRegistry from orcapod.types import packets, schemas from orcapod.hashing import ObjectHasher, ArrowHasher from orcapod.hashing.defaults import get_default_object_hasher, get_default_arrow_hasher from typing import Any, Literal from collections.abc import Collection, Iterator -import pyarrow as pa import polars as pl from orcapod.core.streams import SyncStreamFromGenerator @@ -26,91 +23,6 @@ def get_tag_typespec(tag: Tag) -> dict[str, type]: return {k: str for k in tag} -class PolarsSource(Source): - def __init__( - self, - df: pl.DataFrame, - tag_keys: Collection[str], - packet_keys: Collection[str] | None = None, - ): - self.df = df - self.tag_keys = tag_keys - self.packet_keys = packet_keys - - def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: - if len(streams) != 0: - raise ValueError( - "PolarsSource does not support forwarding streams. " - "It generates its own stream from the DataFrame." 
- ) - return PolarsStream(self.df, self.tag_keys, self.packet_keys) - - -class PolarsStream(SyncStream): - def __init__( - self, - df: pl.DataFrame, - tag_keys: Collection[str], - packet_keys: Collection[str] | None = None, - ): - self.df = df - self.tag_keys = tuple(tag_keys) - self.packet_keys = tuple(packet_keys) if packet_keys is not None else None - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - df = self.df - # if self.packet_keys is not None: - # df = df.select(self.tag_keys + self.packet_keys) - for row in df.iter_rows(named=True): - tag = {key: row[key] for key in self.tag_keys} - packet = { - key: val - for key, val in row.items() - if key not in self.tag_keys and not key.startswith("_source_info_") - } - # TODO: revisit and fix this rather hacky implementation - source_info = { - key.removeprefix("_source_info_"): val - for key, val in row.items() - if key.startswith("_source_info_") - } - yield tag, Packet(packet, source_info=source_info) - - -class EmptyStream(SyncStream): - def __init__( - self, - tag_keys: Collection[str] | None = None, - packet_keys: Collection[str] | None = None, - tag_typespec: TypeSpec | None = None, - packet_typespec: TypeSpec | None = None, - ): - if tag_keys is None and tag_typespec is not None: - tag_keys = tag_typespec.keys() - self.tag_keys = list(tag_keys) if tag_keys else [] - - if packet_keys is None and packet_typespec is not None: - packet_keys = packet_typespec.keys() - self.packet_keys = list(packet_keys) if packet_keys else [] - - self.tag_typespec = tag_typespec - self.packet_typespec = packet_typespec - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - return self.tag_keys, self.packet_keys - - def types( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - return self.tag_typespec, self.packet_typespec - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - # Empty stream, no data to yield - return iter([]) - - class KernelInvocationWrapper(Kernel): def __init__( self, kernel: Kernel, input_streams: Collection[SyncStream], **kwargs @@ -119,10 +31,10 @@ def __init__( self.kernel = kernel self.input_streams = list(input_streams) - def __repr__(self): + def __repr__(self) -> str: return f"{self.__class__.__name__}<{self.kernel!r}>" - def __str__(self): + def __str__(self) -> str: return f"{self.__class__.__name__}<{self.kernel}>" def computed_label(self) -> str | None: @@ -187,7 +99,7 @@ def __init__( kernel: Kernel, input_streams: Collection[SyncStream], output_store: ArrowDataStore, - store_path_prefix: tuple[str, ...] | None = None, + store_path_prefix: tuple[str, ...] 
= (), kernel_hasher: ObjectHasher | None = None, arrow_packet_hasher: ArrowHasher | None = None, packet_type_registry: SemanticTypeRegistry | None = None, @@ -196,7 +108,7 @@ def __init__( super().__init__(kernel, input_streams, **kwargs) self.output_store = output_store - self.store_path_prefix = store_path_prefix or () + self.store_path_prefix = store_path_prefix # These are configurable but are not expected to be modified except for special circumstances if kernel_hasher is None: @@ -235,10 +147,27 @@ def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): # hasher changed -- trigger recomputation of properties that depend on kernel hasher self.update_cached_values() + @property + def source_info(self) -> tuple[str, ...]: + """ + Returns a tuple of (label, kernel_hash) that uniquely identifies the source of the cached outputs. + This is used to store and retrieve the outputs from the output store. + """ + return self.label, self.kernel_hasher.hash_to_hex( + self.kernel, prefix_hasher_id=True + ) + + @property + def store_path(self) -> tuple[str, ...]: + """ + Returns the path prefix for the output store. + This is used to store and retrieve the outputs from the output store. + """ + return self.store_path_prefix + self.source_info + def update_cached_values(self): - self.source_info = self.store_path_prefix + ( - self.label, - self.kernel_hasher.hash_to_hex(self.kernel, prefix_hasher_id=True), + self.kernel_hash = self.kernel_hasher.hash_to_hex( + self.kernel, prefix_hasher_id=True ) self.tag_keys, self.packet_keys = self.keys(trigger_run=False) self.tag_typespec, self.packet_typespec = self.types(trigger_run=False) @@ -269,7 +198,6 @@ def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: raise ValueError( "CachedKernelWrapper has no tag keys defined, cannot return PolarsStream" ) - source_info_sig = ":".join(self.source_info) return PolarsStream( self.df, tag_keys=self.tag_keys, packet_keys=self.packet_keys ) @@ -304,9 +232,9 @@ def post_call(self, tag: Tag, packet: Packet) -> None: ) # TODO: revisit this logic output_id = self.arrow_hasher.hash_table(output_table, prefix_hasher_id=True) - if not self.output_store.get_record(self.source_info, output_id): + if not self.output_store.get_record(self.store_path, output_id): self.output_store.add_record( - self.source_info, + self.store_path, output_id, output_table, ) @@ -320,7 +248,7 @@ def output_iterator_completion_hook(self) -> None: @property def lazy_df(self) -> pl.LazyFrame | None: - return self.output_store.get_all_records_as_polars(self.source_info) + return self.output_store.get_all_records_as_polars(self.store_path) @property def df(self) -> pl.DataFrame | None: @@ -378,7 +306,9 @@ def __init__( output_store: ArrowDataStore, tag_store: ArrowDataStore | None = None, label: str | None = None, - store_path_prefix: tuple[str, ...] | None = None, + store_path_prefix: tuple[str, ...] = (), + output_store_path_prefix: tuple[str, ...] = (), + tag_store_path_prefix: tuple[str, ...] 
= (), skip_memoization_lookup: bool = False, skip_memoization: bool = False, skip_tag_record: bool = False, @@ -395,7 +325,9 @@ def __init__( error_handling=error_handling, **kwargs, ) - self.store_path_prefix = store_path_prefix or () + self.output_store_path_prefix = store_path_prefix + output_store_path_prefix + self.tag_store_path_prefix = store_path_prefix + tag_store_path_prefix + self.output_store = output_store self.tag_store = tag_store @@ -419,6 +351,18 @@ def __init__( self.update_cached_values() self._cache_computed = False + @property + def tag_keys(self) -> tuple[str, ...]: + if self._tag_keys is None: + raise ValueError("Tag keys are not set, cannot return tag keys") + return self._tag_keys + + @property + def output_keys(self) -> tuple[str, ...]: + if self._output_keys is None: + raise ValueError("Output keys are not set, cannot return output keys") + return self._output_keys + @property def object_hasher(self) -> ObjectHasher: return self._object_hasher @@ -459,17 +403,19 @@ def update_cached_values(self) -> None: self.function_pod_hash = self.object_hasher.hash_to_hex( self.function_pod, prefix_hasher_id=True ) + self.node_hash = self.object_hasher.hash_to_hex(self, prefix_hasher_id=True) self.input_typespec, self.output_typespec = ( self.function_pod.get_function_typespecs() ) - self.tag_keys, self.output_keys = self.keys(trigger_run=False) + tag_keys, output_keys = self.keys(trigger_run=False) - if self.tag_keys is None or self.output_keys is None: + if tag_keys is None or output_keys is None: raise ValueError( "Currently, cached function pod wrapper can only work with function pods that have keys defined." ) - self.tag_keys = tuple(self.tag_keys) - self.output_keys = tuple(self.output_keys) + self._tag_keys = tuple(tag_keys) + self._output_keys = tuple(output_keys) + self.tag_typespec, self.output_typespec = self.types(trigger_run=False) if self.tag_typespec is None or self.output_typespec is None: raise ValueError( @@ -532,9 +478,29 @@ def get_packet_key(self, packet: Packet) -> str: ) @property - def source_info(self): + def pod_source_info(self): return self.function_pod.function_name, self.function_pod_hash + @property + def node_source_info(self): + return self.label, self.node_hash + + @property + def output_store_path(self) -> tuple[str, ...]: + """ + Returns the path prefix for the output store. + This is used to store and retrieve the outputs from the output store. + """ + return self.output_store_path_prefix + self.pod_source_info + + @property + def tag_store_path(self) -> tuple[str, ...]: + """ + Returns the path prefix for the tag store. + This is used to store and retrieve the tags associated with memoized packets. 
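        For example (illustrative values): with store_path_prefix=("runs",), a node
        labeled "my_pod" and empty sub-prefixes, tags land under
        ("runs", "my_pod", node_hash) while outputs land under
        ("runs", function_name, function_pod_hash), mirroring the node_source_info
        and pod_source_info properties above.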
+ """ + return self.tag_store_path_prefix + self.node_source_info + def is_memoized(self, packet: Packet) -> bool: return self.retrieve_memoized(packet) is not None @@ -566,9 +532,9 @@ def _add_pipeline_record_with_packet_key( # TODO: add error handling # check if record already exists: - retrieved_table = self.tag_store.get_record(self.source_info, entry_hash) + retrieved_table = self.tag_store.get_record(self.tag_store_path, entry_hash) if retrieved_table is None: - self.tag_store.add_record(self.source_info, entry_hash, table) + self.tag_store.add_record(self.tag_store_path, entry_hash, table) return tag @@ -587,7 +553,7 @@ def _retrieve_memoized_with_packet_key(self, packet_key: str) -> Packet | None: """ logger.debug(f"Retrieving memoized packet with key {packet_key}") arrow_table = self.output_store.get_record( - self.source_info, + self.output_store_path, packet_key, ) if arrow_table is None: @@ -625,7 +591,7 @@ def _memoize_with_packet_key( # consider simpler alternative packets = self.output_converter.from_arrow_table_to_python_packets( self.output_store.add_record( - self.source_info, + self.output_store_path, packet_key, self.output_converter.from_python_packet_to_arrow_table(output_packet), ) @@ -668,7 +634,7 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: # e.g. if the output is a file, the path may be changed # add source info to the output packet source_info = { - k: "-".join(self.source_info) + "-" + packet_key + k: "-".join(self.pod_source_info) + "-" + packet_key + ":" + str(k) for k in output_packet.source_info } # TODO: fix and make this not access protected field directly @@ -691,12 +657,12 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: return tag, output_packet def get_all_outputs(self) -> pl.LazyFrame | None: - return self.output_store.get_all_records_as_polars(self.source_info) + return self.output_store.get_all_records_as_polars(self.output_store_path) def get_all_tags(self, with_packet_id: bool = False) -> pl.LazyFrame | None: if self.tag_store is None: raise ValueError("Tag store is not set, no tag record can be retrieved") - data = self.tag_store.get_all_records_as_polars(self.source_info) + data = self.tag_store.get_all_records_as_polars(self.tag_store_path) if not with_packet_id: return data.drop("__packet_key") if data is not None else None return data @@ -711,11 +677,11 @@ def get_all_entries_with_tags( if self.tag_store is None: raise ValueError("Tag store is not set, no tag record can be retrieved") - tag_records = self.tag_store.get_all_records_as_polars(self.source_info) + tag_records = self.tag_store.get_all_records_as_polars(self.tag_store_path) if tag_records is None: return None result_packets = self.output_store.get_records_by_ids_as_polars( - self.source_info, + self.output_store_path, tag_records.collect()["__packet_key"], preserve_input_order=True, ) @@ -790,14 +756,19 @@ class DummyCachedFunctionPod(CachedFunctionPodWrapper): """ def __init__(self, source_pod: CachedFunctionPodWrapper): - self._source_info = source_pod.source_info + self._pod_source_info = source_pod.pod_source_info + self._node_source_info = source_pod.node_source_info self.output_store = source_pod.output_store self.tag_store = source_pod.tag_store self.function_pod = DummyFunctionPod(source_pod.function_pod.function_name) @property - def source_info(self) -> tuple[str, str]: - return self._source_info + def pod_source_info(self) -> tuple[str, str]: + return self._pod_source_info + + @property + def node_source_info(self) -> 
tuple[str, str]: + return self._node_source_info class Node(KernelInvocationWrapper, Source): From 4f079270a136656eca3f99bc6577d1d35b28e156 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 2 Jul 2025 00:57:10 +0000 Subject: [PATCH 042/224] wip: delta table store implementation --- .../stores/delta_table_arrow_data_store.py | 683 +++++++++++++++--- 1 file changed, 598 insertions(+), 85 deletions(-) diff --git a/src/orcapod/stores/delta_table_arrow_data_store.py b/src/orcapod/stores/delta_table_arrow_data_store.py index c05dea9..e5ddfb9 100644 --- a/src/orcapod/stores/delta_table_arrow_data_store.py +++ b/src/orcapod/stores/delta_table_arrow_data_store.py @@ -1,10 +1,14 @@ import pyarrow as pa +import pyarrow.compute as pc import polars as pl from pathlib import Path -from typing import Any, Union +from typing import Any, Dict, List import logging from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError +import threading +from collections import defaultdict +import json # Module-level logger logger = logging.getLogger(__name__) @@ -12,7 +16,7 @@ class DeltaTableArrowDataStore: """ - Delta Table-based Arrow data store with flexible hierarchical path support. + Delta Table-based Arrow data store with flexible hierarchical path support and schema preservation. Uses tuple-based source paths for robust parameter handling: - ("source_name", "source_id") -> source_name/source_id/ @@ -26,6 +30,8 @@ def __init__( duplicate_entry_behavior: str = "error", create_base_path: bool = True, max_hierarchy_depth: int = 10, + batch_size: int = 100, + auto_flush_interval: float = 300.0, # 5 minutes ): """ Initialize the DeltaTableArrowDataStore. @@ -37,6 +43,8 @@ def __init__( - 'overwrite': Replace existing entry with new data create_base_path: Whether to create the base path if it doesn't exist max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) + batch_size: Number of records to batch before writing to Delta table + auto_flush_interval: Time in seconds to auto-flush pending batches (0 to disable) """ # Validate duplicate behavior if duplicate_entry_behavior not in ["error", "overwrite"]: @@ -45,6 +53,8 @@ def __init__( self.duplicate_entry_behavior = duplicate_entry_behavior self.base_path = Path(base_path) self.max_hierarchy_depth = max_hierarchy_depth + self.batch_size = batch_size + self.auto_flush_interval = auto_flush_interval if create_base_path: self.base_path.mkdir(parents=True, exist_ok=True) @@ -56,11 +66,55 @@ def __init__( # Cache for Delta tables to avoid repeated initialization self._delta_table_cache: dict[str, DeltaTable] = {} + # Cache for original schemas (without __entry_id column) + self._schema_cache: dict[str, pa.Schema] = {} + + # Batch management + self._pending_batches: Dict[str, List[pa.Table]] = defaultdict(list) + self._batch_lock = threading.Lock() + + # Auto-flush timer + self._flush_timer = None + # if auto_flush_interval > 0: + # self._start_auto_flush_timer() + logger.info( f"Initialized DeltaTableArrowDataStore at {self.base_path} " - f"with duplicate_entry_behavior='{duplicate_entry_behavior}'" + f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " + f"batch_size={batch_size}, auto_flush_interval={auto_flush_interval}s" ) + def _start_auto_flush_timer(self): + """Start the auto-flush timer.""" + if self._flush_timer: + self._flush_timer.cancel() + + if self.auto_flush_interval > 0: + self._flush_timer = threading.Timer( + self.auto_flush_interval, self._auto_flush + ) + 
self._flush_timer.daemon = True + self._flush_timer.start() + + def _auto_flush(self): + """Auto-flush all pending batches.""" + try: + print("Flushing!", flush=True) + self.flush_all_batches() + except Exception as e: + logger.error(f"Error during auto-flush: {e}") + finally: + self._start_auto_flush_timer() + + def __del__(self): + """Cleanup when object is destroyed.""" + try: + if self._flush_timer: + self._flush_timer.cancel() + self.flush_all_batches() + except Exception: + pass # Ignore errors during cleanup + def _validate_source_path(self, source_path: tuple[str, ...]) -> None: """ Validate source path components. @@ -104,6 +158,204 @@ def _get_table_path(self, source_path: tuple[str, ...]) -> Path: path = path / component return path + def _get_schema_metadata_path(self, source_path: tuple[str, ...]) -> Path: + """Get the path for storing original schema metadata.""" + table_path = self._get_table_path(source_path) + return table_path / "_original_schema.json" + + def _save_original_schema( + self, source_path: tuple[str, ...], schema: pa.Schema + ) -> None: + """Save the original schema (without __entry_id) to metadata file.""" + source_key = self._get_source_key(source_path) + + # Cache the schema + self._schema_cache[source_key] = schema + + try: + # Save to file as well for persistence + schema_path = self._get_schema_metadata_path(source_path) + schema_path.parent.mkdir(parents=True, exist_ok=True) + + # Convert schema to JSON-serializable format + def convert_metadata(metadata): + """Convert Arrow metadata (bytes keys/values) to JSON-safe format.""" + if metadata is None: + return None + result = {} + for key, value in metadata.items(): + # Convert bytes keys and values to strings + str_key = ( + key.decode("utf-8") if isinstance(key, bytes) else str(key) + ) + str_value = ( + value.decode("utf-8") + if isinstance(value, bytes) + else str(value) + ) + result[str_key] = str_value + return result + + schema_dict = { + "fields": [ + { + "name": field.name, + "type": str(field.type), + "nullable": field.nullable, + "metadata": convert_metadata(field.metadata), + } + for field in schema + ], + "metadata": convert_metadata(schema.metadata), + } + + with open(schema_path, "w") as f: + json.dump(schema_dict, f, indent=2) + + except Exception as e: + logger.warning(f"Could not save schema metadata for {source_key}: {e}") + + def _load_original_schema(self, source_path: tuple[str, ...]) -> pa.Schema | None: + """Load the original schema from cache or metadata file.""" + source_key = self._get_source_key(source_path) + + # Check cache first + if source_key in self._schema_cache: + return self._schema_cache[source_key] + + # Try to load from file + try: + schema_path = self._get_schema_metadata_path(source_path) + if not schema_path.exists(): + return None + + with open(schema_path, "r") as f: + schema_dict = json.load(f) + + # Reconstruct schema from JSON + def convert_metadata_back(metadata_dict): + """Convert JSON metadata back to Arrow format (bytes keys/values).""" + if metadata_dict is None: + return None + result = {} + for key, value in metadata_dict.items(): + # Convert string keys and values back to bytes + bytes_key = key.encode("utf-8") + bytes_value = ( + value.encode("utf-8") + if isinstance(value, str) + else str(value).encode("utf-8") + ) + result[bytes_key] = bytes_value + return result + + fields = [] + for field_dict in schema_dict["fields"]: + # Parse the type string back to Arrow type + type_str = field_dict["type"] + arrow_type = 
self._parse_arrow_type_string(type_str) + + metadata = convert_metadata_back(field_dict.get("metadata")) + + field = pa.field( + field_dict["name"], + arrow_type, + nullable=field_dict["nullable"], + metadata=metadata, + ) + fields.append(field) + + schema_metadata = convert_metadata_back(schema_dict.get("metadata")) + + schema = pa.schema(fields, metadata=schema_metadata) + + # Cache it + self._schema_cache[source_key] = schema + return schema + + except Exception as e: + logger.warning(f"Could not load schema metadata for {source_key}: {e}") + return None + + def _parse_arrow_type_string(self, type_str: str) -> pa.DataType: + """Parse Arrow type string back to Arrow type object.""" + # This is a simplified parser for common types + # You might need to extend this for more complex types + type_str = type_str.strip() + + # Handle basic types + if type_str == "int64": + return pa.int64() + elif type_str == "int32": + return pa.int32() + elif type_str == "float64": + return pa.float64() + elif type_str == "float32": + return pa.float32() + elif type_str == "bool": + return pa.bool_() + elif type_str == "string": + return pa.string() + elif type_str == "large_string": + return pa.large_string() + elif type_str == "binary": + return pa.binary() + elif type_str == "large_binary": + return pa.large_binary() + elif type_str.startswith("timestamp"): + # Extract timezone if present + if "[" in type_str and "]" in type_str: + tz = type_str.split("[")[1].split("]")[0] + if tz == "UTC": + tz = "UTC" + return pa.timestamp("us", tz=tz) + else: + return pa.timestamp("us") + elif type_str.startswith("list<"): + # Parse list type + inner_type_str = type_str[5:-1] # Remove 'list<' and '>' + inner_type = self._parse_arrow_type_string(inner_type_str) + return pa.list_(inner_type) + else: + # Fallback to string for unknown types + logger.warning(f"Unknown Arrow type string: {type_str}, using string") + return pa.string() + + def _get_or_create_delta_table( + self, source_path: tuple[str, ...] + ) -> DeltaTable | None: + """ + Get or create a Delta table, handling schema initialization properly. + + Args: + source_path: Tuple of path components + + Returns: + DeltaTable instance or None if table doesn't exist + """ + source_key = self._get_source_key(source_path) + table_path = self._get_table_path(source_path) + + # Check cache first + if source_key in self._delta_table_cache: + return self._delta_table_cache[source_key] + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + self._delta_table_cache[source_key] = delta_table + logger.debug(f"Loaded existing Delta table for {source_key}") + return delta_table + except TableNotFoundError: + # Table doesn't exist + return None + except Exception as e: + logger.error(f"Error loading Delta table for {source_key}: {e}") + # Try to clear any corrupted cache and retry once + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + return None + def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: """Ensure the table has an __entry_id column.""" if "__entry_id" not in arrow_data.column_names: @@ -150,21 +402,199 @@ def _handle_entry_id_column( # If add_entry_id_column is True, keep __entry_id as is return arrow_data + def _create_entry_id_filter(self, entry_id: str) -> list: + """ + Create a proper filter expression for Delta Lake. 
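        For example, entry_id "abc" becomes [("__entry_id", "=", "abc")], the
        (column, op, value) tuple form that the read helpers in this class unpack
        when building filters.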
+ + Args: + entry_id: The entry ID to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "=", entry_id)] + + def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: + """ + Create a proper filter expression for multiple entry IDs. + + Args: + entry_ids: List of entry IDs to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "in", entry_ids)] + + def _read_table_with_schema_preservation( + self, + delta_table: DeltaTable, + source_path: tuple[str, ...], + filters: list = None, + ) -> pa.Table: + """ + Read table using to_pyarrow_dataset with original schema preservation. + + Args: + delta_table: The Delta table to read from + source_path: Source path for schema lookup + filters: Optional filters to apply + + Returns: + Arrow table with preserved schema + """ + try: + # Get the original schema (without __entry_id) + original_schema = self._load_original_schema(source_path) + + if original_schema is not None: + # Create target schema with __entry_id column + entry_id_field = pa.field( + "__entry_id", pa.large_string(), nullable=False + ) + target_schema = pa.schema([entry_id_field] + list(original_schema)) + + # Use to_pyarrow_dataset with the target schema + dataset = delta_table.to_pyarrow_dataset(schema=target_schema) + if filters: + # Apply filters at dataset level for better performance + import pyarrow.compute as pc + + filter_expr = None + for filt in filters: + if len(filt) == 3: + col, op, val = filt + if op == "=": + expr = pc.equal(pc.field(col), pa.scalar(val)) + elif op == "in": + expr = pc.is_in(pc.field(col), pa.array(val)) + else: + # Fallback to table-level filtering + return delta_table.to_pyarrow_table(filters=filters) + + if filter_expr is None: + filter_expr = expr + else: + filter_expr = pc.and_(filter_expr, expr) + + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) + + return dataset.to_table() + else: + # Fallback to regular method if no schema found + logger.warning( + f"No original schema found for {'/'.join(source_path)}, using fallback" + ) + return delta_table.to_pyarrow_table(filters=filters) + + except Exception as e: + logger.warning( + f"Error reading with schema preservation: {e}, falling back to regular method" + ) + return delta_table.to_pyarrow_table(filters=filters) + + def _flush_batch(self, source_path: tuple[str, ...]) -> None: + """ + Flush pending batch for a specific source path. 
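        Pending tables for the source are drained under the batch lock, combined with
        pa.concat_tables, and written via write_deltalake: mode="overwrite" when the
        Delta table does not exist yet, otherwise mode="append" with
        schema_mode="merge" (after deleting rows with matching __entry_id when
        duplicate_entry_behavior is "overwrite"). If the write fails, the drained
        tables are put back on the pending queue and the error is re-raised.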
+ + Args: + source_path: Tuple of path components + """ + print("Flushing triggered!!", flush=True) + source_key = self._get_source_key(source_path) + + with self._batch_lock: + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = [] + + if not pending_tables: + return + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables) + + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_or_create_delta_table(source_path) + + if delta_table is None: + # Create new table - save original schema first + original_schema = self._remove_entry_id_column(combined_table).schema + self._save_original_schema(source_path, original_schema) + + write_deltalake(str(table_path), combined_table, mode="overwrite") + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + # Handle duplicates if needed + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + entry_ids = combined_table.column("__entry_id").to_pylist() + unique_entry_ids = list(set(entry_ids)) + + # Delete existing records with these IDs + if unique_entry_ids: + entry_ids_str = "', '".join(unique_entry_ids) + delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # Append new records + write_deltalake( + str(table_path), combined_table, mode="append", schema_mode="merge" + ) + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + with self._batch_lock: + self._pending_batches[source_key] = ( + pending_tables + self._pending_batches[source_key] + ) + raise + def add_record( self, source_path: tuple[str, ...], entry_id: str, arrow_data: pa.Table, ignore_duplicate: bool = False, + force_flush: bool = False, ) -> pa.Table: """ - Add a record to the Delta table. + Add a record to the Delta table (batched). 
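        A minimal usage sketch (base path, source path, entry id and table contents
        are illustrative):

            store = DeltaTableArrowDataStore("/tmp/orcapod_store", batch_size=2)
            data = pa.table({"x": [1, 2, 3]})
            store.add_record(("org", "dataset"), "entry-001", data)
            store.flush_all_batches()  # or pass force_flush=True to write immediately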
Args: source_path: Tuple of path components (e.g., ("org", "project", "dataset")) entry_id: Unique identifier for this record arrow_data: The Arrow table data to store ignore_duplicate: If True, ignore duplicate entry error + force_flush: If True, immediately flush this record to disk Returns: The Arrow table data that was stored @@ -173,18 +603,15 @@ def add_record( ValueError: If entry_id already exists and duplicate_entry_behavior is 'error' """ self._validate_source_path(source_path) - - table_path = self._get_table_path(source_path) source_key = self._get_source_key(source_path) - # Ensure directory exists - table_path.mkdir(parents=True, exist_ok=True) - - # Add entry_id column to the data - data_with_entry_id = self._ensure_entry_id_column(arrow_data, entry_id) - - # Check for existing entry if needed - if not ignore_duplicate and self.duplicate_entry_behavior == "error": + # Check for existing entry if needed (only for immediate duplicates, not batch) + if ( + not ignore_duplicate + and self.duplicate_entry_behavior == "error" + and not force_flush + ): + # Only check existing table, not pending batch for performance existing_record = self.get_record(source_path, entry_id) if existing_record is not None: raise ValueError( @@ -192,60 +619,121 @@ def add_record( f"Use duplicate_entry_behavior='overwrite' to allow updates." ) - try: - # Try to load existing table - delta_table = DeltaTable(str(table_path)) + # Save original schema if this is the first record for this source + if source_key not in self._schema_cache: + self._save_original_schema(source_path, arrow_data.schema) - if self.duplicate_entry_behavior == "overwrite": - # Delete existing record if it exists, then append new one - try: - # First, delete existing record with this entry_id - delta_table.delete(f"__entry_id = '{entry_id}'") - logger.debug( - f"Deleted existing record {entry_id} from {source_key}" - ) - except Exception as e: - # If delete fails (e.g., record doesn't exist), that's fine - logger.debug(f"No existing record to delete for {entry_id}: {e}") + # Add entry_id column to the data + data_with_entry_id = self._ensure_entry_id_column(arrow_data, entry_id) - # Append new record - write_deltalake( - str(table_path), data_with_entry_id, mode="append", schema_mode="merge" - ) + if force_flush: + # Write immediately + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_or_create_delta_table(source_path) + + if delta_table is None: + # Create new table - save original schema first + self._save_original_schema(source_path, arrow_data.schema) + write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + else: + if self.duplicate_entry_behavior == "overwrite": + try: + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + logger.debug( + f"Deleted existing record {entry_id} from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing record to delete for {entry_id}: {e}" + ) + + write_deltalake( + str(table_path), + data_with_entry_id, + mode="append", + schema_mode="merge", + ) - except TableNotFoundError: - # Table doesn't exist, create it - write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") - logger.debug(f"Created new Delta table for {source_key}") + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + else: + # Add to batch + with self._batch_lock: + 
self._pending_batches[source_key].append(data_with_entry_id) + batch_size = len(self._pending_batches[source_key]) - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + # Check if we need to flush + if batch_size >= self.batch_size: + self._flush_batch(source_path) logger.debug(f"Added record {entry_id} to {source_key}") return arrow_data + def flush_batch(self, source_path: tuple[str, ...]) -> None: + """ + Manually flush pending batch for a specific source path. + + Args: + source_path: Tuple of path components + """ + self._flush_batch(source_path) + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + with self._batch_lock: + source_keys = list(self._pending_batches.keys()) + + for source_key in source_keys: + source_path = tuple(source_key.split("/")) + try: + self._flush_batch(source_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + + def get_pending_batch_info(self) -> Dict[str, int]: + """ + Get information about pending batches. + + Returns: + Dictionary mapping source keys to number of pending records + """ + with self._batch_lock: + return { + source_key: len(tables) + for source_key, tables in self._pending_batches.items() + if tables + } + def get_record( self, source_path: tuple[str, ...], entry_id: str ) -> pa.Table | None: """ - Get a specific record by entry_id. + Get a specific record by entry_id with schema preservation. Args: source_path: Tuple of path components entry_id: Unique identifier for the record Returns: - Arrow table for the record, or None if not found + Arrow table for the record with original schema, or None if not found """ self._validate_source_path(source_path) - table_path = self._get_table_path(source_path) + delta_table = self._get_or_create_delta_table(source_path) + if delta_table is None: + return None try: - delta_table = DeltaTable(str(table_path)) - - # Query for the specific entry_id - result = delta_table.to_pyarrow_table(filter=f"__entry_id = '{entry_id}'") + # Use schema-preserving read + filter_expr = self._create_entry_id_filter(entry_id) + result = self._read_table_with_schema_preservation( + delta_table, source_path, filters=filter_expr + ) if len(result) == 0: return None @@ -253,19 +741,17 @@ def get_record( # Remove the __entry_id column before returning return self._remove_entry_id_column(result) - except TableNotFoundError: - return None except Exception as e: logger.error( f"Error getting record {entry_id} from {'/'.join(source_path)}: {e}" ) - return None + raise e def get_all_records( self, source_path: tuple[str, ...], add_entry_id_column: bool | str = False ) -> pa.Table | None: """ - Retrieve all records for a given source path as a single table. + Retrieve all records for a given source path as a single table with schema preservation. 
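+
+        Records still sitting in an unflushed pending batch are not included;
+        call flush_batch(source_path) first if those must be visible.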
Args: source_path: Tuple of path components @@ -275,15 +761,17 @@ def get_all_records( - str: Include entry ID column with custom name Returns: - Arrow table containing all records, or None if no records found + Arrow table containing all records with original schema, or None if no records found """ self._validate_source_path(source_path) - table_path = self._get_table_path(source_path) + delta_table = self._get_or_create_delta_table(source_path) + if delta_table is None: + return None try: - delta_table = DeltaTable(str(table_path)) - result = delta_table.to_pyarrow_table() + # Use schema-preserving read + result = self._read_table_with_schema_preservation(delta_table, source_path) if len(result) == 0: return None @@ -291,8 +779,6 @@ def get_all_records( # Handle entry_id column based on parameter return self._handle_entry_id_column(result, add_entry_id_column) - except TableNotFoundError: - return None except Exception as e: logger.error(f"Error getting all records from {'/'.join(source_path)}: {e}") return None @@ -322,7 +808,7 @@ def get_records_by_ids( preserve_input_order: bool = False, ) -> pa.Table | None: """ - Retrieve records by entry IDs as a single table. + Retrieve records by entry IDs as a single table with schema preservation. Args: source_path: Tuple of path components @@ -331,7 +817,7 @@ def get_records_by_ids( preserve_input_order: If True, return results in input order with nulls for missing Returns: - Arrow table containing all found records, or None if no records found + Arrow table containing all found records with original schema, or None if no records found """ self._validate_source_path(source_path) @@ -353,16 +839,16 @@ def get_records_by_ids( f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" ) - table_path = self._get_table_path(source_path) + delta_table = self._get_or_create_delta_table(source_path) + if delta_table is None: + return None try: - delta_table = DeltaTable(str(table_path)) - - # Create filter for the entry IDs - escape single quotes in IDs - escaped_ids = [id_.replace("'", "''") for id_ in entry_ids_list] - id_filter = " OR ".join([f"__entry_id = '{id_}'" for id_ in escaped_ids]) - - result = delta_table.to_pyarrow_table(filter=id_filter) + # Use schema-preserving read with filters + filter_expr = self._create_entry_ids_filter(entry_ids_list) + result = self._read_table_with_schema_preservation( + delta_table, source_path, filters=filter_expr + ) if len(result) == 0: return None @@ -383,8 +869,6 @@ def get_records_by_ids( # Handle entry_id column based on parameter return self._handle_entry_id_column(result, add_entry_id_column) - except TableNotFoundError: - return None except Exception as e: logger.error( f"Error getting records by IDs from {'/'.join(source_path)}: {e}" @@ -462,6 +946,9 @@ def delete_source(self, source_path: tuple[str, ...]) -> bool: """ self._validate_source_path(source_path) + # Flush any pending batches first + self._flush_batch(source_path) + table_path = self._get_table_path(source_path) source_key = self._get_source_key(source_path) @@ -469,9 +956,11 @@ def delete_source(self, source_path: tuple[str, ...]) -> bool: return False try: - # Remove from cache + # Remove from caches if source_key in self._delta_table_cache: del self._delta_table_cache[source_key] + if source_key in self._schema_cache: + del self._schema_cache[source_key] # Remove directory import shutil @@ -498,21 +987,26 @@ def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: """ 
self._validate_source_path(source_path) - table_path = self._get_table_path(source_path) + # Flush any pending batches first + self._flush_batch(source_path) - try: - delta_table = DeltaTable(str(table_path)) + delta_table = self._get_or_create_delta_table(source_path) + if delta_table is None: + return False - # Check if record exists - escaped_entry_id = entry_id.replace("'", "''") - existing = delta_table.to_pyarrow_table( - filter=f"__entry_id = '{escaped_entry_id}'" + try: + # Check if record exists using proper filter + filter_expr = self._create_entry_id_filter(entry_id) + existing = self._read_table_with_schema_preservation( + delta_table, source_path, filters=filter_expr ) if len(existing) == 0: return False - # Delete the record - delta_table.delete(f"__entry_id = '{escaped_entry_id}'") + # Delete the record using SQL-style predicate (this is correct for delete operations) + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) # Update cache source_key = self._get_source_key(source_path) @@ -521,8 +1015,6 @@ def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") return True - except TableNotFoundError: - return False except Exception as e: logger.error( f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" @@ -541,27 +1033,48 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: """ self._validate_source_path(source_path) - table_path = self._get_table_path(source_path) + delta_table = self._get_or_create_delta_table(source_path) + if delta_table is None: + return None try: - delta_table = DeltaTable(str(table_path)) - # Get basic info schema = delta_table.schema() history = delta_table.history() + source_key = self._get_source_key(source_path) + + # Add pending batch info + pending_info = self.get_pending_batch_info() + pending_count = pending_info.get(source_key, 0) + + # Get original schema info + original_schema = self._load_original_schema(source_path) return { - "path": str(table_path), + "path": str(self._get_table_path(source_path)), "source_path": source_path, "schema": schema, + "original_schema": original_schema, "version": delta_table.version(), "num_files": len(delta_table.files()), "history_length": len(history), "latest_commit": history[0] if history else None, + "pending_records": pending_count, } - except TableNotFoundError: - return None except Exception as e: logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") return None + + def get_original_schema(self, source_path: tuple[str, ...]) -> pa.Schema | None: + """ + Get the original schema (without __entry_id column) for a source path. + + Args: + source_path: Tuple of path components + + Returns: + Original Arrow schema or None if not found + """ + self._validate_source_path(source_path) + return self._load_original_schema(source_path) From 1b7519e7a929e9e70695d5e257ee748f4787fa47 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 2 Jul 2025 00:57:46 +0000 Subject: [PATCH 043/224] feat: better handling of stores and add flushing to stores and pipeline --- src/orcapod/pipeline/pipeline.py | 25 ++++++++++++++++++++++--- src/orcapod/stores/types.py | 4 ++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index 7e04d96..a30aa62 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -34,13 +34,23 @@ def __init__( self, name: str | tuple[str, ...], pipeline_store: ArrowDataStore, - results_store: ArrowDataStore, + results_store: ArrowDataStore | None = None, auto_compile: bool = True, ) -> None: super().__init__() if not isinstance(name, tuple): name = (name,) self.name = name + self.pipeline_store_path_prefix = self.name + self.results_store_path_prefix = () + if results_store is None: + if pipeline_store is None: + raise ValueError( + "Either pipeline_store or results_store must be provided" + ) + results_store = pipeline_store + self.results_store_path_prefix = self.name + ("_results",) + self.pipeline_store = pipeline_store self.results_store = results_store self.labels_to_nodes = {} @@ -78,6 +88,12 @@ def save(self, path: Path | str) -> None: temp_path.unlink() raise + def flush(self) -> None: + """Flush all pending writes to the data store""" + self.pipeline_store.flush() + self.results_store.flush() + logger.info("Pipeline stores flushed") + def record(self, invocation: Invocation) -> None: """ Record an invocation in the pipeline. @@ -93,13 +109,14 @@ def wrap_invocation(self, kernel: Kernel, input_nodes: Collection[Node]) -> Node input_nodes, output_store=self.results_store, tag_store=self.pipeline_store, - store_path_prefix=self.name, + output_store_path_prefix=self.results_store_path_prefix, + tag_store_path_prefix=self.pipeline_store_path_prefix, ) return KernelNode( kernel, input_nodes, output_store=self.pipeline_store, - store_path_prefix=self.name, + store_path_prefix=self.pipeline_store_path_prefix, ) def compile(self): @@ -175,6 +192,8 @@ def run(self, full_sync: bool = False) -> None: node.reset_cache() node.flow() + self.flush() + @classmethod def load(cls, path: Path | str) -> "Pipeline": """Load complete pipeline state""" diff --git a/src/orcapod/stores/types.py b/src/orcapod/stores/types.py index da7e492..42b0ed5 100644 --- a/src/orcapod/stores/types.py +++ b/src/orcapod/stores/types.py @@ -80,3 +80,7 @@ def get_records_by_ids_as_polars( ) -> pl.LazyFrame | None: """Retrieve records by entry IDs as a single Polars DataFrame.""" ... + + def flush(self) -> None: + """Flush all pending writes/saves to the data store.""" + ... From 07fd76e340bfd30b69578766b1917cd27eacee17 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 2 Jul 2025 00:58:11 +0000 Subject: [PATCH 044/224] feat: integrate actual saving to parquet into simple in memory store --- src/orcapod/stores/arrow_data_stores.py | 205 ++++++++++++------------ 1 file changed, 102 insertions(+), 103 deletions(-) diff --git a/src/orcapod/stores/arrow_data_stores.py b/src/orcapod/stores/arrow_data_stores.py index 2897ead..9d001a6 100644 --- a/src/orcapod/stores/arrow_data_stores.py +++ b/src/orcapod/stores/arrow_data_stores.py @@ -8,6 +8,7 @@ from datetime import datetime, timedelta import logging from orcapod.stores.types import DuplicateError +from pathlib import Path # Module-level logger logger = logging.getLogger(__name__) @@ -101,7 +102,9 @@ class SimpleInMemoryDataStore: Uses dict of dict of Arrow tables for efficient storage and retrieval. """ - def __init__(self, duplicate_entry_behavior: str = "error"): + def __init__( + self, path: str | Path | None = None, duplicate_entry_behavior: str = "error" + ): """ Initialize the InMemoryArrowDataStore. @@ -120,6 +123,12 @@ def __init__(self, duplicate_entry_behavior: str = "error"): logger.info( f"Initialized InMemoryArrowDataStore with duplicate_entry_behavior='{duplicate_entry_behavior}'" ) + self.base_path = Path(path) if path else None + if self.base_path: + try: + self.base_path.mkdir(parents=True, exist_ok=True) + except Exception as e: + logger.error(f"Error creating base path {self.base_path}: {e}") def _get_source_key(self, source_path: tuple[str, ...]) -> str: """Generate key for source storage.""" @@ -170,10 +179,16 @@ def add_record( logger.debug(f"{action} record {entry_id} in {source_key}") return arrow_data + def load_existing_record(self, source_path: tuple[str, ...]): + source_key = self._get_source_key(source_path) + if self.base_path is not None and source_key not in self._in_memory_store: + self.load_from_parquet(self.base_path, source_path) + def get_record( self, source_path: tuple[str, ...], entry_id: str ) -> pa.Table | None: """Get a specific record.""" + self.load_existing_record(source_path) source_key = self._get_source_key(source_path) local_data = self._in_memory_store.get(source_key, {}) return local_data.get(entry_id) @@ -182,6 +197,7 @@ def get_all_records( self, source_path: tuple[str, ...], add_entry_id_column: bool | str = False ) -> pa.Table | None: """Retrieve all records for a given source as a single table.""" + self.load_existing_record(source_path) source_key = self._get_source_key(source_path) local_data = self._in_memory_store.get(source_key, {}) @@ -257,6 +273,8 @@ def get_records_by_ids( f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" ) + self.load_existing_record(source_path) + source_key = self._get_source_key(source_path) local_data = self._in_memory_store.get(source_key, {}) @@ -394,19 +412,12 @@ def save_to_parquet(self, base_path: str | Path) -> None: saved_count = 0 - for source_key, local_data in self._in_memory_store.items(): + for source_id, local_data in self._in_memory_store.items(): if not local_data: continue - # Parse source_name and source_id from the key - if ":" not in source_key: - logger.warning(f"Invalid source key format: {source_key}, skipping") - continue - - source_name, source_id = source_key.split(":", 1) - # Create directory structure - source_dir = base_path / source_name / source_id + source_dir = base_path / source_id source_dir.mkdir(parents=True, exist_ok=True) # Combine all tables for this source with entry_id column @@ -430,12 +441,14 @@ def save_to_parquet(self, base_path: str 
| Path) -> None: saved_count += 1 logger.debug( - f"Saved {len(combined_table)} records for {source_key} to {parquet_path}" + f"Saved {len(combined_table)} records for {source_id} to {parquet_path}" ) logger.info(f"Saved {saved_count} sources to Parquet files in {base_path}") - def load_from_parquet(self, base_path: str | Path) -> None: + def load_from_parquet( + self, base_path: str | Path, source_path: tuple[str, ...] + ) -> None: """ Load data from Parquet files with the expected directory structure. @@ -444,113 +457,99 @@ def load_from_parquet(self, base_path: str | Path) -> None: Args: base_path: Base directory path containing the Parquet files """ - base_path = Path(base_path) - if not base_path.exists(): + source_key = self._get_source_key(source_path) + target_path = Path(base_path) / source_key + + if not target_path.exists(): logger.warning(f"Base path {base_path} does not exist") return - # Clear existing data - self._in_memory_store.clear() - loaded_count = 0 - # Traverse directory structure: source_name/source_id/ - for source_name_dir in base_path.iterdir(): - if not source_name_dir.is_dir(): - continue - - source_name = source_name_dir.name + # Look for Parquet files in this directory + parquet_files = list(target_path.glob("*.parquet")) + if not parquet_files: + logger.debug(f"No Parquet files found in {target_path}") + return - for source_id_dir in source_name_dir.iterdir(): - if not source_id_dir.is_dir(): - continue + # Load all Parquet files and combine them + all_records = [] - source_id = source_id_dir.name - source_key = self._get_source_key((source_name, source_id)) + for parquet_file in parquet_files: + try: + import pyarrow.parquet as pq - # Look for Parquet files in this directory - parquet_files = list(source_id_dir.glob("*.parquet")) + table = pq.read_table(parquet_file) - if not parquet_files: - logger.debug(f"No Parquet files found in {source_id_dir}") + # Validate that __entry_id column exists + if "__entry_id" not in table.column_names: + logger.warning( + f"Parquet file {parquet_file} missing __entry_id column, skipping" + ) continue - # Load all Parquet files and combine them - all_records = [] + all_records.append(table) + logger.debug(f"Loaded {len(table)} records from {parquet_file}") - for parquet_file in parquet_files: - try: - import pyarrow.parquet as pq - - table = pq.read_table(parquet_file) - - # Validate that __entry_id column exists - if "__entry_id" not in table.column_names: - logger.warning( - f"Parquet file {parquet_file} missing __entry_id column, skipping" - ) - continue - - all_records.append(table) - logger.debug(f"Loaded {len(table)} records from {parquet_file}") - - except Exception as e: - logger.error(f"Failed to load Parquet file {parquet_file}: {e}") - continue + except Exception as e: + logger.error(f"Failed to load Parquet file {parquet_file}: {e}") + continue - # Process all records for this source - if all_records: - # Combine all tables - if len(all_records) == 1: - combined_table = all_records[0] - else: - combined_table = pa.concat_tables(all_records) - - # Split back into individual records by entry_id - local_data = {} - entry_ids = combined_table.column("__entry_id").to_pylist() - - # Group records by entry_id - entry_id_groups = {} - for i, entry_id in enumerate(entry_ids): - if entry_id not in entry_id_groups: - entry_id_groups[entry_id] = [] - entry_id_groups[entry_id].append(i) - - # Extract each entry_id's records - for entry_id, indices in entry_id_groups.items(): - # Take rows for this entry_id and remove 
__entry_id column - entry_table = combined_table.take(indices) - - # Remove __entry_id column - column_names = entry_table.column_names - if "__entry_id" in column_names: - indices_to_keep = [ - i - for i, name in enumerate(column_names) - if name != "__entry_id" - ] - entry_table = entry_table.select(indices_to_keep) - - local_data[entry_id] = entry_table - - self._in_memory_store[source_key] = local_data - loaded_count += 1 - - record_count = len(combined_table) - unique_entries = len(entry_id_groups) - logger.debug( - f"Loaded {record_count} records ({unique_entries} unique entries) for {source_key}" - ) + # Process all records for this source + if all_records: + # Combine all tables + if len(all_records) == 1: + combined_table = all_records[0] + else: + combined_table = pa.concat_tables(all_records) + + # Split back into individual records by entry_id + local_data = {} + entry_ids = combined_table.column("__entry_id").to_pylist() + + # Group records by entry_id + entry_id_groups = {} + for i, entry_id in enumerate(entry_ids): + if entry_id not in entry_id_groups: + entry_id_groups[entry_id] = [] + entry_id_groups[entry_id].append(i) + + # Extract each entry_id's records + for entry_id, indices in entry_id_groups.items(): + # Take rows for this entry_id and remove __entry_id column + entry_table = combined_table.take(indices) + + # Remove __entry_id column + column_names = entry_table.column_names + if "__entry_id" in column_names: + indices_to_keep = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] + entry_table = entry_table.select(indices_to_keep) + + local_data[entry_id] = entry_table + + self._in_memory_store[source_key] = local_data + loaded_count += 1 + + record_count = len(combined_table) + unique_entries = len(entry_id_groups) + logger.info( + f"Loaded {record_count} records ({unique_entries} unique entries) for {source_key}" + ) - logger.info(f"Loaded {loaded_count} sources from Parquet files in {base_path}") + def flush(self): + """ + Flush all in-memory data to Parquet files in the base path. + This will overwrite existing files. + """ + if self.base_path is None: + logger.warning("Base path is not set, cannot flush data") + return - # Log summary of loaded data - total_records = sum( - len(local_data) for local_data in self._in_memory_store.values() - ) - logger.info(f"Total records loaded: {total_records}") + logger.info(f"Flushing data to Parquet files in {self.base_path}") + self.save_to_parquet(self.base_path) @dataclass From 8411b40833ee28ced7a001be1545ea528004b995 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 2 Jul 2025 00:58:51 +0000 Subject: [PATCH 045/224] refactor: cleanup improt and comment out old packet converter for future removal --- src/orcapod/types/packet_converter.py | 364 +++++++++++++------------- src/orcapod/types/schemas.py | 1 - 2 files changed, 182 insertions(+), 183 deletions(-) diff --git a/src/orcapod/types/packet_converter.py b/src/orcapod/types/packet_converter.py index e486222..6edea00 100644 --- a/src/orcapod/types/packet_converter.py +++ b/src/orcapod/types/packet_converter.py @@ -1,182 +1,182 @@ -from orcapod.types.core import TypeSpec, TypeHandler -from orcapod.types.packets import Packet, PacketLike -from orcapod.types.semantic_type_registry import ( - SemanticTypeRegistry, - TypeInfo, - get_metadata_from_schema, - arrow_to_dicts, -) -from typing import Any -from collections.abc import Mapping, Sequence -import pyarrow as pa -import logging - -logger = logging.getLogger(__name__) - - -def is_packet_supported( - python_type_info: TypeSpec, - registry: SemanticTypeRegistry, - type_lut: dict | None = None, -) -> bool: - """Check if all types in the packet are supported by the registry or known to the default lut.""" - if type_lut is None: - type_lut = {} - return all( - python_type in registry or python_type in type_lut - for python_type in python_type_info.values() - ) - - -class PacketConverter: - def __init__(self, python_type_spec: TypeSpec, registry: SemanticTypeRegistry): - self.python_type_spec = python_type_spec - self.registry = registry - - # Lookup handlers and type info for fast access - self.handlers: dict[str, TypeHandler] = {} - self.storage_type_info: dict[str, TypeInfo] = {} - - self.expected_key_set = set(python_type_spec.keys()) - - # prepare the corresponding arrow table schema with metadata - self.keys_with_handlers, self.schema = create_schema_from_python_type_info( - python_type_spec, registry - ) - - self.semantic_type_lut = get_metadata_from_schema(self.schema, b"semantic_type") - - def _check_key_consistency(self, keys): - """Check if the provided keys match the expected keys.""" - keys_set = set(keys) - if keys_set != self.expected_key_set: - missing_keys = self.expected_key_set - keys_set - extra_keys = keys_set - self.expected_key_set - error_parts = [] - if missing_keys: - error_parts.append(f"Missing keys: {missing_keys}") - if extra_keys: - error_parts.append(f"Extra keys: {extra_keys}") - - raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") - - def _to_storage_packet(self, packet: PacketLike) -> dict[str, Any]: - """Convert packet to storage representation. - - Args: - packet: Dictionary mapping parameter names to Python values - - Returns: - Dictionary with same keys but values converted to storage format - - Raises: - KeyError: If packet keys don't match the expected type_info keys - TypeError: If value type doesn't match expected type - ValueError: If conversion fails - """ - # Validate packet keys - packet_keys = set(packet.keys()) - - self._check_key_consistency(packet_keys) - - # Convert each value - storage_packet: dict[str, Any] = dict(packet) # Start with a copy of the packet - - for key, handler in self.keys_with_handlers: - try: - storage_packet[key] = handler.python_to_storage(storage_packet[key]) - except Exception as e: - raise ValueError(f"Failed to convert value for '{key}': {e}") from e - - return storage_packet - - def _from_storage_packet(self, storage_packet: Mapping[str, Any]) -> PacketLike: - """Convert storage packet back to Python packet. 
- - Args: - storage_packet: Dictionary with values in storage format - - Returns: - Packet with values converted back to Python types - - Raises: - KeyError: If storage packet keys don't match the expected type_info keys - TypeError: If value type doesn't match expected type - ValueError: If conversion fails - """ - # Validate storage packet keys - storage_keys = set(storage_packet.keys()) - - self._check_key_consistency(storage_keys) - - # Convert each value back to Python type - packet: PacketLike = dict(storage_packet) - - for key, handler in self.keys_with_handlers: - try: - packet[key] = handler.storage_to_python(storage_packet[key]) - except Exception as e: - raise ValueError(f"Failed to convert value for '{key}': {e}") from e - - return packet - - def to_arrow_table(self, packet: PacketLike | Sequence[PacketLike]) -> pa.Table: - """Convert packet to PyArrow Table with field metadata. - - Args: - packet: Dictionary mapping parameter names to Python values - - Returns: - PyArrow Table with the packet data as a single row - """ - # Convert packet to storage format - if not isinstance(packet, Sequence): - packets = [packet] - else: - packets = packet - - storage_packets = [self._to_storage_packet(p) for p in packets] - - # Create arrays - arrays = [] - for field in self.schema: - values = [p[field.name] for p in storage_packets] - array = pa.array(values, type=field.type) - arrays.append(array) - - return pa.Table.from_arrays(arrays, schema=self.schema) - - def from_arrow_table( - self, table: pa.Table, verify_semantic_equivalence: bool = True - ) -> list[Packet]: - """Convert Arrow table to packet with field metadata. - - Args: - table: PyArrow Table with metadata - - Returns: - List of packets converted from the Arrow table - """ - # Check for consistency in the semantic type mapping: - semantic_type_info = get_metadata_from_schema(table.schema, b"semantic_type") - - if semantic_type_info != self.semantic_type_lut: - if not verify_semantic_equivalence: - logger.warning( - "Arrow table semantic types do not match expected type registry. " - f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" - ) - else: - raise ValueError( - "Arrow table semantic types do not match expected type registry. 
" - f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" - ) - - # Create packets from the Arrow table - # TODO: make this more efficient - storage_packets: list[Packet] = arrow_to_dicts(table) # type: ignore - if not self.keys_with_handlers: - # no special handling required - return storage_packets - - return [Packet(self._from_storage_packet(packet)) for packet in storage_packets] +# from orcapod.types.core import TypeSpec, TypeHandler +# from orcapod.types.packets import Packet, PacketLike +# from orcapod.types.semantic_type_registry import ( +# SemanticTypeRegistry, +# TypeInfo, +# get_metadata_from_schema, +# arrow_to_dicts, +# ) +# from typing import Any +# from collections.abc import Mapping, Sequence +# import pyarrow as pa +# import logging + +# logger = logging.getLogger(__name__) + + +# def is_packet_supported( +# python_type_info: TypeSpec, +# registry: SemanticTypeRegistry, +# type_lut: dict | None = None, +# ) -> bool: +# """Check if all types in the packet are supported by the registry or known to the default lut.""" +# if type_lut is None: +# type_lut = {} +# return all( +# python_type in registry or python_type in type_lut +# for python_type in python_type_info.values() +# ) + + +# class PacketConverter: +# def __init__(self, python_type_spec: TypeSpec, registry: SemanticTypeRegistry): +# self.python_type_spec = python_type_spec +# self.registry = registry + +# # Lookup handlers and type info for fast access +# self.handlers: dict[str, TypeHandler] = {} +# self.storage_type_info: dict[str, TypeInfo] = {} + +# self.expected_key_set = set(python_type_spec.keys()) + +# # prepare the corresponding arrow table schema with metadata +# self.keys_with_handlers, self.schema = create_schema_from_python_type_info( +# python_type_spec, registry +# ) + +# self.semantic_type_lut = get_metadata_from_schema(self.schema, b"semantic_type") + +# def _check_key_consistency(self, keys): +# """Check if the provided keys match the expected keys.""" +# keys_set = set(keys) +# if keys_set != self.expected_key_set: +# missing_keys = self.expected_key_set - keys_set +# extra_keys = keys_set - self.expected_key_set +# error_parts = [] +# if missing_keys: +# error_parts.append(f"Missing keys: {missing_keys}") +# if extra_keys: +# error_parts.append(f"Extra keys: {extra_keys}") + +# raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") + +# def _to_storage_packet(self, packet: PacketLike) -> dict[str, Any]: +# """Convert packet to storage representation. + +# Args: +# packet: Dictionary mapping parameter names to Python values + +# Returns: +# Dictionary with same keys but values converted to storage format + +# Raises: +# KeyError: If packet keys don't match the expected type_info keys +# TypeError: If value type doesn't match expected type +# ValueError: If conversion fails +# """ +# # Validate packet keys +# packet_keys = set(packet.keys()) + +# self._check_key_consistency(packet_keys) + +# # Convert each value +# storage_packet: dict[str, Any] = dict(packet) # Start with a copy of the packet + +# for key, handler in self.keys_with_handlers: +# try: +# storage_packet[key] = handler.python_to_storage(storage_packet[key]) +# except Exception as e: +# raise ValueError(f"Failed to convert value for '{key}': {e}") from e + +# return storage_packet + +# def _from_storage_packet(self, storage_packet: Mapping[str, Any]) -> PacketLike: +# """Convert storage packet back to Python packet. 
+ +# Args: +# storage_packet: Dictionary with values in storage format + +# Returns: +# Packet with values converted back to Python types + +# Raises: +# KeyError: If storage packet keys don't match the expected type_info keys +# TypeError: If value type doesn't match expected type +# ValueError: If conversion fails +# """ +# # Validate storage packet keys +# storage_keys = set(storage_packet.keys()) + +# self._check_key_consistency(storage_keys) + +# # Convert each value back to Python type +# packet: PacketLike = dict(storage_packet) + +# for key, handler in self.keys_with_handlers: +# try: +# packet[key] = handler.storage_to_python(storage_packet[key]) +# except Exception as e: +# raise ValueError(f"Failed to convert value for '{key}': {e}") from e + +# return packet + +# def to_arrow_table(self, packet: PacketLike | Sequence[PacketLike]) -> pa.Table: +# """Convert packet to PyArrow Table with field metadata. + +# Args: +# packet: Dictionary mapping parameter names to Python values + +# Returns: +# PyArrow Table with the packet data as a single row +# """ +# # Convert packet to storage format +# if not isinstance(packet, Sequence): +# packets = [packet] +# else: +# packets = packet + +# storage_packets = [self._to_storage_packet(p) for p in packets] + +# # Create arrays +# arrays = [] +# for field in self.schema: +# values = [p[field.name] for p in storage_packets] +# array = pa.array(values, type=field.type) +# arrays.append(array) + +# return pa.Table.from_arrays(arrays, schema=self.schema) + +# def from_arrow_table( +# self, table: pa.Table, verify_semantic_equivalence: bool = True +# ) -> list[Packet]: +# """Convert Arrow table to packet with field metadata. + +# Args: +# table: PyArrow Table with metadata + +# Returns: +# List of packets converted from the Arrow table +# """ +# # Check for consistency in the semantic type mapping: +# semantic_type_info = get_metadata_from_schema(table.schema, b"semantic_type") + +# if semantic_type_info != self.semantic_type_lut: +# if not verify_semantic_equivalence: +# logger.warning( +# "Arrow table semantic types do not match expected type registry. " +# f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" +# ) +# else: +# raise ValueError( +# "Arrow table semantic types do not match expected type registry. " +# f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" +# ) + +# # Create packets from the Arrow table +# # TODO: make this more efficient +# storage_packets: list[Packet] = arrow_to_dicts(table) # type: ignore +# if not self.keys_with_handlers: +# # no special handling required +# return storage_packets + +# return [Packet(self._from_storage_packet(packet)) for packet in storage_packets] diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py index dc2112f..35cc4f0 100644 --- a/src/orcapod/types/schemas.py +++ b/src/orcapod/types/schemas.py @@ -1,6 +1,5 @@ from orcapod.types import TypeSpec from orcapod.types.semantic_type_registry import SemanticTypeRegistry -from typing import Any import pyarrow as pa import datetime From d90e5c64f5615a5b4f14b960d7b01f3debc16e2e Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 3 Jul 2025 19:43:15 +0000 Subject: [PATCH 046/224] fix: attach label on kernel invocation to the invocation object --- src/orcapod/core/base.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 2144c34..64b99cb 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -53,8 +53,6 @@ def post_forward_hook(self, output_stream: "SyncStream", **kwargs) -> "SyncStrea def __call__( self, *streams: "SyncStream", label: str | None = None, **kwargs ) -> "SyncStream": - if label is not None: - self.label = label # Special handling of Source: trigger call on source if passed as stream normalized_streams = [ stream() if isinstance(stream, Source) else stream for stream in streams @@ -64,7 +62,7 @@ def __call__( output_stream = self.forward(*pre_processed_streams, **kwargs) post_processed_stream = self.post_forward_hook(output_stream, **kwargs) # create an invocation instance - invocation = Invocation(self, pre_processed_streams) + invocation = Invocation(self, pre_processed_streams, label=label) # label the output_stream with the invocation that produced the stream post_processed_stream.invocation = invocation @@ -458,6 +456,7 @@ def map( packet_map: dict | None = None, tag_map: dict | None = None, drop_unmapped: bool = True, + label: str | None = None, ) -> "SyncStream": """ Returns a new stream that is the result of mapping the packets and tags in the stream. @@ -470,9 +469,11 @@ def map( output = self if packet_map is not None: - output = MapPackets(packet_map, drop_unmapped=drop_unmapped)(output) + output = MapPackets(packet_map, drop_unmapped=drop_unmapped, label=label)( + output + ) if tag_map is not None: - output = MapTags(tag_map, drop_unmapped=drop_unmapped)(output) + output = MapTags(tag_map, drop_unmapped=drop_unmapped, label=label)(output) return output From fe35abacab3fa1d60a55d646346fc3ed75369314 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 3 Jul 2025 19:43:40 +0000 Subject: [PATCH 047/224] fix: invoke superclass init --- src/orcapod/core/operators.py | 40 +++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py index bcf63e3..5049e8e 100644 --- a/src/orcapod/core/operators.py +++ b/src/orcapod/core/operators.py @@ -23,8 +23,8 @@ class Repeat(Operator): The repeat count is the number of times to repeat each packet. """ - def __init__(self, repeat_count: int) -> None: - super().__init__() + def __init__(self, repeat_count: int, **kwargs) -> None: + super().__init__(**kwargs) if not isinstance(repeat_count, int): raise TypeError("repeat_count must be an integer") if repeat_count < 0: @@ -381,8 +381,10 @@ class MapPackets(Operator): drop_unmapped=False, in which case unmapped keys will be retained. """ - def __init__(self, key_map: dict[str, str], drop_unmapped: bool = True) -> None: - super().__init__() + def __init__( + self, key_map: dict[str, str], drop_unmapped: bool = True, **kwargs + ) -> None: + super().__init__(**kwargs) self.key_map = key_map self.drop_unmapped = drop_unmapped @@ -481,8 +483,8 @@ class DefaultTag(Operator): tag already contains the same key, it will not be overwritten. 
""" - def __init__(self, default_tag: Tag) -> None: - super().__init__() + def __init__(self, default_tag: Tag, **kwargs) -> None: + super().__init__(**kwargs) self.default_tag = default_tag def forward(self, *streams: SyncStream) -> SyncStream: @@ -527,8 +529,10 @@ class MapTags(Operator): drop_unmapped=False, in which case unmapped tags will be retained. """ - def __init__(self, key_map: dict[str, str], drop_unmapped: bool = True) -> None: - super().__init__() + def __init__( + self, key_map: dict[str, str], drop_unmapped: bool = True, **kwargs + ) -> None: + super().__init__(**kwargs) self.key_map = key_map self.drop_unmapped = drop_unmapped @@ -658,8 +662,8 @@ class Filter(Operator): The predicate function should return True for packets that should be kept and False for packets that should be dropped. """ - def __init__(self, predicate: Callable[[Tag, Packet], bool]): - super().__init__() + def __init__(self, predicate: Callable[[Tag, Packet], bool], **kwargs): + super().__init__(**kwargs) self.predicate = predicate def forward(self, *streams: SyncStream) -> SyncStream: @@ -704,8 +708,10 @@ class Transform(Operator): The transformation function should return a tuple of (new_tag, new_packet). """ - def __init__(self, transform: Callable[[Tag, Packet], tuple[Tag, Packet]]): - super().__init__() + def __init__( + self, transform: Callable[[Tag, Packet], tuple[Tag, Packet]], **kwargs + ): + super().__init__(**kwargs) self.transform = transform def forward(self, *streams: SyncStream) -> SyncStream: @@ -742,8 +748,9 @@ def __init__( batch_size: int, tag_processor: None | Callable[[Collection[Tag]], Tag] = None, drop_last: bool = True, + **kwargs, ): - super().__init__() + super().__init__(**kwargs) self.batch_size = batch_size if tag_processor is None: tag_processor = batch_tags # noqa: E731 @@ -806,8 +813,9 @@ def __init__( reduce_keys: bool = False, selection_function: Callable[[Collection[tuple[Tag, Packet]]], Collection[bool]] | None = None, + **kwargs, ) -> None: - super().__init__() + super().__init__(**kwargs) self.group_keys = group_keys self.reduce_keys = reduce_keys self.selection_function = selection_function @@ -875,8 +883,8 @@ class CacheStream(Operator): Call `clear_cache()` to clear the cache. """ - def __init__(self) -> None: - super().__init__() + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) self.cache: list[tuple[Tag, Packet]] = [] self.is_cached = False From ef301b38cce0528cef80df6861bbee3477d0fe46 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 3 Jul 2025 19:44:13 +0000 Subject: [PATCH 048/224] feat: expose explicit check for assigned label on content identifiable base --- src/orcapod/hashing/content_identifiable.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/orcapod/hashing/content_identifiable.py b/src/orcapod/hashing/content_identifiable.py index ce1b6c3..1e48243 100644 --- a/src/orcapod/hashing/content_identifiable.py +++ b/src/orcapod/hashing/content_identifiable.py @@ -30,6 +30,16 @@ def __init__( ) self._label = label + @property + def has_assigned_label(self) -> bool: + """ + Check if the label is explicitly set for this object. + + Returns: + bool: True if the label is explicitly set, False otherwise. + """ + return self._label is not None + @property def label(self) -> str: """ From ead67045e11b32fe10603d783012896c428564bf Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 3 Jul 2025 19:44:37 +0000 Subject: [PATCH 049/224] feat: add label on wrapped invocation --- src/orcapod/pipeline/pipeline.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index a30aa62..2a7d86e 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -102,7 +102,9 @@ def record(self, invocation: Invocation) -> None: super().record(invocation) self._dirty = True - def wrap_invocation(self, kernel: Kernel, input_nodes: Collection[Node]) -> Node: + def wrap_invocation( + self, kernel: Kernel, input_nodes: Collection[Node], label: str | None = None + ) -> Node: if isinstance(kernel, FunctionPod): return FunctionPodNode( kernel, @@ -111,12 +113,14 @@ def wrap_invocation(self, kernel: Kernel, input_nodes: Collection[Node]) -> Node tag_store=self.pipeline_store, output_store_path_prefix=self.results_store_path_prefix, tag_store_path_prefix=self.pipeline_store_path_prefix, + label=label, ) return KernelNode( kernel, input_nodes, output_store=self.pipeline_store, store_path_prefix=self.pipeline_store_path_prefix, + label=label, ) def compile(self): @@ -133,7 +137,11 @@ def compile(self): for invocation in nx.topological_sort(G): # map streams to the new streams based on Nodes input_nodes = [edge_lut[stream] for stream in invocation.streams] - new_node = self.wrap_invocation(invocation.kernel, input_nodes) + label = None + if invocation.has_assigned_label: + # If the invocation has a label, use it directly + label = invocation.label + new_node = self.wrap_invocation(invocation.kernel, input_nodes, label=label) # register the new node against the original invocation node_lut[invocation] = new_node From cbb8754bfd53ecdea067ed71306f34ce15e4efd7 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 3 Jul 2025 21:44:14 +0000 Subject: [PATCH 050/224] doc: add tutorial notebook --- .../01_quick_dive_into_orcapod.ipynb | 821 ++++++++++++++++++ 1 file changed, 821 insertions(+) create mode 100644 notebooks/tutorials/01_quick_dive_into_orcapod.ipynb diff --git a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb new file mode 100644 index 0000000..2f99783 --- /dev/null +++ b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb @@ -0,0 +1,821 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "27cdd37d", + "metadata": {}, + "outputs": [], + "source": [ + "import orcapod as op" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e776b8dc", + "metadata": {}, + "outputs": [], + "source": [ + "N = 10\n", + "stream = op.SyncStreamFromLists(\n", + " tags=[{\"id\": i} for i in range(N)],\n", + " packets=[{\"x\": i, \"y\": i + 1} for i in range(N)],\n", + " tag_typespec={\"id\": int},\n", + " packet_typespec={\"x\": int, \"y\": int},\n", + " label=\"MySource\",\n", + ")\n", + "\n", + "word_stream = op.SyncStreamFromLists(\n", + " tags=[{\"id\": i} for i in range(N)],\n", + " packets=[{\"word1\": f\"hello {i}\", \"word2\": f\"world {i}\"} for i in range(N)],\n", + " tag_typespec={\"id\": int},\n", + " packet_typespec={\"word1\": str, \"word2\": str},\n", + " label=\"HelloWorld\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "78ab941b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'id': 0} {'x': 0, 'y': 1}\n", + "{'id': 1} {'x': 1, 'y': 2}\n", + "{'id': 2} {'x': 2, 'y': 3}\n", + "{'id': 3} {'x': 3, 'y': 4}\n", + "{'id': 4} {'x': 4, 'y': 5}\n", + "{'id': 5} {'x': 5, 'y': 6}\n", + "{'id': 6} {'x': 6, 'y': 7}\n", + "{'id': 7} {'x': 7, 'y': 8}\n", + "{'id': 8} {'x': 8, 'y': 9}\n", + "{'id': 9} {'x': 9, 'y': 10}\n" + ] + } + ], + "source": [ + "for tag, packet in stream:\n", + " print(tag, packet)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c32596f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'id': 0} {'word1': 'hello 0', 'word2': 'world 0'}\n", + "{'id': 1} {'word1': 'hello 1', 'word2': 'world 1'}\n", + "{'id': 2} {'word1': 'hello 2', 'word2': 'world 2'}\n", + "{'id': 3} {'word1': 'hello 3', 'word2': 'world 3'}\n", + "{'id': 4} {'word1': 'hello 4', 'word2': 'world 4'}\n", + "{'id': 5} {'word1': 'hello 5', 'word2': 'world 5'}\n", + "{'id': 6} {'word1': 'hello 6', 'word2': 'world 6'}\n", + "{'id': 7} {'word1': 'hello 7', 'word2': 'world 7'}\n", + "{'id': 8} {'word1': 'hello 8', 'word2': 'world 8'}\n", + "{'id': 9} {'word1': 'hello 9', 'word2': 'world 9'}\n" + ] + } + ], + "source": [ + "for tag, packet in word_stream:\n", + " print(tag, packet)" + ] + }, + { + "cell_type": "markdown", + "id": "ea7eb5ed", + "metadata": {}, + "source": [ + "## Defining function pods" + ] + }, + { + "cell_type": "markdown", + "id": "891bbadf", + "metadata": {}, + "source": [ + "Now we define our own function pods to perform simple computation. \n", + "Defining a function pod is quite simple, you simply \n", + "1. define a regular function with type annotations\n", + "2. 
decorate with `op.function_pod`, passing in the name ('key') for the output value(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8f5d5dbc", + "metadata": {}, + "outputs": [], + "source": [ + "@op.function_pod(\"total\")\n", + "def total(x: int, y: int) -> int:\n", + " return x + y\n", + "\n", + "\n", + "@op.function_pod(\"delta\")\n", + "def delta(x: int, y: int) -> int:\n", + " return 2 * y - x\n", + "\n", + "\n", + "@op.function_pod(\"mult\")\n", + "def mult(x: int, y: int) -> int:\n", + " return x * y\n", + "\n", + "\n", + "@op.function_pod(\"concat_string\")\n", + "def concat(x: str, y: str) -> str:\n", + " return x + y\n" + ] + }, + { + "cell_type": "markdown", + "id": "bd843166", + "metadata": {}, + "source": [ + "Wrapped functions are now `FunctionPod` and expects to be called with streams as inputs. You can still access the original function through its `function` attribute." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c0a191b2", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Expected SyncStream, got int for stream 5", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# this won't work, because it's expecting a stream as input\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:60\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 58\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m streams:\n\u001b[32m 59\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, SyncStream):\n\u001b[32m---> \u001b[39m\u001b[32m60\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 61\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mExpected SyncStream, got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(stream).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m for stream \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstream\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 62\u001b[39m )\n\u001b[32m 63\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source):\n\u001b[32m 64\u001b[39m \u001b[38;5;66;03m# if the stream is a Source, instantiate it\u001b[39;00m\n\u001b[32m 65\u001b[39m stream = stream()\n", + "\u001b[31mTypeError\u001b[39m: Expected SyncStream, got int for stream 5" + ] + } + ], + "source": [ + "# this won't work, because it's expecting a stream as input\n", + "total(5, 6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88a9b698", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# but you can access original function this way\n", + "total.function(5, 6)" + ] + }, 
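+  {
+   "cell_type": "markdown",
+   "id": "a1b2c3d4",
+   "metadata": {},
+   "source": [
+    "Because the original callable remains available via `.function`, you can unit test it like any plain Python function, with no stream machinery involved. A minimal sketch (a hypothetical check, using only the pods defined above):\n",
+    "\n",
+    "```python\n",
+    "# direct calls bypass streams entirely\n",
+    "assert total.function(2, 3) == 5\n",
+    "assert delta.function(2, 3) == 4   # 2 * y - x\n",
+    "assert concat.function(\"foo\", \"bar\") == \"foobar\"\n",
+    "```"
+   ]
+  },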
+ { + "cell_type": "code", + "execution_count": 7, + "id": "c8ad097f", + "metadata": {}, + "outputs": [], + "source": [ + "# Passing a stream into a pod does NOT immediately trigger execution, but rather returns another stream\n", + "\n", + "total_stream = total(stream)" + ] + }, + { + "cell_type": "markdown", + "id": "0af7a165", + "metadata": {}, + "source": [ + "Iterating through the stream or calling `flow` triggers the computation" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "93c3f1a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'id': 0} {'total': 1}\n", + "{'id': 1} {'total': 3}\n", + "{'id': 2} {'total': 5}\n", + "{'id': 3} {'total': 7}\n", + "{'id': 4} {'total': 9}\n", + "{'id': 5} {'total': 11}\n", + "{'id': 6} {'total': 13}\n", + "{'id': 7} {'total': 15}\n", + "{'id': 8} {'total': 17}\n", + "{'id': 9} {'total': 19}\n" + ] + } + ], + "source": [ + "for tag, packet in total_stream:\n", + " print(tag, packet)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cfadfb8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[({'id': 0}, {'total': 1}),\n", + " ({'id': 1}, {'total': 3}),\n", + " ({'id': 2}, {'total': 5}),\n", + " ({'id': 3}, {'total': 7}),\n", + " ({'id': 4}, {'total': 9}),\n", + " ({'id': 5}, {'total': 11}),\n", + " ({'id': 6}, {'total': 13}),\n", + " ({'id': 7}, {'total': 15}),\n", + " ({'id': 8}, {'total': 17}),\n", + " ({'id': 9}, {'total': 19})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_stream.flow()" + ] + }, + { + "cell_type": "markdown", + "id": "d1013dd1", + "metadata": {}, + "source": [ + "If you try to pass in an incompatible stream (stream whose packets don't match the expected inputs of the function), you will immediately get an error." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2805282e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Key 'word1' not found in parameter types.\n" + ] + }, + { + "ename": "TypeError", + "evalue": "Input packet types {'word1': , 'word2': } is not compatible with the function's expected input types {'x': , 'y': }", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m total_stream = \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_stream\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:75\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 69\u001b[39m normalized_streams = [\n\u001b[32m 70\u001b[39m stream() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source) \u001b[38;5;28;01melse\u001b[39;00m stream\n\u001b[32m 71\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m verified_streams\n\u001b[32m 72\u001b[39m ]\n\u001b[32m 74\u001b[39m pre_processed_streams = \u001b[38;5;28mself\u001b[39m.pre_forward_hook(*normalized_streams, **kwargs)\n\u001b[32m---> \u001b[39m\u001b[32m75\u001b[39m output_stream = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43mpre_processed_streams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 76\u001b[39m post_processed_stream = \u001b[38;5;28mself\u001b[39m.post_forward_hook(output_stream, **kwargs)\n\u001b[32m 77\u001b[39m \u001b[38;5;66;03m# create an invocation instance\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:236\u001b[39m, in \u001b[36mFunctionPod.forward\u001b[39m\u001b[34m(self, *streams, **kwargs)\u001b[39m\n\u001b[32m 232\u001b[39m _, packet_typespec = stream.types(trigger_run=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 233\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m packet_typespec \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m check_typespec_compatibility(\n\u001b[32m 234\u001b[39m packet_typespec, \u001b[38;5;28mself\u001b[39m.function_input_typespec\n\u001b[32m 235\u001b[39m ):\n\u001b[32m--> \u001b[39m\u001b[32m236\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 237\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInput packet types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m is not compatible with the function\u001b[39m\u001b[33m'\u001b[39m\u001b[33ms expected input types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.function_input_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 238\u001b[39m )\n\u001b[32m 239\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28msuper\u001b[39m().forward(*streams, **kwargs)\n", + "\u001b[31mTypeError\u001b[39m: Input packet types {'word1': , 'word2': } is not compatible with the function's expected input types {'x': , 'y': }" + ] + } + ], + "source": [ + "total_stream = total(word_stream)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4c9c030a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "({'id': int}, {'x': int, 'y': int})" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can check the tag and packet types of the stream\n", + "stream.types()" + ] + }, + { + "cell_type": "markdown", + "id": "3ba299b2", + "metadata": {}, + "source": [ + "## Defining pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "1e1dd036", + "metadata": {}, + "source": [ + "We will now piece together multiple function pods into a pipeline. We do this by instantiating a `Pipeline` object. We will store the results into a simple data store." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8083f54a", + "metadata": {}, + "outputs": [], + "source": [ + "# Use simple data store, saving data to Parquet files\n", + "pipeline_store = op.stores.SimpleParquetDataStore(\"./example_data_store\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a475308c", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = op.Pipeline(\"test_pipeline\", pipeline_store)\n" + ] + }, + { + "cell_type": "markdown", + "id": "a42158b9", + "metadata": {}, + "source": [ + "Now we have a pipeline object, we can use it to define our pipeline by simply \"chaining\" together function pod calls." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f923ecf1", + "metadata": {}, + "outputs": [], + "source": [ + "with pipeline:\n", + " total_stream = total(stream)\n", + " delta_stream = delta(stream)\n", + " mult_stream = mult(\n", + " total_stream.map({\"total\": \"x\"}), delta_stream.map({\"delta\": \"y\"})\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "b67e9413", + "metadata": {}, + "source": [ + "And that's it! Now the elements of the pipeline is available as properties on the pipeline." + ] + }, + { + "cell_type": "markdown", + "id": "7ee41a20", + "metadata": {}, + "source": [ + "By default, the function pods are made available under the function's name in the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "66230603", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "FunctionPodNode>" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.total" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6587f2f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "FunctionPodNode>" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.mult" + ] + }, + { + "cell_type": "markdown", + "id": "16d0dba3", + "metadata": {}, + "source": [ + "Other implicitly created nodes such as joining of two streams are made available under the corresponding operator class (e.g. 
`Join`)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "bd0dfba2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "KernelNode" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.Join" + ] + }, + { + "cell_type": "markdown", + "id": "71dba5c5", + "metadata": {}, + "source": [ + "You can list out all nodes through `nodes` property" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e22758ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'MySource': KernelNode,\n", + " 'total': FunctionPodNode>,\n", + " 'delta': FunctionPodNode>,\n", + " 'MapPackets_0': KernelNode,\n", + " 'MapPackets_1': KernelNode,\n", + " 'Join': KernelNode,\n", + " 'mult': FunctionPodNode>}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.nodes" + ] + }, + { + "cell_type": "markdown", + "id": "039b617f", + "metadata": {}, + "source": [ + "You can easily rename any node using the pipeline's `rename` method" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0d1a470e", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.rename(\"MapPackets_0\", \"total_map\")\n", + "pipeline.rename(\"MapPackets_1\", \"mult_map\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "3a43984d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'MySource': KernelNode,\n", + " 'total': FunctionPodNode>,\n", + " 'delta': FunctionPodNode>,\n", + " 'Join': KernelNode,\n", + " 'mult': FunctionPodNode>,\n", + " 'total_map': KernelNode,\n", + " 'mult_map': KernelNode}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.nodes" + ] + }, + { + "cell_type": "markdown", + "id": "c438f111", + "metadata": {}, + "source": [ + "Renaming does NOT change the structure of the pipeline in anyway -- it simply changes how it's labeld for your convenience." + ] + }, + { + "cell_type": "markdown", + "id": "befa6107", + "metadata": {}, + "source": [ + "### Running pipeline and accessing results" + ] + }, + { + "cell_type": "markdown", + "id": "4d4412b1", + "metadata": {}, + "source": [ + "Since we just created the pipeline, there are no results associated with any node. You can get [Polars](https://pola.rs) DataFrame viewing into the results through the node's `df` attribute." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "96106e09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 2)
[HTML table markup stripped; columns id (i64), total (i64); values identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (10, 2)\n", + "┌─────┬───────┐\n", + "│ id ┆ total │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═══════╡\n", + "│ 0 ┆ 1 │\n", + "│ 1 ┆ 3 │\n", + "│ 2 ┆ 5 │\n", + "│ 3 ┆ 7 │\n", + "│ 4 ┆ 9 │\n", + "│ 5 ┆ 11 │\n", + "│ 6 ┆ 13 │\n", + "│ 7 ┆ 15 │\n", + "│ 8 ┆ 17 │\n", + "│ 9 ┆ 19 │\n", + "└─────┴───────┘" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.total.df" + ] + }, + { + "cell_type": "markdown", + "id": "62b7e59a", + "metadata": {}, + "source": [ + "Before we run, the source nodes is also not \"recorded\" and thus will appear empty." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "33b449b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 3)
[HTML table markup stripped; columns id (i64), x (i64), y (i64); values identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (10, 3)\n", + "┌─────┬─────┬─────┐\n", + "│ id ┆ x ┆ y │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 │\n", + "╞═════╪═════╪═════╡\n", + "│ 0 ┆ 0 ┆ 1 │\n", + "│ 1 ┆ 1 ┆ 2 │\n", + "│ 2 ┆ 2 ┆ 3 │\n", + "│ 3 ┆ 3 ┆ 4 │\n", + "│ 4 ┆ 4 ┆ 5 │\n", + "│ 5 ┆ 5 ┆ 6 │\n", + "│ 6 ┆ 6 ┆ 7 │\n", + "│ 7 ┆ 7 ┆ 8 │\n", + "│ 8 ┆ 8 ┆ 9 │\n", + "│ 9 ┆ 9 ┆ 10 │\n", + "└─────┴─────┴─────┘" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.MySource.df" + ] + }, + { + "cell_type": "markdown", + "id": "408e8012", + "metadata": {}, + "source": [ + "We can trigger the entire pipeline to run and record all results by simply calling the `run` method." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "189f943f", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1674bec4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 3)
[HTML table markup stripped; columns id (i64), x (i64), y (i64); values identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (10, 3)\n", + "┌─────┬─────┬─────┐\n", + "│ id ┆ x ┆ y │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 │\n", + "╞═════╪═════╪═════╡\n", + "│ 0 ┆ 0 ┆ 1 │\n", + "│ 1 ┆ 1 ┆ 2 │\n", + "│ 2 ┆ 2 ┆ 3 │\n", + "│ 3 ┆ 3 ┆ 4 │\n", + "│ 4 ┆ 4 ┆ 5 │\n", + "│ 5 ┆ 5 ┆ 6 │\n", + "│ 6 ┆ 6 ┆ 7 │\n", + "│ 7 ┆ 7 ┆ 8 │\n", + "│ 8 ┆ 8 ┆ 9 │\n", + "│ 9 ┆ 9 ┆ 10 │\n", + "└─────┴─────┴─────┘" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.MySource.df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2b69d213", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 2)
[HTML table markup stripped; columns id (i64), total (i64); values identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (10, 2)\n", + "┌─────┬───────┐\n", + "│ id ┆ total │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═══════╡\n", + "│ 0 ┆ 1 │\n", + "│ 1 ┆ 3 │\n", + "│ 2 ┆ 5 │\n", + "│ 3 ┆ 7 │\n", + "│ 4 ┆ 9 │\n", + "│ 5 ┆ 11 │\n", + "│ 6 ┆ 13 │\n", + "│ 7 ┆ 15 │\n", + "│ 8 ┆ 17 │\n", + "│ 9 ┆ 19 │\n", + "└─────┴───────┘" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.total.df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "orcapod (3.13.3)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 73b2638aec7fe45989b8a35c805ee7ac8e777019 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 3 Jul 2025 21:45:01 +0000 Subject: [PATCH 051/224] refactor: clean up store package --- src/orcapod/stores/arrow_data_stores.py | 28 +- ...a_store.py => dict_transfer_data_store.py} | 2 +- src/orcapod/stores/optimized_memory_store.py | 445 ------------------ 3 files changed, 20 insertions(+), 455 deletions(-) rename src/orcapod/stores/{transfer_data_store.py => dict_transfer_data_store.py} (96%) delete mode 100644 src/orcapod/stores/optimized_memory_store.py diff --git a/src/orcapod/stores/arrow_data_stores.py b/src/orcapod/stores/arrow_data_stores.py index 9d001a6..93a2400 100644 --- a/src/orcapod/stores/arrow_data_stores.py +++ b/src/orcapod/stores/arrow_data_stores.py @@ -17,8 +17,11 @@ class MockArrowDataStore: """ Mock Arrow data store for testing purposes. - This class simulates the behavior of ParquetArrowDataStore without actually saving anything. - It is useful for unit tests where you want to avoid filesystem dependencies. + This class simulates the behavior of ArrowDataStore without actually saving anything. + It is useful for unit tests where you want to avoid any I/O operations or when you need + to test the behavior of your code without relying on external systems. If you need some + persistence of saved data, consider using SimpleParquetDataStore without providing a + file path instead. """ def __init__(self): @@ -93,13 +96,20 @@ def get_records_by_ids_as_polars( return None -class SimpleInMemoryDataStore: +class SimpleParquetDataStore: """ - In-memory Arrow data store, primarily to be used for testing purposes. - This class simulates the behavior of ParquetArrowDataStore without actual file I/O. - It is useful for unit tests where you want to avoid filesystem dependencies. - - Uses dict of dict of Arrow tables for efficient storage and retrieval. + Simple Parquet-based Arrow data store, primarily to be used for development purposes. + If no file path is provided, it will not save anything to disk. Instead, all data will be stored in memory. + If a file path is provided, it will save data to a single Parquet files in a directory structure reflecting + the provided source_path. To speed up the process, data will be stored in memory and only saved to disk + when the `flush` method is called. If used as part of pipeline, flush is automatically called + at the end of pipeline execution. + Note that this store provides only very basic functionality and is not suitable for production use. 
+ For each distinct source_path, only a single parquet file is created to store all data entries. + Appending is not efficient as it requires reading the entire file into the memory, appending new data, + and then writing the entire file back to disk. This is not suitable for large datasets or frequent updates. + However, for development/testing purposes, this data store provides a simple way to store and retrieve + data without the overhead of a full database or file system and provides very high performance. """ def __init__( @@ -462,7 +472,7 @@ def load_from_parquet( target_path = Path(base_path) / source_key if not target_path.exists(): - logger.warning(f"Base path {base_path} does not exist") + logger.info(f"Base path {base_path} does not exist") return loaded_count = 0 diff --git a/src/orcapod/stores/transfer_data_store.py b/src/orcapod/stores/dict_transfer_data_store.py similarity index 96% rename from src/orcapod/stores/transfer_data_store.py rename to src/orcapod/stores/dict_transfer_data_store.py index 9e393e0..7e8762f 100644 --- a/src/orcapod/stores/transfer_data_store.py +++ b/src/orcapod/stores/dict_transfer_data_store.py @@ -6,7 +6,7 @@ class TransferDataStore(DataStore): """ - A data store that allows transferring memoized packets between different data stores. + A data store that allows transferring recorded data between different data stores. This is useful for moving data between different storage backends. """ diff --git a/src/orcapod/stores/optimized_memory_store.py b/src/orcapod/stores/optimized_memory_store.py deleted file mode 100644 index 1859113..0000000 --- a/src/orcapod/stores/optimized_memory_store.py +++ /dev/null @@ -1,445 +0,0 @@ -import polars as pl -import pyarrow as pa -import logging -from typing import Any, Dict, List, Tuple, cast -from collections import defaultdict - -# Module-level logger -logger = logging.getLogger(__name__) - - -class ArrowBatchedPolarsDataStore: - """ - Arrow-batched Polars data store that minimizes Arrow<->Polars conversions. - - Key optimizations: - 1. Keep data in Arrow format during batching - 2. Only convert to Polars when consolidating or querying - 3. Batch Arrow tables and concatenate before conversion - 4. Maintain Arrow-based indexing for fast lookups - 5. Lazy Polars conversion only when needed - """ - - def __init__(self, duplicate_entry_behavior: str = "error", batch_size: int = 100): - """ - Initialize the ArrowBatchedPolarsDataStore. 
- - Args: - duplicate_entry_behavior: How to handle duplicate entry_ids: - - 'error': Raise ValueError when entry_id already exists - - 'overwrite': Replace existing entry with new data - batch_size: Number of records to batch before consolidating - """ - if duplicate_entry_behavior not in ["error", "overwrite"]: - raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") - - self.duplicate_entry_behavior = duplicate_entry_behavior - self.batch_size = batch_size - - # Arrow batch buffer: {source_key: [(entry_id, arrow_table), ...]} - self._arrow_batches: Dict[str, List[Tuple[str, pa.Table]]] = defaultdict(list) - - # Consolidated Polars store: {source_key: polars_dataframe} - self._polars_store: Dict[str, pl.DataFrame] = {} - - # Entry ID index for fast lookups: {source_key: set[entry_ids]} - self._entry_index: Dict[str, set] = defaultdict(set) - - # Schema cache - self._schema_cache: Dict[str, pa.Schema] = {} - - logger.info( - f"Initialized ArrowBatchedPolarsDataStore with " - f"duplicate_entry_behavior='{duplicate_entry_behavior}', batch_size={batch_size}" - ) - - def _get_source_key(self, source_name: str, source_id: str) -> str: - """Generate key for source storage.""" - return f"{source_name}:{source_id}" - - def _add_entry_id_to_arrow_table(self, table: pa.Table, entry_id: str) -> pa.Table: - """Add entry_id column to Arrow table efficiently.""" - # Create entry_id array with the same length as the table - entry_id_array = pa.array([entry_id] * len(table), type=pa.string()) - - # Add column at the beginning for consistent ordering - return table.add_column(0, "__entry_id", entry_id_array) - - def _consolidate_arrow_batch(self, source_key: str) -> None: - """Consolidate Arrow batch into Polars DataFrame.""" - if source_key not in self._arrow_batches or not self._arrow_batches[source_key]: - return - - logger.debug( - f"Consolidating {len(self._arrow_batches[source_key])} Arrow tables for {source_key}" - ) - - # Prepare all Arrow tables with entry_id columns - arrow_tables_with_id = [] - - for entry_id, arrow_table in self._arrow_batches[source_key]: - table_with_id = self._add_entry_id_to_arrow_table(arrow_table, entry_id) - arrow_tables_with_id.append(table_with_id) - - # Concatenate all Arrow tables at once (very fast) - if len(arrow_tables_with_id) == 1: - consolidated_arrow = arrow_tables_with_id[0] - else: - consolidated_arrow = pa.concat_tables(arrow_tables_with_id) - - # Single conversion to Polars - new_polars_df = cast(pl.DataFrame, pl.from_arrow(consolidated_arrow)) - - # Combine with existing Polars DataFrame if it exists - if source_key in self._polars_store: - existing_df = self._polars_store[source_key] - self._polars_store[source_key] = pl.concat([existing_df, new_polars_df]) - else: - self._polars_store[source_key] = new_polars_df - - # Clear the Arrow batch - self._arrow_batches[source_key].clear() - - logger.debug( - f"Consolidated to Polars DataFrame with {len(self._polars_store[source_key])} total rows" - ) - - def _force_consolidation(self, source_key: str) -> None: - """Force consolidation of Arrow batches.""" - if source_key in self._arrow_batches and self._arrow_batches[source_key]: - self._consolidate_arrow_batch(source_key) - - def _get_consolidated_dataframe(self, source_key: str) -> pl.DataFrame | None: - """Get consolidated Polars DataFrame, forcing consolidation if needed.""" - self._force_consolidation(source_key) - return self._polars_store.get(source_key) - - def add_record( - self, - source_name: str, - source_id: str, - entry_id: 
str, - arrow_data: pa.Table, - ) -> pa.Table: - """ - Add a record to the store using Arrow batching. - - This is the fastest path - no conversions, just Arrow table storage. - """ - source_key = self._get_source_key(source_name, source_id) - - # Check for duplicate entry - if entry_id in self._entry_index[source_key]: - if self.duplicate_entry_behavior == "error": - raise ValueError( - f"Entry '{entry_id}' already exists in {source_name}/{source_id}. " - f"Use duplicate_entry_behavior='overwrite' to allow updates." - ) - else: - # Handle overwrite: remove from both Arrow batch and Polars store - # Remove from Arrow batch - self._arrow_batches[source_key] = [ - (eid, table) - for eid, table in self._arrow_batches[source_key] - if eid != entry_id - ] - - # Remove from Polars store if it exists - if source_key in self._polars_store: - self._polars_store[source_key] = self._polars_store[ - source_key - ].filter(pl.col("__entry_id") != entry_id) - - # Schema validation (cached) - if source_key in self._schema_cache: - if not self._schema_cache[source_key].equals(arrow_data.schema): - raise ValueError( - f"Schema mismatch for {source_key}. " - f"Expected: {self._schema_cache[source_key]}, " - f"Got: {arrow_data.schema}" - ) - else: - self._schema_cache[source_key] = arrow_data.schema - - # Add to Arrow batch (no conversion yet!) - self._arrow_batches[source_key].append((entry_id, arrow_data)) - self._entry_index[source_key].add(entry_id) - - # Consolidate if batch is full - if len(self._arrow_batches[source_key]) >= self.batch_size: - self._consolidate_arrow_batch(source_key) - - logger.debug(f"Added entry {entry_id} to Arrow batch for {source_key}") - return arrow_data - - def get_record( - self, source_name: str, source_id: str, entry_id: str - ) -> pa.Table | None: - """Get a specific record with optimized lookup.""" - source_key = self._get_source_key(source_name, source_id) - - # Quick existence check - if entry_id not in self._entry_index[source_key]: - return None - - # Check Arrow batch first (most recent data) - for batch_entry_id, arrow_table in self._arrow_batches[source_key]: - if batch_entry_id == entry_id: - return arrow_table - - # Check consolidated Polars store - df = self._get_consolidated_dataframe(source_key) - if df is None: - return None - - # Filter and convert back to Arrow - filtered_df = df.filter(pl.col("__entry_id") == entry_id).drop("__entry_id") - - if filtered_df.height == 0: - return None - - return filtered_df.to_arrow() - - def get_all_records( - self, source_name: str, source_id: str, add_entry_id_column: bool | str = False - ) -> pa.Table | None: - """Retrieve all records as a single Arrow table.""" - source_key = self._get_source_key(source_name, source_id) - - # Force consolidation to include all data - df = self._get_consolidated_dataframe(source_key) - if df is None or df.height == 0: - return None - - # Handle entry_id column - if add_entry_id_column is False: - result_df = df.drop("__entry_id") - elif add_entry_id_column is True: - result_df = df - elif isinstance(add_entry_id_column, str): - result_df = df.rename({"__entry_id": add_entry_id_column}) - else: - result_df = df.drop("__entry_id") - - return result_df.to_arrow() - - def get_all_records_as_polars( - self, source_name: str, source_id: str - ) -> pl.LazyFrame | None: - """Retrieve all records as a Polars LazyFrame.""" - source_key = self._get_source_key(source_name, source_id) - - df = self._get_consolidated_dataframe(source_key) - if df is None or df.height == 0: - return None - - return 
df.drop("__entry_id").lazy() - - def get_records_by_ids( - self, - source_name: str, - source_id: str, - entry_ids: list[str] | pl.Series | pa.Array, - add_entry_id_column: bool | str = False, - preserve_input_order: bool = False, - ) -> pa.Table | None: - """Retrieve records by entry IDs efficiently.""" - # Convert input to list for processing - if isinstance(entry_ids, list): - if not entry_ids: - return None - entry_ids_list = entry_ids - elif isinstance(entry_ids, pl.Series): - if len(entry_ids) == 0: - return None - entry_ids_list = entry_ids.to_list() - elif isinstance(entry_ids, pa.Array): - if len(entry_ids) == 0: - return None - entry_ids_list = entry_ids.to_pylist() - else: - raise TypeError(f"entry_ids must be list[str], pl.Series, or pa.Array") - - source_key = self._get_source_key(source_name, source_id) - - # Quick filter using index - existing_entries = [ - entry_id - for entry_id in entry_ids_list - if entry_id in self._entry_index[source_key] - ] - - if not existing_entries and not preserve_input_order: - return None - - # Collect from Arrow batch first - batch_tables = [] - found_in_batch = set() - - for entry_id, arrow_table in self._arrow_batches[source_key]: - if entry_id in entry_ids_list: - table_with_id = self._add_entry_id_to_arrow_table(arrow_table, entry_id) - batch_tables.append(table_with_id) - found_in_batch.add(entry_id) - - # Get remaining from consolidated store - remaining_ids = [eid for eid in existing_entries if eid not in found_in_batch] - - consolidated_tables = [] - if remaining_ids: - df = self._get_consolidated_dataframe(source_key) - if df is not None: - if preserve_input_order: - ordered_df = pl.DataFrame({"__entry_id": entry_ids_list}) - result_df = ordered_df.join(df, on="__entry_id", how="left") - else: - result_df = df.filter(pl.col("__entry_id").is_in(remaining_ids)) - - if result_df.height > 0: - consolidated_tables.append(result_df.to_arrow()) - - # Combine all results - all_tables = batch_tables + consolidated_tables - - if not all_tables: - return None - - # Concatenate Arrow tables - if len(all_tables) == 1: - result_table = all_tables[0] - else: - result_table = pa.concat_tables(all_tables) - - # Handle entry_id column - if add_entry_id_column is False: - # Remove __entry_id column - column_names = result_table.column_names - if "__entry_id" in column_names: - indices = [ - i for i, name in enumerate(column_names) if name != "__entry_id" - ] - result_table = result_table.select(indices) - elif isinstance(add_entry_id_column, str): - # Rename __entry_id column - schema = result_table.schema - new_names = [ - add_entry_id_column if name == "__entry_id" else name - for name in schema.names - ] - result_table = result_table.rename_columns(new_names) - - return result_table - - def get_records_by_ids_as_polars( - self, - source_name: str, - source_id: str, - entry_ids: list[str] | pl.Series | pa.Array, - add_entry_id_column: bool | str = False, - preserve_input_order: bool = False, - ) -> pl.LazyFrame | None: - """Retrieve records by entry IDs as Polars LazyFrame.""" - arrow_result = self.get_records_by_ids( - source_name, source_id, entry_ids, add_entry_id_column, preserve_input_order - ) - - if arrow_result is None: - return None - - pl_result = cast(pl.DataFrame, pl.from_arrow(arrow_result)) - - return pl_result.lazy() - - def entry_exists(self, source_name: str, source_id: str, entry_id: str) -> bool: - """Check if entry exists using the index.""" - source_key = self._get_source_key(source_name, source_id) - return entry_id in 
self._entry_index[source_key] - - def list_entries(self, source_name: str, source_id: str) -> set[str]: - """List all entry IDs using the index.""" - source_key = self._get_source_key(source_name, source_id) - return self._entry_index[source_key].copy() - - def list_sources(self) -> set[tuple[str, str]]: - """List all source combinations.""" - sources = set() - for source_key in self._entry_index.keys(): - if ":" in source_key: - source_name, source_id = source_key.split(":", 1) - sources.add((source_name, source_id)) - return sources - - def force_consolidation(self) -> None: - """Force consolidation of all Arrow batches.""" - for source_key in list(self._arrow_batches.keys()): - self._force_consolidation(source_key) - logger.info("Forced consolidation of all Arrow batches") - - def clear_source(self, source_name: str, source_id: str) -> None: - """Clear all data for a source.""" - source_key = self._get_source_key(source_name, source_id) - - if source_key in self._arrow_batches: - del self._arrow_batches[source_key] - if source_key in self._polars_store: - del self._polars_store[source_key] - if source_key in self._entry_index: - del self._entry_index[source_key] - if source_key in self._schema_cache: - del self._schema_cache[source_key] - - logger.debug(f"Cleared source {source_key}") - - def clear_all(self) -> None: - """Clear all data.""" - self._arrow_batches.clear() - self._polars_store.clear() - self._entry_index.clear() - self._schema_cache.clear() - logger.info("Cleared all data") - - def get_stats(self) -> dict[str, Any]: - """Get comprehensive statistics.""" - total_records = sum(len(entries) for entries in self._entry_index.values()) - total_batched = sum(len(batch) for batch in self._arrow_batches.values()) - total_consolidated = ( - sum(len(df) for df in self._polars_store.values()) - if self._polars_store - else 0 - ) - - source_stats = [] - for source_key in self._entry_index.keys(): - record_count = len(self._entry_index[source_key]) - batched_count = len(self._arrow_batches.get(source_key, [])) - consolidated_count = 0 - - if source_key in self._polars_store: - consolidated_count = len(self._polars_store[source_key]) - - source_stats.append( - { - "source_key": source_key, - "total_records": record_count, - "batched_records": batched_count, - "consolidated_records": consolidated_count, - } - ) - - return { - "total_records": total_records, - "total_sources": len(self._entry_index), - "total_batched": total_batched, - "total_consolidated": total_consolidated, - "batch_size": self.batch_size, - "duplicate_entry_behavior": self.duplicate_entry_behavior, - "source_details": source_stats, - } - - def optimize_for_reads(self) -> None: - """Optimize for read operations by consolidating all batches.""" - logger.info("Optimizing for reads - consolidating all Arrow batches...") - self.force_consolidation() - # Clear Arrow batches to save memory - self._arrow_batches.clear() - logger.info("Optimization complete") From 555a751c2cd808445c0fff9e41055f11c4a9b180 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 3 Jul 2025 21:45:59 +0000 Subject: [PATCH 052/224] feat: improve pipeline usability with typechecks and convenience attributes --- src/orcapod/core/base.py | 15 ++++++++++++++- src/orcapod/core/pod.py | 19 ++++++++++++++++++- src/orcapod/pipeline/nodes.py | 14 ++++++++++++-- src/orcapod/pipeline/pipeline.py | 26 +++++++++++++++++++++----- src/orcapod/types/core.py | 2 +- 5 files changed, 66 insertions(+), 10 deletions(-) diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index 64b99cb..367bc72 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -53,9 +53,22 @@ def post_forward_hook(self, output_stream: "SyncStream", **kwargs) -> "SyncStrea def __call__( self, *streams: "SyncStream", label: str | None = None, **kwargs ) -> "SyncStream": + # check that inputs are stream instances and if it's source, instantiate it + verified_streams = [] + for stream in streams: + if not isinstance(stream, SyncStream): + raise TypeError( + f"Expected SyncStream, got {type(stream).__name__} for stream {stream}" + ) + if isinstance(stream, Source): + # if the stream is a Source, instantiate it + stream = stream() + verified_streams.append(stream) + # Special handling of Source: trigger call on source if passed as stream normalized_streams = [ - stream() if isinstance(stream, Source) else stream for stream in streams + stream() if isinstance(stream, Source) else stream + for stream in verified_streams ] pre_processed_streams = self.pre_forward_hook(*normalized_streams, **kwargs) diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index d64bafa..92d8568 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -8,7 +8,10 @@ ) from orcapod.types import Packet, Tag, TypeSpec, default_registry -from orcapod.types.typespec_utils import extract_function_typespecs +from orcapod.types.typespec_utils import ( + extract_function_typespecs, + check_typespec_compatibility, +) from orcapod.types.packets import PacketConverter from orcapod.hashing import ( @@ -221,6 +224,20 @@ def __init__( self.function_output_typespec, self.registry ) + def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: + assert len(streams) == 1, ( + "Only one stream is supported in forward() of FunctionPod" + ) + stream = streams[0] + _, packet_typespec = stream.types(trigger_run=False) + if packet_typespec is not None and not check_typespec_compatibility( + packet_typespec, self.function_input_typespec + ): + raise TypeError( + f"Input packet types {packet_typespec} is not compatible with the function's expected input types {self.function_input_typespec}" + ) + return super().forward(*streams, **kwargs) + def get_function_typespecs(self) -> tuple[TypeSpec, TypeSpec]: return self.function_input_typespec, self.function_output_typespec diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index fabb664..07d9eb4 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -169,7 +169,10 @@ def update_cached_values(self): self.kernel_hash = self.kernel_hasher.hash_to_hex( self.kernel, prefix_hasher_id=True ) - self.tag_keys, self.packet_keys = self.keys(trigger_run=False) + tag_keys, packet_keys = self.keys(trigger_run=False) + self.tag_keys = tuple(tag_keys) if tag_keys is not None else None + self.packet_keys = tuple(packet_keys) if packet_keys is not None else None + self.tag_typespec, self.packet_typespec = self.types(trigger_run=False) if self.tag_typespec is None or self.packet_typespec is None: raise ValueError( @@ -248,7 
+251,14 @@ def output_iterator_completion_hook(self) -> None: @property def lazy_df(self) -> pl.LazyFrame | None: - return self.output_store.get_all_records_as_polars(self.store_path) + lazydf = self.output_store.get_all_records_as_polars(self.store_path) + if lazydf is None: + return None + if self.tag_keys is None or self.packet_keys is None: + raise ValueError( + "CachedKernelWrapper has no tag keys or packet keys defined, and currently this is not supported" + ) + return lazydf.select(self.tag_keys + self.packet_keys) @property def df(self) -> pl.DataFrame | None: diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/pipeline.py index 2a7d86e..1fb5236 100644 --- a/src/orcapod/pipeline/pipeline.py +++ b/src/orcapod/pipeline/pipeline.py @@ -53,7 +53,7 @@ def __init__( self.pipeline_store = pipeline_store self.results_store = results_store - self.labels_to_nodes = {} + self.nodes = {} self.auto_compile = auto_compile self._dirty = False self._ordered_nodes = [] # Track order of invocations @@ -167,7 +167,8 @@ def compile(self): nodes[0].label = label labels_to_nodes[label] = nodes[0] - self.labels_to_nodes = labels_to_nodes + # store as pipeline's nodes attribute + self.nodes = labels_to_nodes self._dirty = False return node_lut, edge_lut, proposed_labels, labels_to_nodes @@ -178,13 +179,28 @@ def __exit__(self, exc_type, exc_val, ext_tb): def __getattr__(self, item: str) -> Any: """Allow direct access to pipeline attributes""" - if item in self.labels_to_nodes: - return self.labels_to_nodes[item] + if item in self.nodes: + return self.nodes[item] raise AttributeError(f"Pipeline has no attribute '{item}'") def __dir__(self): # Include both regular attributes and dynamic ones - return list(super().__dir__()) + list(self.labels_to_nodes.keys()) + return list(super().__dir__()) + list(self.nodes.keys()) + + def rename(self, old_name: str, new_name: str) -> None: + """ + Rename a node in the pipeline. + This will update the label and the internal mapping. + """ + if old_name not in self.nodes: + raise KeyError(f"Node '{old_name}' does not exist in the pipeline.") + if new_name in self.nodes: + raise KeyError(f"Node '{new_name}' already exists in the pipeline.") + node = self.nodes[old_name] + del self.nodes[old_name] + node.label = new_name + self.nodes[new_name] = node + logger.info(f"Node '{old_name}' renamed to '{new_name}'") def run(self, full_sync: bool = False) -> None: """ diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 62c100d..22491ae 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -15,7 +15,7 @@ # an (optional) string or a collection of (optional) string values # Note that TagValue can be nested, allowing for an arbitrary depth of nested lists -TagValue: TypeAlias = str | None | Collection["TagValue"] +TagValue: TypeAlias = int | str | None | Collection["TagValue"] # the top level tag is a mapping from string keys to values that can be a string or # an arbitrary depth of nested list of strings or None From 083134b050c2c443ad65a8100f4e90177c00634d Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 3 Jul 2025 21:46:11 +0000 Subject: [PATCH 053/224] fix: use new store name --- src/orcapod/stores/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/stores/__init__.py index 281874b..1114c11 100644 --- a/src/orcapod/stores/__init__.py +++ b/src/orcapod/stores/__init__.py @@ -1,5 +1,5 @@ from .types import DataStore, ArrowDataStore -from .arrow_data_stores import MockArrowDataStore, SimpleInMemoryDataStore +from .arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore from .dict_data_stores import DirDataStore, NoOpDataStore from .safe_dir_data_store import SafeDirDataStore @@ -10,5 +10,5 @@ "SafeDirDataStore", "NoOpDataStore", "MockArrowDataStore", - "SimpleInMemoryDataStore", + "SimpleParquetDataStore", ] From 7e33bae162e2296ca1f2dff366860ec942780261 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 3 Jul 2025 21:46:24 +0000 Subject: [PATCH 054/224] test: update to use new package name --- tests/test_store/test_transfer_data_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index 21ed4c9..4721691 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -7,7 +7,7 @@ from orcapod.hashing.types import LegacyPacketHasher from orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore -from orcapod.stores.transfer_data_store import TransferDataStore +from orcapod.stores.dict_transfer_data_store import TransferDataStore class MockPacketHasher(LegacyPacketHasher): From 5641810382d8769ff925e3b7ba1ea8e156d4aaf5 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 3 Jul 2025 21:50:33 +0000 Subject: [PATCH 055/224] fix: wrong import --- src/orcapod/hashing/arrow_hashers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index a7b5a01..465b29b 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -4,11 +4,11 @@ import polars as pl import json from orcapod.hashing.types import SemanticTypeHasher, StringCacher -from orcapod.hashing import arrow_serialization_old +from orcapod.hashing import arrow_serialization from collections.abc import Callable SERIALIZATION_METHOD_LUT: dict[str, Callable[[pa.Table], bytes]] = { - "logical": arrow_serialization_old.serialize_table_logical, + "logical": arrow_serialization.serialize_table_logical, } From c66920c13613cb347d661b820c89041aaed45b91 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 3 Jul 2025 22:12:05 +0000 Subject: [PATCH 056/224] doc: handle typing corner cases --- src/orcapod/stores/arrow_data_stores.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/orcapod/stores/arrow_data_stores.py b/src/orcapod/stores/arrow_data_stores.py index 93a2400..0a9a7e9 100644 --- a/src/orcapod/stores/arrow_data_stores.py +++ b/src/orcapod/stores/arrow_data_stores.py @@ -1841,6 +1841,8 @@ def get_all_records( df = self.get_all_records_as_polars( source_name, source_id, add_entry_id_column=add_entry_id_column ) + if df is None: + return None return df.collect().to_arrow() def get_all_records_as_polars( @@ -1917,9 +1919,9 @@ def get_records_by_ids( elif isinstance(entry_ids, pa.Array): if len(entry_ids) == 0: return None - entry_ids_series = pl.from_arrow(pa.table({"entry_id": entry_ids}))[ - "entry_id" - ] + entry_ids_series: pl.Series = pl.from_arrow( + pa.table({"entry_id": entry_ids}) + )["entry_id"] # type: ignore else: raise TypeError( f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" @@ -1993,7 +1995,8 @@ def get_records_by_ids_as_polars( return None # Convert to Polars LazyFrame - return pl.from_arrow(arrow_result).lazy() + df = cast(pl.DataFrame, pl.from_arrow(arrow_result)) + return df.lazy() def entry_exists(self, source_name: str, source_id: str, entry_id: str) -> bool: """Check if a specific entry exists.""" From 7ace5a454fc47a90abde778affee40595163b498 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 5 Jul 2025 05:36:32 +0000 Subject: [PATCH 057/224] doc: reorganize tutorials --- .../01_orcapod_core_concepts copy.ipynb | 0 .../02_orcapod_basic_usage copy.ipynb | 0 .../02_orcapod_basic_usage.ipynb | 0 .../03_orcacapod_qol_features.ipynb | 0 .../04_orcapod_tracker.ipynb | 0 .../05_orcabridge_dj_integration.ipynb | 0 .../01_quick_dive_into_orcapod.ipynb | 351 ++++++++++-------- 7 files changed, 203 insertions(+), 148 deletions(-) rename notebooks/{ => old_tutorials}/01_orcapod_core_concepts copy.ipynb (100%) rename notebooks/{ => old_tutorials}/02_orcapod_basic_usage copy.ipynb (100%) rename notebooks/{ => old_tutorials}/02_orcapod_basic_usage.ipynb (100%) rename notebooks/{ => old_tutorials}/03_orcacapod_qol_features.ipynb (100%) rename notebooks/{ => old_tutorials}/04_orcapod_tracker.ipynb (100%) rename notebooks/{ => old_tutorials}/05_orcabridge_dj_integration.ipynb (100%) diff --git a/notebooks/01_orcapod_core_concepts copy.ipynb b/notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb similarity index 100% rename from notebooks/01_orcapod_core_concepts copy.ipynb rename to notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb diff --git a/notebooks/02_orcapod_basic_usage copy.ipynb b/notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb similarity index 100% rename from notebooks/02_orcapod_basic_usage copy.ipynb rename to notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb diff --git a/notebooks/02_orcapod_basic_usage.ipynb b/notebooks/old_tutorials/02_orcapod_basic_usage.ipynb similarity index 100% rename from notebooks/02_orcapod_basic_usage.ipynb rename to notebooks/old_tutorials/02_orcapod_basic_usage.ipynb diff --git a/notebooks/03_orcacapod_qol_features.ipynb b/notebooks/old_tutorials/03_orcacapod_qol_features.ipynb similarity index 100% rename from notebooks/03_orcacapod_qol_features.ipynb rename to notebooks/old_tutorials/03_orcacapod_qol_features.ipynb diff --git a/notebooks/04_orcapod_tracker.ipynb b/notebooks/old_tutorials/04_orcapod_tracker.ipynb 
similarity index 100% rename from notebooks/04_orcapod_tracker.ipynb rename to notebooks/old_tutorials/04_orcapod_tracker.ipynb diff --git a/notebooks/05_orcabridge_dj_integration.ipynb b/notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb similarity index 100% rename from notebooks/05_orcabridge_dj_integration.ipynb rename to notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb diff --git a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb index 2f99783..b09f745 100644 --- a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb +++ b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "27cdd37d", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "e776b8dc", + "id": "9cd4692c", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "78ab941b", "metadata": {}, "outputs": [ @@ -65,8 +65,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "c32596f5", + "execution_count": 3, + "id": "ef13511e", "metadata": {}, "outputs": [ { @@ -112,8 +112,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "8f5d5dbc", + "execution_count": 4, + "id": "f8781072", "metadata": {}, "outputs": [], "source": [ @@ -147,8 +147,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "c0a191b2", + "execution_count": 5, + "id": "7b8f8056", "metadata": {}, "outputs": [ { @@ -158,7 +158,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# this won't work, because it's expecting a stream as input\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# this won't work, because it's expecting a stream as input\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:60\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 58\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m streams:\n\u001b[32m 59\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, SyncStream):\n\u001b[32m---> \u001b[39m\u001b[32m60\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 61\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mExpected SyncStream, got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(stream).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m for stream 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstream\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 62\u001b[39m )\n\u001b[32m 63\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source):\n\u001b[32m 64\u001b[39m \u001b[38;5;66;03m# if the stream is a Source, instantiate it\u001b[39;00m\n\u001b[32m 65\u001b[39m stream = stream()\n", "\u001b[31mTypeError\u001b[39m: Expected SyncStream, got int for stream 5" ] @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88a9b698", + "id": "fba23537", "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ "11" ] }, - "execution_count": 7, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -193,8 +193,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "c8ad097f", + "execution_count": null, + "id": "e56ffa7d", "metadata": {}, "outputs": [], "source": [ @@ -213,24 +213,19 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "93c3f1a7", + "execution_count": 6, + "id": "4c9017c9", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'id': 0} {'total': 1}\n", - "{'id': 1} {'total': 3}\n", - "{'id': 2} {'total': 5}\n", - "{'id': 3} {'total': 7}\n", - "{'id': 4} {'total': 9}\n", - "{'id': 5} {'total': 11}\n", - "{'id': 6} {'total': 13}\n", - "{'id': 7} {'total': 15}\n", - "{'id': 8} {'total': 17}\n", - "{'id': 9} {'total': 19}\n" + "ename": "NameError", + "evalue": "name 'total_stream' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtotal_stream\u001b[49m:\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(tag, packet)\n", + "\u001b[31mNameError\u001b[39m: name 'total_stream' is not defined" ] } ], @@ -241,28 +236,20 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "cfadfb8f", + "execution_count": 7, + "id": "59104716", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[({'id': 0}, {'total': 1}),\n", - " ({'id': 1}, {'total': 3}),\n", - " ({'id': 2}, {'total': 5}),\n", - " ({'id': 3}, {'total': 7}),\n", - " ({'id': 4}, {'total': 9}),\n", - " ({'id': 5}, {'total': 11}),\n", - " ({'id': 6}, {'total': 13}),\n", - " ({'id': 7}, {'total': 15}),\n", - " ({'id': 8}, {'total': 17}),\n", - " ({'id': 9}, {'total': 19})]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'total_stream' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mtotal_stream\u001b[49m.flow()\n", + "\u001b[31mNameError\u001b[39m: name 'total_stream' is not defined" + ] } ], "source": [ @@ -279,8 +266,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "2805282e", + "execution_count": 8, + "id": "77547b4d", "metadata": {}, "outputs": [ { @@ -297,7 +284,7 @@ "traceback": 
[ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m total_stream = \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_stream\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m total_stream = \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_stream\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:75\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 69\u001b[39m normalized_streams = [\n\u001b[32m 70\u001b[39m stream() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source) \u001b[38;5;28;01melse\u001b[39;00m stream\n\u001b[32m 71\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m verified_streams\n\u001b[32m 72\u001b[39m ]\n\u001b[32m 74\u001b[39m pre_processed_streams = \u001b[38;5;28mself\u001b[39m.pre_forward_hook(*normalized_streams, **kwargs)\n\u001b[32m---> \u001b[39m\u001b[32m75\u001b[39m output_stream = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43mpre_processed_streams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 76\u001b[39m post_processed_stream = \u001b[38;5;28mself\u001b[39m.post_forward_hook(output_stream, **kwargs)\n\u001b[32m 77\u001b[39m \u001b[38;5;66;03m# create an invocation instance\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:236\u001b[39m, in \u001b[36mFunctionPod.forward\u001b[39m\u001b[34m(self, *streams, **kwargs)\u001b[39m\n\u001b[32m 232\u001b[39m _, packet_typespec = stream.types(trigger_run=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 233\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m packet_typespec \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m check_typespec_compatibility(\n\u001b[32m 234\u001b[39m packet_typespec, \u001b[38;5;28mself\u001b[39m.function_input_typespec\n\u001b[32m 235\u001b[39m ):\n\u001b[32m--> \u001b[39m\u001b[32m236\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 237\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInput packet types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m is not compatible with the function\u001b[39m\u001b[33m'\u001b[39m\u001b[33ms expected input types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.function_input_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 238\u001b[39m )\n\u001b[32m 239\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m().forward(*streams, **kwargs)\n", "\u001b[31mTypeError\u001b[39m: Input packet types {'word1': , 'word2': } is not compatible with the function's expected input types {'x': , 
'y': }" @@ -310,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "4c9c030a", "metadata": {}, "outputs": [ @@ -320,7 +307,29 @@ "({'id': int}, {'x': int, 'y': int})" ] }, - "execution_count": 11, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can check the tag and packet types of the stream\n", + "stream.types()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "34338baf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "({'id': int}, {'x': int, 'y': int})" + ] + }, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -348,18 +357,20 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "8083f54a", "metadata": {}, "outputs": [], "source": [ "# Use simple data store, saving data to Parquet files\n", - "pipeline_store = op.stores.SimpleParquetDataStore(\"./example_data_store\")" + "from orcapod.stores.delta_table_arrow_data_store import DeltaTableArrowDataStore\n", + "\n", + "pipeline_store = DeltaTableArrowDataStore(\"./delta_store\", batch_size=100)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "a475308c", "metadata": {}, "outputs": [], @@ -377,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "f923ecf1", "metadata": {}, "outputs": [], @@ -408,17 +419,77 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, + "id": "64746ada", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "Memoizing single packet return 2 packets!", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[14]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/pipeline.py:217\u001b[39m, in \u001b[36mPipeline.run\u001b[39m\u001b[34m(self, full_sync)\u001b[39m\n\u001b[32m 215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m full_sync:\n\u001b[32m 216\u001b[39m node.reset_cache()\n\u001b[32m--> \u001b[39m\u001b[32m217\u001b[39m \u001b[43mnode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mflow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 219\u001b[39m \u001b[38;5;28mself\u001b[39m.flush()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:336\u001b[39m, in 
\u001b[36mStream.flow\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 331\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mflow\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Collection[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 332\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 333\u001b[39m \u001b[33;03m Flow everything through the stream, returning the entire collection of\u001b[39;00m\n\u001b[32m 334\u001b[39m \u001b[33;03m (Tag, Packet) as a collection. This will tigger any upstream computation of the stream.\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:590\u001b[39m, in \u001b[36mSource.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 586\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 587\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 588\u001b[39m \u001b[33;03m Simple iter method that allows for Source object to act as a stream.\u001b[39;00m\n\u001b[32m 589\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m590\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/streams.py:99\u001b[39m, in \u001b[36mSyncStreamFromGenerator.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.check_consistency:\n\u001b[32m---> \u001b[39m\u001b[32m99\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.generator_factory()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:107\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 105\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 108\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mwarn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 109\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:94\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m stream:\n\u001b[32m 93\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m tag, output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpacket\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 96\u001b[39m logger.debug(\n\u001b[32m 97\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCall returned None as output for tag \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtag\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. Skipping...\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 98\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:629\u001b[39m, in \u001b[36mCachedFunctionPodWrapper.call\u001b[39m\u001b[34m(self, tag, packet)\u001b[39m\n\u001b[32m 627\u001b[39m output_packet = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 628\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.skip_memoization_lookup:\n\u001b[32m--> \u001b[39m\u001b[32m629\u001b[39m output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_retrieve_memoized_with_packet_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacket_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 631\u001b[39m logger.debug(\n\u001b[32m 632\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoized output for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_key\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found, skipping computation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 633\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:573\u001b[39m, in \u001b[36mCachedFunctionPodWrapper._retrieve_memoized_with_packet_key\u001b[39m\u001b[34m(self, packet_key)\u001b[39m\n\u001b[32m 571\u001b[39m packets = \u001b[38;5;28mself\u001b[39m.output_converter.from_arrow_table_to_python_packets(arrow_table)\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# since memoizing single packet, it should only contain one packet\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m573\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(packets) == \u001b[32m1\u001b[39m, (\n\u001b[32m 574\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoizing single packet return \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(packets)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m packets!\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 575\u001b[39m )\n\u001b[32m 576\u001b[39m 
\u001b[38;5;28;01mreturn\u001b[39;00m packets[\u001b[32m0\u001b[39m]\n", + "\u001b[31mAssertionError\u001b[39m: Memoizing single packet return 2 packets!" + ] + } + ], + "source": [ + "pipeline.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "id": "66230603", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "FunctionPodNode>" + "FunctionPodNode>" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -429,17 +500,17 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "6587f2f2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "FunctionPodNode>" + "FunctionPodNode>" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -458,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "bd0dfba2", "metadata": {}, "outputs": [ @@ -468,7 +539,7 @@ "KernelNode" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -487,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "e22758ab", "metadata": {}, "outputs": [ @@ -495,15 +566,15 @@ "data": { "text/plain": [ "{'MySource': KernelNode,\n", - " 'total': FunctionPodNode>,\n", - " 'delta': FunctionPodNode>,\n", + " 'total': FunctionPodNode>,\n", + " 'delta': FunctionPodNode>,\n", " 'MapPackets_0': KernelNode,\n", " 'MapPackets_1': KernelNode,\n", " 'Join': KernelNode,\n", - " 'mult': FunctionPodNode>}" + " 'mult': FunctionPodNode>}" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -522,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "id": "0d1a470e", "metadata": {}, "outputs": [], @@ -533,7 +604,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "id": "3a43984d", "metadata": {}, "outputs": [ @@ -541,15 +612,15 @@ "data": { "text/plain": [ "{'MySource': KernelNode,\n", - " 'total': FunctionPodNode>,\n", - " 'delta': FunctionPodNode>,\n", + " 'total': FunctionPodNode>,\n", + " 'delta': FunctionPodNode>,\n", " 'Join': KernelNode,\n", - " 'mult': FunctionPodNode>,\n", + " 'mult': FunctionPodNode>,\n", " 'total_map': KernelNode,\n", " 'mult_map': KernelNode}" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -584,45 +655,16 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "id": "96106e09", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n",
"[Polars DataFrame HTML preview (shape: (10, 2), columns: id, total); the HTML table markup was lost in extraction, and the same data appears in the text/plain rendering below]\n",
" - ], - "text/plain": [ - "shape: (10, 2)\n", - "┌─────┬───────┐\n", - "│ id ┆ total │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═══════╡\n", - "│ 0 ┆ 1 │\n", - "│ 1 ┆ 3 │\n", - "│ 2 ┆ 5 │\n", - "│ 3 ┆ 7 │\n", - "│ 4 ┆ 9 │\n", - "│ 5 ┆ 11 │\n", - "│ 6 ┆ 13 │\n", - "│ 7 ┆ 15 │\n", - "│ 8 ┆ 17 │\n", - "│ 9 ┆ 19 │\n", - "└─────┴───────┘" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n" + ] } ], "source": [ @@ -639,45 +681,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "id": "33b449b6", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n",
"[Polars DataFrame HTML preview (shape: (10, 3), columns: id, x, y); the HTML table markup was lost in extraction, and the same data appears in the text/plain rendering below]\n",
" - ], - "text/plain": [ - "shape: (10, 3)\n", - "┌─────┬─────┬─────┐\n", - "│ id ┆ x ┆ y │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ i64 ┆ i64 │\n", - "╞═════╪═════╪═════╡\n", - "│ 0 ┆ 0 ┆ 1 │\n", - "│ 1 ┆ 1 ┆ 2 │\n", - "│ 2 ┆ 2 ┆ 3 │\n", - "│ 3 ┆ 3 ┆ 4 │\n", - "│ 4 ┆ 4 ┆ 5 │\n", - "│ 5 ┆ 5 ┆ 6 │\n", - "│ 6 ┆ 6 ┆ 7 │\n", - "│ 7 ┆ 7 ┆ 8 │\n", - "│ 8 ┆ 8 ┆ 9 │\n", - "│ 9 ┆ 9 ┆ 10 │\n", - "└─────┴─────┴─────┘" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n" + ] } ], "source": [ @@ -694,10 +707,52 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 18, "id": "189f943f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "Memoizing single packet return 2 packets!", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/pipeline.py:217\u001b[39m, in \u001b[36mPipeline.run\u001b[39m\u001b[34m(self, full_sync)\u001b[39m\n\u001b[32m 215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m full_sync:\n\u001b[32m 216\u001b[39m node.reset_cache()\n\u001b[32m--> \u001b[39m\u001b[32m217\u001b[39m \u001b[43mnode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mflow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 219\u001b[39m \u001b[38;5;28mself\u001b[39m.flush()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:336\u001b[39m, in \u001b[36mStream.flow\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 331\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mflow\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Collection[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 332\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 333\u001b[39m \u001b[33;03m Flow everything through the stream, returning the entire collection of\u001b[39;00m\n\u001b[32m 334\u001b[39m \u001b[33;03m (Tag, Packet) as a collection. 
This will tigger any upstream computation of the stream.\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:590\u001b[39m, in \u001b[36mSource.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 586\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 587\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 588\u001b[39m \u001b[33;03m Simple iter method that allows for Source object to act as a stream.\u001b[39;00m\n\u001b[32m 589\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m590\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/streams.py:99\u001b[39m, in \u001b[36mSyncStreamFromGenerator.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.check_consistency:\n\u001b[32m---> \u001b[39m\u001b[32m99\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.generator_factory()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:107\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 105\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 108\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mwarn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 109\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:94\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m stream:\n\u001b[32m 93\u001b[39m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m tag, output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpacket\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 96\u001b[39m logger.debug(\n\u001b[32m 97\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCall returned None as output for tag \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtag\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. Skipping...\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 98\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:629\u001b[39m, in \u001b[36mCachedFunctionPodWrapper.call\u001b[39m\u001b[34m(self, tag, packet)\u001b[39m\n\u001b[32m 627\u001b[39m output_packet = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 628\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.skip_memoization_lookup:\n\u001b[32m--> \u001b[39m\u001b[32m629\u001b[39m output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_retrieve_memoized_with_packet_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacket_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 631\u001b[39m logger.debug(\n\u001b[32m 632\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoized output for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_key\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found, skipping computation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 633\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:573\u001b[39m, in \u001b[36mCachedFunctionPodWrapper._retrieve_memoized_with_packet_key\u001b[39m\u001b[34m(self, packet_key)\u001b[39m\n\u001b[32m 571\u001b[39m packets = \u001b[38;5;28mself\u001b[39m.output_converter.from_arrow_table_to_python_packets(arrow_table)\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# since memoizing single packet, it should only contain one packet\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m573\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(packets) == \u001b[32m1\u001b[39m, (\n\u001b[32m 574\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoizing single packet return \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(packets)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m packets!\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 575\u001b[39m )\n\u001b[32m 576\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m packets[\u001b[32m0\u001b[39m]\n", + "\u001b[31mAssertionError\u001b[39m: Memoizing single packet return 2 packets!" + ] + } + ], "source": [ "pipeline.run()" ] From 6f41f52497590152394114f2225403e0d28e2ea0 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 5 Jul 2025 05:37:58 +0000 Subject: [PATCH 058/224] feat: cleaned up delta store --- src/orcapod/pipeline/nodes.py | 22 +- .../stores/delta_table_arrow_data_store.py | 662 ++++++------------ 2 files changed, 239 insertions(+), 445 deletions(-) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 07d9eb4..405714f 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -196,13 +196,15 @@ def update_cached_values(self): def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: logger.info(f"Returning cached outputs for {self}") - if self.df is not None: + if (lazy_df := self.get_all_records_as_polars(flush=False)) is not None: if self.tag_keys is None: raise ValueError( "CachedKernelWrapper has no tag keys defined, cannot return PolarsStream" ) return PolarsStream( - self.df, tag_keys=self.tag_keys, packet_keys=self.packet_keys + lazy_df.collect(), + tag_keys=self.tag_keys, + packet_keys=self.packet_keys, ) else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.packet_keys) @@ -235,7 +237,7 @@ def post_call(self, tag: Tag, packet: Packet) -> None: ) # TODO: revisit this logic output_id = self.arrow_hasher.hash_table(output_table, prefix_hasher_id=True) - if not self.output_store.get_record(self.store_path, output_id): + if not self.output_store.get_record(self.store_path, output_id, flush=False): self.output_store.add_record( self.store_path, output_id, @@ -249,6 +251,9 @@ def output_iterator_completion_hook(self) -> None: logger.info(f"Results cached for {self}") self._cache_computed = True + def get_all_records_as_polars(self, flush: bool = True) -> pl.LazyFrame | None: + return self.output_store.get_all_records_as_polars(self.store_path, flush=flush) + @property def lazy_df(self) -> pl.LazyFrame | None: lazydf = self.output_store.get_all_records_as_polars(self.store_path) @@ -542,7 +547,9 @@ def _add_pipeline_record_with_packet_key( # TODO: add error handling # check if record already exists: - retrieved_table = self.tag_store.get_record(self.tag_store_path, entry_hash) + retrieved_table = self.tag_store.get_record( + self.tag_store_path, entry_hash, flush=False + ) if retrieved_table is None: self.tag_store.add_record(self.tag_store_path, entry_hash, table) @@ -565,6 +572,7 @@ def _retrieve_memoized_with_packet_key(self, packet_key: str) -> Packet | None: arrow_table = self.output_store.get_record( self.output_store_path, packet_key, + flush=False, ) if arrow_table is None: return None @@ -626,7 +634,9 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: output_packet = None if not self.skip_memoization_lookup: - output_packet = self._retrieve_memoized_with_packet_key(packet_key) + output_packet = self._retrieve_memoized_with_packet_key( + packet_key, + ) if output_packet is not None: logger.debug( f"Memoized output for {packet} with {packet_key} found, skipping computation" @@ -658,7 +668,7 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: ) return tag, None - # result was successfully computed -- save the tag + # result was successfully computed/retrieved -- save the tag if not self.skip_tag_record and self.tag_store is not None: self._add_pipeline_record_with_packet_key( tag, packet_key, packet.source_info diff --git a/src/orcapod/stores/delta_table_arrow_data_store.py b/src/orcapod/stores/delta_table_arrow_data_store.py index e5ddfb9..56bbbfa 100644 --- a/src/orcapod/stores/delta_table_arrow_data_store.py +++ 
b/src/orcapod/stores/delta_table_arrow_data_store.py @@ -1,14 +1,14 @@ import pyarrow as pa import pyarrow.compute as pc +import pyarrow.dataset as ds import polars as pl from pathlib import Path -from typing import Any, Dict, List +from typing import Any import logging from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError -import threading from collections import defaultdict -import json + # Module-level logger logger = logging.getLogger(__name__) @@ -31,7 +31,6 @@ def __init__( create_base_path: bool = True, max_hierarchy_depth: int = 10, batch_size: int = 100, - auto_flush_interval: float = 300.0, # 5 minutes ): """ Initialize the DeltaTableArrowDataStore. @@ -54,7 +53,6 @@ def __init__( self.base_path = Path(base_path) self.max_hierarchy_depth = max_hierarchy_depth self.batch_size = batch_size - self.auto_flush_interval = auto_flush_interval if create_base_path: self.base_path.mkdir(parents=True, exist_ok=True) @@ -66,56 +64,125 @@ def __init__( # Cache for Delta tables to avoid repeated initialization self._delta_table_cache: dict[str, DeltaTable] = {} - # Cache for original schemas (without __entry_id column) - self._schema_cache: dict[str, pa.Schema] = {} - # Batch management - self._pending_batches: Dict[str, List[pa.Table]] = defaultdict(list) - self._batch_lock = threading.Lock() - - # Auto-flush timer - self._flush_timer = None - # if auto_flush_interval > 0: - # self._start_auto_flush_timer() + self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) logger.info( f"Initialized DeltaTableArrowDataStore at {self.base_path} " f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " - f"batch_size={batch_size}, auto_flush_interval={auto_flush_interval}s" + f"batch_size={batch_size}, as" ) - def _start_auto_flush_timer(self): - """Start the auto-flush timer.""" - if self._flush_timer: - self._flush_timer.cancel() - - if self.auto_flush_interval > 0: - self._flush_timer = threading.Timer( - self.auto_flush_interval, self._auto_flush - ) - self._flush_timer.daemon = True - self._flush_timer.start() + def flush(self) -> None: + """ + Flush all pending batches immediately. - def _auto_flush(self): - """Auto-flush all pending batches.""" + This method is called to ensure all pending data is written to the Delta tables. + """ try: - print("Flushing!", flush=True) self.flush_all_batches() except Exception as e: - logger.error(f"Error during auto-flush: {e}") - finally: - self._start_auto_flush_timer() + logger.error(f"Error during flush: {e}") + + def flush_batch(self, source_path: tuple[str, ...]) -> None: + """ + Flush pending batch for a specific source path. 
+ + Args: + source_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + source_key = self._get_source_key(source_path) + + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = {} + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() + + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_existing_delta_table(source_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + entry_ids = combined_table.column("__entry_id").to_pylist() + unique_entry_ids = list(set(entry_ids)) + + # Delete existing records with these IDs + if unique_entry_ids: + entry_ids_str = "', '".join(unique_entry_ids) + delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # otherwise, only insert if same entry_id does not exist yet + delta_table.merge( + source=combined_table, + predicate="target.__entry_id = source.__entry_id", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[source_key] = pending_tables + raise + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + source_keys = list(self._pending_batches.keys()) + + # TODO: capture and re-raise exceptions at the end + for source_key in source_keys: + source_path = tuple(source_key.split("/")) + try: + self.flush_batch(source_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") def __del__(self): """Cleanup when object is destroyed.""" - try: - if self._flush_timer: - self._flush_timer.cancel() - self.flush_all_batches() - except Exception: - pass # Ignore errors during cleanup + self.flush() def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly """ Validate source path components. 
@@ -154,174 +221,11 @@ def _get_source_key(self, source_path: tuple[str, ...]) -> str: def _get_table_path(self, source_path: tuple[str, ...]) -> Path: """Get the filesystem path for a given source path.""" path = self.base_path - for component in source_path: - path = path / component + for subpath in source_path: + path = path / subpath return path - def _get_schema_metadata_path(self, source_path: tuple[str, ...]) -> Path: - """Get the path for storing original schema metadata.""" - table_path = self._get_table_path(source_path) - return table_path / "_original_schema.json" - - def _save_original_schema( - self, source_path: tuple[str, ...], schema: pa.Schema - ) -> None: - """Save the original schema (without __entry_id) to metadata file.""" - source_key = self._get_source_key(source_path) - - # Cache the schema - self._schema_cache[source_key] = schema - - try: - # Save to file as well for persistence - schema_path = self._get_schema_metadata_path(source_path) - schema_path.parent.mkdir(parents=True, exist_ok=True) - - # Convert schema to JSON-serializable format - def convert_metadata(metadata): - """Convert Arrow metadata (bytes keys/values) to JSON-safe format.""" - if metadata is None: - return None - result = {} - for key, value in metadata.items(): - # Convert bytes keys and values to strings - str_key = ( - key.decode("utf-8") if isinstance(key, bytes) else str(key) - ) - str_value = ( - value.decode("utf-8") - if isinstance(value, bytes) - else str(value) - ) - result[str_key] = str_value - return result - - schema_dict = { - "fields": [ - { - "name": field.name, - "type": str(field.type), - "nullable": field.nullable, - "metadata": convert_metadata(field.metadata), - } - for field in schema - ], - "metadata": convert_metadata(schema.metadata), - } - - with open(schema_path, "w") as f: - json.dump(schema_dict, f, indent=2) - - except Exception as e: - logger.warning(f"Could not save schema metadata for {source_key}: {e}") - - def _load_original_schema(self, source_path: tuple[str, ...]) -> pa.Schema | None: - """Load the original schema from cache or metadata file.""" - source_key = self._get_source_key(source_path) - - # Check cache first - if source_key in self._schema_cache: - return self._schema_cache[source_key] - - # Try to load from file - try: - schema_path = self._get_schema_metadata_path(source_path) - if not schema_path.exists(): - return None - - with open(schema_path, "r") as f: - schema_dict = json.load(f) - - # Reconstruct schema from JSON - def convert_metadata_back(metadata_dict): - """Convert JSON metadata back to Arrow format (bytes keys/values).""" - if metadata_dict is None: - return None - result = {} - for key, value in metadata_dict.items(): - # Convert string keys and values back to bytes - bytes_key = key.encode("utf-8") - bytes_value = ( - value.encode("utf-8") - if isinstance(value, str) - else str(value).encode("utf-8") - ) - result[bytes_key] = bytes_value - return result - - fields = [] - for field_dict in schema_dict["fields"]: - # Parse the type string back to Arrow type - type_str = field_dict["type"] - arrow_type = self._parse_arrow_type_string(type_str) - - metadata = convert_metadata_back(field_dict.get("metadata")) - - field = pa.field( - field_dict["name"], - arrow_type, - nullable=field_dict["nullable"], - metadata=metadata, - ) - fields.append(field) - - schema_metadata = convert_metadata_back(schema_dict.get("metadata")) - - schema = pa.schema(fields, metadata=schema_metadata) - - # Cache it - self._schema_cache[source_key] = schema - 
return schema - - except Exception as e: - logger.warning(f"Could not load schema metadata for {source_key}: {e}") - return None - - def _parse_arrow_type_string(self, type_str: str) -> pa.DataType: - """Parse Arrow type string back to Arrow type object.""" - # This is a simplified parser for common types - # You might need to extend this for more complex types - type_str = type_str.strip() - - # Handle basic types - if type_str == "int64": - return pa.int64() - elif type_str == "int32": - return pa.int32() - elif type_str == "float64": - return pa.float64() - elif type_str == "float32": - return pa.float32() - elif type_str == "bool": - return pa.bool_() - elif type_str == "string": - return pa.string() - elif type_str == "large_string": - return pa.large_string() - elif type_str == "binary": - return pa.binary() - elif type_str == "large_binary": - return pa.large_binary() - elif type_str.startswith("timestamp"): - # Extract timezone if present - if "[" in type_str and "]" in type_str: - tz = type_str.split("[")[1].split("]")[0] - if tz == "UTC": - tz = "UTC" - return pa.timestamp("us", tz=tz) - else: - return pa.timestamp("us") - elif type_str.startswith("list<"): - # Parse list type - inner_type_str = type_str[5:-1] # Remove 'list<' and '>' - inner_type = self._parse_arrow_type_string(inner_type_str) - return pa.list_(inner_type) - else: - # Fallback to string for unknown types - logger.warning(f"Unknown Arrow type string: {type_str}, using string") - return pa.string() - - def _get_or_create_delta_table( + def _get_existing_delta_table( self, source_path: tuple[str, ...] ) -> DeltaTable | None: """ @@ -337,8 +241,8 @@ def _get_or_create_delta_table( table_path = self._get_table_path(source_path) # Check cache first - if source_key in self._delta_table_cache: - return self._delta_table_cache[source_key] + if dt := self._delta_table_cache.get(source_key): + return dt try: # Try to load existing table @@ -426,164 +330,57 @@ def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: """ return [("__entry_id", "in", entry_ids)] - def _read_table_with_schema_preservation( + def _read_table_with_filter( self, delta_table: DeltaTable, - source_path: tuple[str, ...], - filters: list = None, + filters: list | None = None, ) -> pa.Table: """ Read table using to_pyarrow_dataset with original schema preservation. 
Args: delta_table: The Delta table to read from - source_path: Source path for schema lookup filters: Optional filters to apply Returns: Arrow table with preserved schema """ - try: - # Get the original schema (without __entry_id) - original_schema = self._load_original_schema(source_path) - - if original_schema is not None: - # Create target schema with __entry_id column - entry_id_field = pa.field( - "__entry_id", pa.large_string(), nullable=False - ) - target_schema = pa.schema([entry_id_field] + list(original_schema)) - - # Use to_pyarrow_dataset with the target schema - dataset = delta_table.to_pyarrow_dataset(schema=target_schema) - if filters: - # Apply filters at dataset level for better performance - import pyarrow.compute as pc - - filter_expr = None - for filt in filters: - if len(filt) == 3: - col, op, val = filt - if op == "=": - expr = pc.equal(pc.field(col), pa.scalar(val)) - elif op == "in": - expr = pc.is_in(pc.field(col), pa.array(val)) - else: - # Fallback to table-level filtering - return delta_table.to_pyarrow_table(filters=filters) - - if filter_expr is None: - filter_expr = expr - else: - filter_expr = pc.and_(filter_expr, expr) - - if filter_expr is not None: - return dataset.to_table(filter=filter_expr) - - return dataset.to_table() - else: - # Fallback to regular method if no schema found - logger.warning( - f"No original schema found for {'/'.join(source_path)}, using fallback" - ) - return delta_table.to_pyarrow_table(filters=filters) - - except Exception as e: - logger.warning( - f"Error reading with schema preservation: {e}, falling back to regular method" - ) - return delta_table.to_pyarrow_table(filters=filters) - - def _flush_batch(self, source_path: tuple[str, ...]) -> None: - """ - Flush pending batch for a specific source path. 
- - Args: - source_path: Tuple of path components - """ - print("Flushing triggered!!", flush=True) - source_key = self._get_source_key(source_path) - - with self._batch_lock: - if ( - source_key not in self._pending_batches - or not self._pending_batches[source_key] - ): - return - - # Get all pending records - pending_tables = self._pending_batches[source_key] - self._pending_batches[source_key] = [] - - if not pending_tables: - return - - try: - # Combine all tables in the batch - combined_table = pa.concat_tables(pending_tables) - - table_path = self._get_table_path(source_path) - table_path.mkdir(parents=True, exist_ok=True) - - # Check if table exists - delta_table = self._get_or_create_delta_table(source_path) - - if delta_table is None: - # Create new table - save original schema first - original_schema = self._remove_entry_id_column(combined_table).schema - self._save_original_schema(source_path, original_schema) - - write_deltalake(str(table_path), combined_table, mode="overwrite") - logger.debug( - f"Created new Delta table for {source_key} with {len(combined_table)} records" - ) - else: - # Handle duplicates if needed - if self.duplicate_entry_behavior == "overwrite": - # Get entry IDs from the batch - entry_ids = combined_table.column("__entry_id").to_pylist() - unique_entry_ids = list(set(entry_ids)) - - # Delete existing records with these IDs - if unique_entry_ids: - entry_ids_str = "', '".join(unique_entry_ids) - delete_predicate = f"__entry_id IN ('{entry_ids_str}')" - try: - delta_table.delete(delete_predicate) - logger.debug( - f"Deleted {len(unique_entry_ids)} existing records from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing records to delete from {source_key}: {e}" - ) + # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading + dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + if filters: + # Apply filters at dataset level for better performance + import pyarrow.compute as pc + + filter_expr = None + for filt in filters: + if len(filt) == 3: + col, op, val = filt + if op == "=": + expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore + elif op == "in": + expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore + else: + logger.warning( + f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." 
+ ) + # Fallback to table-level filtering + return dataset.to_table()(filters=filters) - # Append new records - write_deltalake( - str(table_path), combined_table, mode="append", schema_mode="merge" - ) - logger.debug( - f"Appended batch of {len(combined_table)} records to {source_key}" - ) + if filter_expr is None: + filter_expr = expr + else: + filter_expr = pc.and_(filter_expr, expr) # type: ignore - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - # Put the tables back in the pending queue - with self._batch_lock: - self._pending_batches[source_key] = ( - pending_tables + self._pending_batches[source_key] - ) - raise + return dataset.to_table() def add_record( self, source_path: tuple[str, ...], entry_id: str, arrow_data: pa.Table, - ignore_duplicate: bool = False, force_flush: bool = False, ) -> pa.Table: """ @@ -605,24 +402,22 @@ def add_record( self._validate_source_path(source_path) source_key = self._get_source_key(source_path) - # Check for existing entry if needed (only for immediate duplicates, not batch) - if ( - not ignore_duplicate - and self.duplicate_entry_behavior == "error" - and not force_flush - ): + # Check for existing entry + if self.duplicate_entry_behavior == "error": # Only check existing table, not pending batch for performance - existing_record = self.get_record(source_path, entry_id) + pending_table = self._pending_batches[source_key].get(entry_id, None) + if pending_table is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in pending batch for {source_key}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + existing_record = self.get_record(source_path, entry_id, flush=False) if existing_record is not None: raise ValueError( f"Entry '{entry_id}' already exists in {'/'.join(source_path)}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." 
) - # Save original schema if this is the first record for this source - if source_key not in self._schema_cache: - self._save_original_schema(source_path, arrow_data.schema) - # Add entry_id column to the data data_with_entry_id = self._ensure_entry_id_column(arrow_data, entry_id) @@ -631,11 +426,10 @@ def add_record( table_path = self._get_table_path(source_path) table_path.mkdir(parents=True, exist_ok=True) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: # Create new table - save original schema first - self._save_original_schema(source_path, arrow_data.schema) write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") logger.debug(f"Created new Delta table for {source_key}") else: @@ -653,7 +447,7 @@ def add_record( ) write_deltalake( - str(table_path), + table_path, data_with_entry_id, mode="append", schema_mode="merge", @@ -662,55 +456,32 @@ def add_record( # Update cache self._delta_table_cache[source_key] = DeltaTable(str(table_path)) else: - # Add to batch - with self._batch_lock: - self._pending_batches[source_key].append(data_with_entry_id) - batch_size = len(self._pending_batches[source_key]) + # Add to the batch for later flushing + self._pending_batches[source_key][entry_id] = data_with_entry_id + batch_size = len(self._pending_batches[source_key]) # Check if we need to flush if batch_size >= self.batch_size: - self._flush_batch(source_path) + self.flush_batch(source_path) logger.debug(f"Added record {entry_id} to {source_key}") return arrow_data - def flush_batch(self, source_path: tuple[str, ...]) -> None: - """ - Manually flush pending batch for a specific source path. - - Args: - source_path: Tuple of path components - """ - self._flush_batch(source_path) - - def flush_all_batches(self) -> None: - """Flush all pending batches.""" - with self._batch_lock: - source_keys = list(self._pending_batches.keys()) - - for source_key in source_keys: - source_path = tuple(source_key.split("/")) - try: - self._flush_batch(source_path) - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - - def get_pending_batch_info(self) -> Dict[str, int]: + def get_pending_batch_info(self) -> dict[str, int]: """ Get information about pending batches. Returns: Dictionary mapping source keys to number of pending records """ - with self._batch_lock: - return { - source_key: len(tables) - for source_key, tables in self._pending_batches.items() - if tables - } + return { + source_key: len(tables) + for source_key, tables in self._pending_batches.items() + if tables + } def get_record( - self, source_path: tuple[str, ...], entry_id: str + self, source_path: tuple[str, ...], entry_id: str, flush: bool = False ) -> pa.Table | None: """ Get a specific record by entry_id with schema preservation. 
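Taken together, the batched write path introduced in this patch behaves roughly as in the sketch below. Everything concrete in it (base path, source path, entry id, schema) is invented for illustration; only the class name, method names, and signatures come from the diff above.

import pyarrow as pa

from orcapod.stores.delta_table_arrow_data_store import DeltaTableArrowDataStore

# Illustrative values; batch_size and the base path are assumptions.
store = DeltaTableArrowDataStore("/tmp/orcapod-store", batch_size=100)
source_path = ("my_pipeline", "total")
record = pa.table({"x": [1], "y": [2], "total": [3]})

# Records accumulate in the per-source in-memory batch until batch_size is
# reached, force_flush=True is passed, or flush_batch()/flush_all_batches()
# is called explicitly.
store.add_record(source_path, entry_id="abc123", arrow_data=record)

# Lookups consult the pending batch first, so an unflushed record is still
# visible without touching the Delta table; flush=True would persist it first.
assert store.get_record(source_path, "abc123", flush=False) is not None

store.flush_batch(source_path)  # write this source's pending records to Delta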
@@ -720,20 +491,26 @@ def get_record( entry_id: Unique identifier for the record Returns: - Arrow table for the record with original schema, or None if not found + Arrow table for the record or None if not found """ + if flush: + self.flush_batch(source_path) self._validate_source_path(source_path) - delta_table = self._get_or_create_delta_table(source_path) + # check if entry_id is found in pending batches + source_key = self._get_source_key(source_path) + if entry_id in self._pending_batches[source_key]: + # Return the pending record directly + return self._pending_batches[source_key][entry_id] + + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return None try: # Use schema-preserving read filter_expr = self._create_entry_id_filter(entry_id) - result = self._read_table_with_schema_preservation( - delta_table, source_path, filters=filter_expr - ) + result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None @@ -748,7 +525,11 @@ def get_record( raise e def get_all_records( - self, source_path: tuple[str, ...], add_entry_id_column: bool | str = False + self, + source_path: tuple[str, ...], + add_entry_id_column: bool | str = False, + retrieve_pending: bool = True, + flush: bool = False, ) -> pa.Table | None: """ Retrieve all records for a given source path as a single table with schema preservation. @@ -763,28 +544,43 @@ def get_all_records( Returns: Arrow table containing all records with original schema, or None if no records found """ + if flush: + self.flush_batch(source_path) self._validate_source_path(source_path) - delta_table = self._get_or_create_delta_table(source_path) - if delta_table is None: - return None + collected_arrays = [] + if retrieve_pending: + # Check if there are pending records in the batch + for entry_id, arrow_table in self._pending_batches[ + self._get_source_key(source_path) + ].items(): + collected_arrays.append( + self._ensure_entry_id_column(arrow_table, entry_id) + ) - try: - # Use schema-preserving read - result = self._read_table_with_schema_preservation(delta_table, source_path) + delta_table = self._get_existing_delta_table(source_path) + if delta_table is not None: + try: + # Use filter-based read + result = self._read_table_with_filter(delta_table) - if len(result) == 0: - return None + if len(result) != 0: + collected_arrays.append(result) + + except Exception as e: + logger.error( + f"Error getting all records from {'/'.join(source_path)}: {e}" + ) + if collected_arrays: + total_table = pa.Table.concatenate(collected_arrays) # Handle entry_id column based on parameter - return self._handle_entry_id_column(result, add_entry_id_column) + return self._handle_entry_id_column(total_table, add_entry_id_column) - except Exception as e: - logger.error(f"Error getting all records from {'/'.join(source_path)}: {e}") - return None + return None def get_all_records_as_polars( - self, source_path: tuple[str, ...] + self, source_path: tuple[str, ...], flush: bool = True ) -> pl.LazyFrame | None: """ Retrieve all records for a given source path as a single Polars LazyFrame. 
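The Polars accessor builds directly on this read path. Continuing the same illustrative store from the sketch above, a caller such as the cached kernel wrapper in the nodes.py hunk might use it roughly like this (source path and printout are invented):

import polars as pl

# flush=True (the default shown above) writes any pending batch first, so the
# LazyFrame covers both persisted and just-added records; the cached node
# wrapper opts out with flush=False instead.
lazy_df = store.get_all_records_as_polars(("my_pipeline", "total"), flush=True)

if lazy_df is not None:
    df = lazy_df.collect()  # materialize, much as CachedKernelWrapper.forward does
    print(f"{df.height} cached records")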
@@ -795,7 +591,7 @@ def get_all_records_as_polars( Returns: Polars LazyFrame containing all records, or None if no records found """ - all_records = self.get_all_records(source_path) + all_records = self.get_all_records(source_path, flush=flush) if all_records is None: return None return pl.LazyFrame(all_records) @@ -806,6 +602,7 @@ def get_records_by_ids( entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, + flush: bool = False, ) -> pa.Table | None: """ Retrieve records by entry IDs as a single table with schema preservation. @@ -819,6 +616,9 @@ def get_records_by_ids( Returns: Arrow table containing all found records with original schema, or None if no records found """ + if flush: + self.flush_batch(source_path) + self._validate_source_path(source_path) # Convert input to list of strings for consistency @@ -839,16 +639,14 @@ def get_records_by_ids( f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" ) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return None try: # Use schema-preserving read with filters filter_expr = self._create_entry_ids_filter(entry_ids_list) - result = self._read_table_with_schema_preservation( - delta_table, source_path, filters=filter_expr - ) + result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None @@ -881,6 +679,7 @@ def get_records_by_ids_as_polars( entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, + flush: bool = False, ) -> pl.LazyFrame | None: """ Retrieve records by entry IDs as a single Polars LazyFrame. 
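Since get_records_by_ids accepts a plain list, a Polars Series, or an Arrow array of entry ids, a lookup in the same illustrative setup could look like the following; the ids are invented and only the parameter names come from the signature above.

import polars as pl

# preserve_input_order returns rows in the order of the supplied ids rather
# than in storage order; add_entry_id_column is assumed here to surface the
# store's __entry_id column in the result.
ids = pl.Series(["abc123", "def456"])
table = store.get_records_by_ids(
    ("my_pipeline", "total"),
    ids,
    add_entry_id_column=True,
    preserve_input_order=True,
    flush=True,
)
if table is not None:
    print(table.num_rows, "rows returned")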
@@ -895,7 +694,11 @@ def get_records_by_ids_as_polars( Polars LazyFrame containing all found records, or None if no records found """ arrow_result = self.get_records_by_ids( - source_path, entry_ids, add_entry_id_column, preserve_input_order + source_path, + entry_ids, + add_entry_id_column, + preserve_input_order, + flush=flush, ) if arrow_result is None: @@ -947,7 +750,7 @@ def delete_source(self, source_path: tuple[str, ...]) -> bool: self._validate_source_path(source_path) # Flush any pending batches first - self._flush_batch(source_path) + self.flush_batch(source_path) table_path = self._get_table_path(source_path) source_key = self._get_source_key(source_path) @@ -990,16 +793,14 @@ def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: # Flush any pending batches first self._flush_batch(source_path) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return False try: # Check if record exists using proper filter filter_expr = self._create_entry_id_filter(entry_id) - existing = self._read_table_with_schema_preservation( - delta_table, source_path, filters=filter_expr - ) + existing = self._read_table_with_filter(delta_table, filters=filter_expr) if len(existing) == 0: return False @@ -1033,7 +834,7 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: """ self._validate_source_path(source_path) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return None @@ -1047,14 +848,10 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: pending_info = self.get_pending_batch_info() pending_count = pending_info.get(source_key, 0) - # Get original schema info - original_schema = self._load_original_schema(source_path) - return { "path": str(self._get_table_path(source_path)), "source_path": source_path, "schema": schema, - "original_schema": original_schema, "version": delta_table.version(), "num_files": len(delta_table.files()), "history_length": len(history), @@ -1065,16 +862,3 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: except Exception as e: logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") return None - - def get_original_schema(self, source_path: tuple[str, ...]) -> pa.Schema | None: - """ - Get the original schema (without __entry_id column) for a source path. - - Args: - source_path: Tuple of path components - - Returns: - Original Arrow schema or None if not found - """ - self._validate_source_path(source_path) - return self._load_original_schema(source_path) From 523291f782dfa5e13fc2ff82312a0bec357ee317 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 10 Jul 2025 23:42:59 +0000 Subject: [PATCH 059/224] feat: add protocols --- src/orcapod/protocols/__init__.py | 0 src/orcapod/protocols/data_protocols.py | 247 ++++++++++++++++++++ src/orcapod/protocols/hashing_protocols.py | 139 +++++++++++ src/orcapod/protocols/semantic_protocols.py | 38 +++ src/orcapod/protocols/store_protocols.py | 0 src/orcapod/protocols/types.py | 51 ++++ 6 files changed, 475 insertions(+) create mode 100644 src/orcapod/protocols/__init__.py create mode 100644 src/orcapod/protocols/data_protocols.py create mode 100644 src/orcapod/protocols/hashing_protocols.py create mode 100644 src/orcapod/protocols/semantic_protocols.py create mode 100644 src/orcapod/protocols/store_protocols.py create mode 100644 src/orcapod/protocols/types.py diff --git a/src/orcapod/protocols/__init__.py b/src/orcapod/protocols/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py new file mode 100644 index 0000000..677aab6 --- /dev/null +++ b/src/orcapod/protocols/data_protocols.py @@ -0,0 +1,247 @@ +from typing import Protocol +from orcapod.types import DataValue, TypeSpec +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from collections.abc import Iterator, Collection +import pyarrow as pa +from datetime import datetime + + +class Datagram(Protocol): + @property + def typespec(self) -> TypeSpec: ... + + def keys(self) -> Collection[str]: ... + + def as_table(self) -> pa.Table: ... + + def as_dict(self) -> dict[str, DataValue]: ... + + +class Tag(Datagram, Protocol): ... + + +class Packet(Datagram, Protocol): + def as_table(self, include_source: bool = False) -> pa.Table: + """ + Convert the packet to a PyArrow Table. + If include_source is True, the source information is included in the table. + """ + ... + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Convert the packet to a dictionary. + If include_source is True, the source information is included in the dictionary. + """ + ... + + def content_hash(self) -> str: ... + + def source_info(self) -> dict[str, str | None]: ... + + # def join(self, other: "Packet") -> "Packet": ... + + # def get_as(self, packet_type: PacketType) -> PacketType: ... + + +class PodFunction(Protocol): + """ + A function suitable to be used in a FunctionPod. + It takes one or more named arguments, each corresponding to either: + - A path to a file or directory (PathSet) - for backward compatibility + - A simple data value (str, int, float, bool, bytes, Path) + and returns either None, a single value, or a list of values + """ + + def __call__(self, **kwargs: DataValue) -> None | DataValue: ... + + +class Labelable(Protocol): + """ + A protocol for objects that can have a label. + This is used to provide a human-readable name for the object. + """ + + @property + def label(self) -> str | None: + """ + Return the label of the object. + If no label is set, return None. + """ + ... + + +class Kernel(ContentIdentifiable, Labelable, Protocol): + """ + Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. + It is the base class for all computations and transformations that can be performed on a collection of streams + (including an empty collection). + A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input + and returns a new stream as output (note that output stream is always singular). 
+ Each "invocation" of the kernel on a collection of streams is assigned a unique ID. + The corresponding invocation information is stored as Invocation object and attached to the output stream + for computational graph tracking. + """ + + def __call__( + self, *streams: "Stream", label: str | None = None, **kwargs + ) -> "Stream": + """ + This is the main interface for invoking the kernel and perform any side-effects such as registering the invocation with the computational graph. + This method should be called with a collection of streams, which can be empty, and is expected to trigger + the call to the forward method of the kernel. + """ + ... + + def forward(self, *streams: "Stream") -> "Stream": + """ + Trigger the main computation of the kernel on a collection of streams. + This method is called when the kernel is invoked with a collection of streams. + Subclasses should override this method to provide the kernel with its unique behavior. + The method should return a new stream that represents the output of the kernel, but should not register the invocation + with the computational graph, allowing for the computation to be performed without side effects. + """ + ... + + def types(self, *streams: "Stream") -> tuple[TypeSpec, TypeSpec]: ... + + def validate_inputs(self, *streams: "Stream") -> None: ... + + +class Pod(Kernel, Protocol): + @property + def input_typespec(self) -> TypeSpec: ... + + @property + def output_typespec(self) -> TypeSpec: ... + + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + """ + Call the function pod with a single input packet. + This is used to invoke the function pod with a single packet. + """ + ... + + +class Stream(ContentIdentifiable, Labelable, Protocol): + """ + A stream that is generated by an invocation of a kernel. + This stream is used to represent the output of a kernel invocation. + It is a concrete implementation of the SyncStream that has an associated + invocation that generated the stream. + """ + + @property + def source(self) -> Kernel | None: ... + + @property + def upstreams(self) -> tuple["Stream", ...]: ... + + @property + def last_modified(self) -> datetime | None: + """ + Returns when the stream's content was last modified. + + Returns: + datetime: Timestamp of last modification (cacheable streams) + None: Content is never stable - always recompute + (async streams, dynamic streams, etc.) + """ + ... + + @property + def is_current(self) -> bool: + """ + Returns whether the stream is current. + A stream is current if the content is up-to-date with respect to its source. + This can be used to determine if a stream with non-None last_modified is up-to-date. + Note that for asynchronous streams, this status is not applicable and always returns False. + """ + ... + + def as_table(self) -> pa.Table: + """ + Convert the stream to a PyArrow Table. + To avoid collision, tags should be prefixed with "_tag_". + """ + ... + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: ... + + def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: ... + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Return the keys of the pipeline property. + This is used to define the keys of the pipeline property. + """ + ... + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Return the types of the pipeline property. + This is used to define the types of the graph property. + """ + ... + + +class Source(Kernel, Stream, Protocol): + """ + A source is a special type of kernel that produces a stream of data. 
+ It is the entry point for data into the computational graph. + Sources are typically used to read data from external sources such as files, databases, etc. + """ + + +class Tracker(Protocol): + def set_active(self, active: bool = True) -> None: + """ + Set the active state of the tracker. + This is used to activate or deactivate the tracker. + If the tracker is active, it will record the invocations of kernels. + """ + ... + + def is_active(self) -> bool: + """ + Check if the tracker is active. + This is used to determine if the tracker is currently recording invocations. + """ + ... + + def record(self, stream: Stream) -> None: + """ + Record the output stream of a kernel invocation in the tracker. + This is used to track the computational graph and the invocations of kernels. + """ + ... + + +class TrackerManager(Protocol): + def get_active_trackers(self) -> list[Tracker]: + """ + Get the list of active trackers. + This is used to retrieve the currently active trackers in the system. + """ + ... + + def register_tracker(self, tracker: Tracker) -> None: + """ + Register a new tracker in the system. + This is used to add a new tracker to the list of active trackers. + """ + ... + + def deregister_tracker(self, tracker: Tracker) -> None: + """ + Deregister a tracker from the system. + This is used to remove a tracker from the list of active trackers. + """ + ... + + def record(self, stream: Stream) -> None: + """ + Record the output stream of a kernel invocation in the tracker. + This is used to track the computational graph and the invocations of kernels. + """ + ... diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py new file mode 100644 index 0000000..7c14e2e --- /dev/null +++ b/src/orcapod/protocols/hashing_protocols.py @@ -0,0 +1,139 @@ +"""Hash strategy protocols for dependency injection.""" + +from collections.abc import Callable +from typing import Any, Protocol, runtime_checkable +import uuid + +from orcapod.types import TypeSpec, PathLike +import pyarrow as pa + + +@runtime_checkable +class ContentIdentifiable(Protocol): + """Protocol for objects that can provide an identity structure.""" + + def identity_structure(self) -> Any: + """ + Return a structure that represents the identity of this object. + + Returns: + Any: A structure representing this object's content. + Should be deterministic and include all identity-relevant data. + Return None to indicate no custom identity is available. + """ + ... + + def __eq__(self, other: object) -> bool: + """ + Equality check that compares the identity structures of two objects. + + Args: + other (object): The object to compare with. + + Returns: + bool: True if the identity structures are equal, False otherwise. + """ + ... + + def __hash__(self) -> int: + """ + Hash implementation that uses the identity structure if provided, + otherwise falls back to the default hash. + + Returns: + int: A hash value based on either content or identity. + """ + ... + + +class ObjectHasher(Protocol): + """Protocol for general object hashing.""" + + # TODO: consider more explicitly stating types of objects accepted + def hash(self, obj: Any) -> bytes: + """ + Hash an object to a byte representation. + + Args: + obj (Any): The object to hash. + + Returns: + bytes: The byte representation of the hash. + """ + ... + + def get_hasher_id(self) -> str: + """ + Returns a unique identifier/name assigned to the hasher + """ + ... 
+ + def hash_to_hex( + self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False + ) -> str: ... + + def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: + """ + Hash an object to an integer. + + Args: + obj (Any): The object to hash. + hexdigits (int): Number of hexadecimal digits to use for the hash. + + Returns: + int: The integer representation of the hash. + """ + ... + + def hash_to_uuid( + self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID + ) -> uuid.UUID: ... + + +class FileContentHasher(Protocol): + """Protocol for file-related hashing.""" + + def hash_file(self, file_path: PathLike) -> bytes: ... + + +class ArrowHasher(Protocol): + """Protocol for hashing arrow packets.""" + + def get_hasher_id(self) -> str: ... + + def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... + + +class StringCacher(Protocol): + """Protocol for caching string key value pairs.""" + + def get_cached(self, cache_key: str) -> str | None: ... + def set_cached(self, cache_key: str, value: str) -> None: ... + def clear_cache(self) -> None: ... + + +class FunctionInfoExtractor(Protocol): + """Protocol for extracting function information.""" + + def extract_function_info( + self, + func: Callable[..., Any], + function_name: str | None = None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | None = None, + ) -> dict[str, Any]: ... + + +class SemanticTypeHasher(Protocol): + """Abstract base class for semantic type-specific hashers.""" + + def hash_column( + self, + column: pa.Array, + ) -> pa.Array: + """Hash a column with this semantic type and return the hash bytes.""" + ... + + def set_cacher(self, cacher: StringCacher) -> None: + """Add a string cacher for caching hash values.""" + ... diff --git a/src/orcapod/protocols/semantic_protocols.py b/src/orcapod/protocols/semantic_protocols.py new file mode 100644 index 0000000..5458cad --- /dev/null +++ b/src/orcapod/protocols/semantic_protocols.py @@ -0,0 +1,38 @@ +from typing import Protocol, Any + + +class TypeHandler(Protocol): + """Protocol for handling conversion between Python type and Arrow + data types used for storage. + + The handler itself IS the definition of a semantic type. The semantic type + name/identifier is provided by the registerer when registering the handler. + + TypeHandlers should clearly communicate what Python types they can handle, + and focus purely on conversion logic. + """ + + def python_type(self) -> type: + """Return the Python type(s) this handler can process. + + Returns: + Python type the handler supports + + Examples: + - PathHandler: return Path + - NumericHandler: return (int, float) + - CollectionHandler: return (list, tuple, set) + """ + ... + + def storage_type(self) -> type: + """Return the Arrow DataType instance for schema definition.""" + ... + + def python_to_storage(self, value: Any) -> Any: + """Convert Python value to Arrow-compatible storage representation.""" + ... + + def storage_to_python(self, value: Any) -> Any: + """Convert storage representation back to Python object.""" + ... 
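For concreteness, a handler satisfying this protocol can be quite small. The PathHandler below is illustrative and not part of the patch; it assumes a plain string storage representation.

from pathlib import Path
from typing import Any


class PathHandler:
    """Illustrative TypeHandler storing pathlib.Path values as strings."""

    def python_type(self) -> type:
        return Path

    def storage_type(self) -> type:
        # Kept as a plain Python type to match the protocol's annotation,
        # even though the docstring speaks of an Arrow DataType.
        return str

    def python_to_storage(self, value: Any) -> Any:
        return str(value)

    def storage_to_python(self, value: Any) -> Any:
        return Path(value)

Because TypeHandler is a structural Protocol, no inheritance is needed: any object exposing these four methods can be registered under a semantic type name by the registerer, as the docstring above describes.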
diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/protocols/types.py b/src/orcapod/protocols/types.py new file mode 100644 index 0000000..73e67f1 --- /dev/null +++ b/src/orcapod/protocols/types.py @@ -0,0 +1,51 @@ +# from typing import TypeAlias +# from collections.abc import Collection, Mapping +# from pathlib import Path +# import logging +# import os + +# logger = logging.getLogger(__name__) + + +# # class TypeSpec(dict[str, DataType]): +# # def __init__(self, *args, **kwargs): +# # """ +# # TypeSpec is a mapping of parameter names to their types. +# # It can be used to define the expected types of parameters in a function or a pod. +# # """ +# # super().__init__(*args, **kwargs) + + +# # Convenience alias for anything pathlike +# PathLike: TypeAlias = str | os.PathLike + +# # an (optional) string or a collection of (optional) string values +# # Note that TagValue can be nested, allowing for an arbitrary depth of nested lists +# TagValue: TypeAlias = int | str | None | Collection["TagValue"] + +# # the top level tag is a mapping from string keys to values that can be a string or +# # an arbitrary depth of nested list of strings or None +# Tag: TypeAlias = Mapping[str, TagValue] + +# # a pathset is a path or an arbitrary depth of nested list of paths +# PathSet: TypeAlias = PathLike | Collection[PathLike | None] + +# # Simple data types that we support (with clear Polars correspondence) +# SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes + +# ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike + +# TypeSpec = dict[str, type] # Mapping of parameter names to their types + +# # Extended data values that can be stored in packets +# # Either the original PathSet or one of our supported simple data types +# DataValue: TypeAlias = ( +# PathSet +# | SupportedNativePythonData +# | None +# | Collection["DataValue"] +# | Mapping[str, "DataValue"] +# ) + + +# PacketLike = Mapping[str, DataValue] From 83118ab6834c70b8b7c1a907143308b0f4a34394 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 10 Jul 2025 23:48:42 +0000 Subject: [PATCH 060/224] refactor: use protocols in hashing package --- src/orcapod/hashing/__init__.py | 12 +- src/orcapod/hashing/arrow_hashers.py | 2 +- src/orcapod/hashing/defaults.py | 47 ++- src/orcapod/hashing/file_hashers.py | 345 +++++++++--------- .../hashing/function_info_extractors.py | 2 +- src/orcapod/hashing/hash_utils.py | 7 +- src/orcapod/hashing/legacy_core.py | 23 +- src/orcapod/hashing/object_hashers.py | 54 ++- src/orcapod/hashing/semantic_type_hashers.py | 6 +- src/orcapod/hashing/string_cachers.py | 2 +- src/orcapod/hashing/versioned_hashers.py | 2 +- src/orcapod/types/core.py | 45 ++- src/orcapod/types/packet_converter.py | 182 --------- 13 files changed, 297 insertions(+), 432 deletions(-) delete mode 100644 src/orcapod/types/packet_converter.py diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index b1e5849..eb94afe 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -2,21 +2,11 @@ get_default_object_hasher, get_default_arrow_hasher, ) -from .types import ( - FileContentHasher, - LegacyPacketHasher, - ArrowHasher, - ObjectHasher, - StringCacher, - FunctionInfoExtractor, - LegacyCompositeFileHasher, -) -from .content_identifiable import ContentIdentifiableBase + __all__ = [ "FileContentHasher", "LegacyPacketHasher", - "ArrowHasher", "StringCacher", "ObjectHasher", "LegacyCompositeFileHasher", diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 465b29b..2b66b52 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -3,7 +3,7 @@ import pyarrow as pa import polars as pl import json -from orcapod.hashing.types import SemanticTypeHasher, StringCacher +from orcapod.protocols.hashing_protocols import SemanticTypeHasher, StringCacher from orcapod.hashing import arrow_serialization from collections.abc import Callable diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 3bae548..c9e404b 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -1,13 +1,8 @@ # A collection of utility function that provides a "default" implementation of hashers. # This is often used as the fallback hasher in the library code. -from orcapod.hashing.types import ( - LegacyCompositeFileHasher, - ArrowHasher, - StringCacher, -) -from orcapod.hashing.file_hashers import LegacyPathLikeHasherFactory +from orcapod.protocols import hashing_protocols as hp + from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.hashing.object_hashers import ObjectHasher from orcapod.hashing.object_hashers import LegacyObjectHasher from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory from orcapod.hashing.versioned_hashers import ( @@ -17,8 +12,8 @@ def get_default_arrow_hasher( - cache_file_hash: bool | StringCacher = True, -) -> ArrowHasher: + cache_file_hash: bool | hp.StringCacher = True, +) -> hp.ArrowHasher: """ Get the default Arrow hasher with semantic type support. If `cache_file_hash` is True, it uses an in-memory cacher for caching hash values. If a `StringCacher` is provided, it uses that for caching file hashes. 
@@ -26,22 +21,22 @@ def get_default_arrow_hasher( arrow_hasher = get_versioned_semantic_arrow_hasher() if cache_file_hash: # use unlimited caching - if isinstance(cache_file_hash, StringCacher): - string_cacher = cache_file_hash - else: + if cache_file_hash is True: string_cacher = InMemoryCacher(max_size=None) + else: + string_cacher = cache_file_hash arrow_hasher.set_cacher("path", string_cacher) return arrow_hasher -def get_default_object_hasher() -> ObjectHasher: +def get_default_object_hasher() -> hp.ObjectHasher: object_hasher = get_versioned_object_hasher() return object_hasher -def get_legacy_object_hasher() -> ObjectHasher: +def get_legacy_object_hasher() -> hp.ObjectHasher: function_info_extractor = ( FunctionInfoExtractorFactory.create_function_info_extractor( strategy="signature" @@ -50,17 +45,17 @@ def get_legacy_object_hasher() -> ObjectHasher: return LegacyObjectHasher(function_info_extractor=function_info_extractor) -def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHasher: - if with_cache: - # use unlimited caching - string_cacher = InMemoryCacher(max_size=None) - return LegacyPathLikeHasherFactory.create_cached_legacy_composite(string_cacher) - return LegacyPathLikeHasherFactory.create_basic_legacy_composite() +# def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHasher: +# if with_cache: +# # use unlimited caching +# string_cacher = InMemoryCacher(max_size=None) +# return LegacyPathLikeHasherFactory.create_cached_legacy_composite(string_cacher) +# return LegacyPathLikeHasherFactory.create_basic_legacy_composite() -def get_default_composite_file_hasher_with_cacher( - cacher=None, -) -> LegacyCompositeFileHasher: - if cacher is None: - cacher = InMemoryCacher(max_size=None) - return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) +# def get_default_composite_file_hasher_with_cacher( +# cacher=None, +# ) -> LegacyCompositeFileHasher: +# if cacher is None: +# cacher = InMemoryCacher(max_size=None) +# return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index f0ca8d1..d5fc761 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,13 +1,10 @@ from orcapod.hashing import legacy_core from orcapod.hashing.hash_utils import hash_file -from orcapod.hashing.types import ( +from orcapod.protocols.hashing_protocols import ( FileContentHasher, StringCacher, - LegacyFileHasher, - LegacyPathSetHasher, - LegacyCompositeFileHasher, ) -from orcapod.types import PacketLike, PathLike, PathSet +from orcapod.types import PathLike, PathSet, PacketLike class BasicFileHasher: @@ -52,172 +49,172 @@ def hash_file(self, file_path: PathLike) -> bytes: # ----------------Legacy implementations for backward compatibility----------------- -class LegacyDefaultFileHasher: - def __init__( - self, - algorithm: str = "sha256", - buffer_size: int = 65536, - ): - self.algorithm = algorithm - self.buffer_size = buffer_size - - def hash_file(self, file_path: PathLike) -> str: - return legacy_core.hash_file( - file_path, algorithm=self.algorithm, buffer_size=self.buffer_size - ) - - -class LegacyCachedFileHasher: - """File hasher with caching.""" - - def __init__( - self, - file_hasher: LegacyFileHasher, - string_cacher: StringCacher, - ): - self.file_hasher = file_hasher - self.string_cacher = string_cacher - - def hash_file(self, file_path: PathLike) -> str: - cache_key = f"file:{file_path}" - 
cached_value = self.string_cacher.get_cached(cache_key) - if cached_value is not None: - return cached_value - - value = self.file_hasher.hash_file(file_path) - self.string_cacher.set_cached(cache_key, value) - return value - - -class LegacyDefaultPathsetHasher: - """Default pathset hasher that composes file hashing.""" - - def __init__( - self, - file_hasher: LegacyFileHasher, - char_count: int | None = 32, - ): - self.file_hasher = file_hasher - self.char_count = char_count - - def _hash_file_to_hex(self, file_path: PathLike) -> str: - return self.file_hasher.hash_file(file_path) - - def hash_pathset(self, pathset: PathSet) -> str: - """Hash a pathset using the injected file hasher.""" - return legacy_core.hash_pathset( - pathset, - char_count=self.char_count, - file_hasher=self.file_hasher.hash_file, # Inject the method - ) - - -class LegacyDefaultPacketHasher: - """Default packet hasher that composes pathset hashing.""" - - def __init__( - self, - pathset_hasher: LegacyPathSetHasher, - char_count: int | None = 32, - prefix: str = "", - ): - self.pathset_hasher = pathset_hasher - self.char_count = char_count - self.prefix = prefix - - def _hash_pathset_to_hex(self, pathset: PathSet): - return self.pathset_hasher.hash_pathset(pathset) - - def hash_packet(self, packet: PacketLike) -> str: - """Hash a packet using the injected pathset hasher.""" - hash_str = legacy_core.hash_packet( - packet, - char_count=self.char_count, - prefix_algorithm=False, # Will apply prefix on our own - pathset_hasher=self._hash_pathset_to_hex, # Inject the method - ) - return f"{self.prefix}-{hash_str}" if self.prefix else hash_str - - -# Convenience composite implementation -class LegacyDefaultCompositeFileHasher: - """Composite hasher that implements all interfaces.""" - - def __init__( - self, - file_hasher: LegacyFileHasher, - char_count: int | None = 32, - packet_prefix: str = "", - ): - self.file_hasher = file_hasher - self.pathset_hasher = LegacyDefaultPathsetHasher(self.file_hasher, char_count) - self.packet_hasher = LegacyDefaultPacketHasher( - self.pathset_hasher, char_count, packet_prefix - ) - - def hash_file(self, file_path: PathLike) -> str: - return self.file_hasher.hash_file(file_path) - - def hash_pathset(self, pathset: PathSet) -> str: - return self.pathset_hasher.hash_pathset(pathset) - - def hash_packet(self, packet: PacketLike) -> str: - return self.packet_hasher.hash_packet(packet) - - -# Factory for easy construction -class LegacyPathLikeHasherFactory: - """Factory for creating various hasher combinations.""" - - @staticmethod - def create_basic_legacy_composite( - algorithm: str = "sha256", - buffer_size: int = 65536, - char_count: int | None = 32, - ) -> LegacyCompositeFileHasher: - """Create a basic composite hasher.""" - file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) - # use algorithm as the prefix for the packet hasher - return LegacyDefaultCompositeFileHasher( - file_hasher, char_count, packet_prefix=algorithm - ) - - @staticmethod - def create_cached_legacy_composite( - string_cacher: StringCacher, - algorithm: str = "sha256", - buffer_size: int = 65536, - char_count: int | None = 32, - ) -> LegacyCompositeFileHasher: - """Create a composite hasher with file caching.""" - basic_file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) - cached_file_hasher = LegacyCachedFileHasher(basic_file_hasher, string_cacher) - return LegacyDefaultCompositeFileHasher( - cached_file_hasher, char_count, packet_prefix=algorithm - ) - - @staticmethod - def 
create_legacy_file_hasher( - string_cacher: StringCacher | None = None, - algorithm: str = "sha256", - buffer_size: int = 65536, - ) -> LegacyFileHasher: - """Create just a file hasher, optionally with caching.""" - default_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) - if string_cacher is None: - return default_hasher - else: - return LegacyCachedFileHasher(default_hasher, string_cacher) - - @staticmethod - def create_file_hasher( - string_cacher: StringCacher | None = None, - algorithm: str = "sha256", - buffer_size: int = 65536, - ) -> FileContentHasher: - """Create just a file hasher, optionally with caching.""" - basic_hasher = BasicFileHasher(algorithm, buffer_size) - if string_cacher is None: - return basic_hasher - else: - return CachedFileHasher(basic_hasher, string_cacher) +# class LegacyDefaultFileHasher: +# def __init__( +# self, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# ): +# self.algorithm = algorithm +# self.buffer_size = buffer_size + +# def hash_file(self, file_path: PathLike) -> str: +# return legacy_core.hash_file( +# file_path, algorithm=self.algorithm, buffer_size=self.buffer_size +# ) + + +# class LegacyCachedFileHasher: +# """File hasher with caching.""" + +# def __init__( +# self, +# file_hasher: LegacyFileHasher, +# string_cacher: StringCacher, +# ): +# self.file_hasher = file_hasher +# self.string_cacher = string_cacher + +# def hash_file(self, file_path: PathLike) -> str: +# cache_key = f"file:{file_path}" +# cached_value = self.string_cacher.get_cached(cache_key) +# if cached_value is not None: +# return cached_value + +# value = self.file_hasher.hash_file(file_path) +# self.string_cacher.set_cached(cache_key, value) +# return value + + +# class LegacyDefaultPathsetHasher: +# """Default pathset hasher that composes file hashing.""" + +# def __init__( +# self, +# file_hasher: LegacyFileHasher, +# char_count: int | None = 32, +# ): +# self.file_hasher = file_hasher +# self.char_count = char_count + +# def _hash_file_to_hex(self, file_path: PathLike) -> str: +# return self.file_hasher.hash_file(file_path) + +# def hash_pathset(self, pathset: PathSet) -> str: +# """Hash a pathset using the injected file hasher.""" +# return legacy_core.hash_pathset( +# pathset, +# char_count=self.char_count, +# file_hasher=self.file_hasher.hash_file, # Inject the method +# ) + + +# class LegacyDefaultPacketHasher: +# """Default packet hasher that composes pathset hashing.""" + +# def __init__( +# self, +# pathset_hasher: LegacyPathSetHasher, +# char_count: int | None = 32, +# prefix: str = "", +# ): +# self.pathset_hasher = pathset_hasher +# self.char_count = char_count +# self.prefix = prefix + +# def _hash_pathset_to_hex(self, pathset: PathSet): +# return self.pathset_hasher.hash_pathset(pathset) + +# def hash_packet(self, packet: PacketLike) -> str: +# """Hash a packet using the injected pathset hasher.""" +# hash_str = legacy_core.hash_packet( +# packet, +# char_count=self.char_count, +# prefix_algorithm=False, # Will apply prefix on our own +# pathset_hasher=self._hash_pathset_to_hex, # Inject the method +# ) +# return f"{self.prefix}-{hash_str}" if self.prefix else hash_str + + +# # Convenience composite implementation +# class LegacyDefaultCompositeFileHasher: +# """Composite hasher that implements all interfaces.""" + +# def __init__( +# self, +# file_hasher: LegacyFileHasher, +# char_count: int | None = 32, +# packet_prefix: str = "", +# ): +# self.file_hasher = file_hasher +# self.pathset_hasher = LegacyDefaultPathsetHasher(self.file_hasher, 
char_count) +# self.packet_hasher = LegacyDefaultPacketHasher( +# self.pathset_hasher, char_count, packet_prefix +# ) + +# def hash_file(self, file_path: PathLike) -> str: +# return self.file_hasher.hash_file(file_path) + +# def hash_pathset(self, pathset: PathSet) -> str: +# return self.pathset_hasher.hash_pathset(pathset) + +# def hash_packet(self, packet: PacketLike) -> str: +# return self.packet_hasher.hash_packet(packet) + + +# # Factory for easy construction +# class LegacyPathLikeHasherFactory: +# """Factory for creating various hasher combinations.""" + +# @staticmethod +# def create_basic_legacy_composite( +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# char_count: int | None = 32, +# ) -> LegacyCompositeFileHasher: +# """Create a basic composite hasher.""" +# file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) +# # use algorithm as the prefix for the packet hasher +# return LegacyDefaultCompositeFileHasher( +# file_hasher, char_count, packet_prefix=algorithm +# ) + +# @staticmethod +# def create_cached_legacy_composite( +# string_cacher: StringCacher, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# char_count: int | None = 32, +# ) -> LegacyCompositeFileHasher: +# """Create a composite hasher with file caching.""" +# basic_file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) +# cached_file_hasher = LegacyCachedFileHasher(basic_file_hasher, string_cacher) +# return LegacyDefaultCompositeFileHasher( +# cached_file_hasher, char_count, packet_prefix=algorithm +# ) + +# @staticmethod +# def create_legacy_file_hasher( +# string_cacher: StringCacher | None = None, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# ) -> LegacyFileHasher: +# """Create just a file hasher, optionally with caching.""" +# default_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) +# if string_cacher is None: +# return default_hasher +# else: +# return LegacyCachedFileHasher(default_hasher, string_cacher) + +# @staticmethod +# def create_file_hasher( +# string_cacher: StringCacher | None = None, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# ) -> FileContentHasher: +# """Create just a file hasher, optionally with caching.""" +# basic_hasher = BasicFileHasher(algorithm, buffer_size) +# if string_cacher is None: +# return basic_hasher +# else: +# return CachedFileHasher(basic_hasher, string_cacher) diff --git a/src/orcapod/hashing/function_info_extractors.py b/src/orcapod/hashing/function_info_extractors.py index 816208b..27cae33 100644 --- a/src/orcapod/hashing/function_info_extractors.py +++ b/src/orcapod/hashing/function_info_extractors.py @@ -1,4 +1,4 @@ -from .types import FunctionInfoExtractor +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor from collections.abc import Callable from typing import Any, Literal from orcapod.types import TypeSpec diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 0dc0777..476b0a0 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -1,5 +1,6 @@ from typing import Any from .function_info_extractors import FunctionInfoExtractor +from orcapod.protocols.hashing_protocols import ContentIdentifiable import logging import json from uuid import UUID @@ -71,9 +72,9 @@ def process_structure( if obj is None: return None - from .content_identifiable import ContentIdentifiableBase - - if isinstance(obj, ContentIdentifiableBase): + # TODO: currently using runtime_checkable on ContentIdentifiable protocol + # 
Re-evaluate this strategy to see if a faster / more robust check could be used + if isinstance(obj, ContentIdentifiable): logger.debug( f"Processing ContentHashableBase instance of type {type(obj).__name__}" ) diff --git a/src/orcapod/hashing/legacy_core.py b/src/orcapod/hashing/legacy_core.py index a5b4319..e338a89 100644 --- a/src/orcapod/hashing/legacy_core.py +++ b/src/orcapod/hashing/legacy_core.py @@ -1,18 +1,9 @@ -""" -Stable Hashing Library -====================== - -A library for creating stable, content-based hashes that remain consistent across Python sessions, -suitable for arbitrarily nested data structures and custom objects via HashableMixin. -""" - -WARN_NONE_IDENTITY = False import hashlib import inspect import json import logging import zlib -from .types import FunctionInfoExtractor +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor from functools import partial from os import PathLike from pathlib import Path @@ -33,9 +24,19 @@ import xxhash -from orcapod.types import Packet, PacketLike, PathSet +from orcapod.types import PathSet, Packet, PacketLike from orcapod.utils.name import find_noncolliding_name +WARN_NONE_IDENTITY = False +""" +Stable Hashing Library +====================== + +A library for creating stable, content-based hashes that remain consistent across Python sessions, +suitable for arbitrarily nested data structures and custom objects via HashableMixin. +""" + + # Configure logging with __name__ for proper hierarchy logger = logging.getLogger(__name__) diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 3401574..97568f5 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -1,9 +1,57 @@ -from orcapod.hashing.types import FunctionInfoExtractor, ObjectHasher +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor, ObjectHasher from orcapod.hashing import legacy_core from orcapod.hashing import hash_utils +from typing import Any +import uuid +from abc import ABC, abstractmethod + + +class ObjectHasherBase(ABC): + @abstractmethod + def hash(self, obj: object) -> bytes: ... + + @abstractmethod + def get_hasher_id(self) -> str: ... + + def hash_to_hex( + self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False + ) -> str: + hash_bytes = self.hash(obj) + hex_str = hash_bytes.hex() + + # TODO: clean up this logic, as char_count handling is messy + if char_count is not None: + if char_count > len(hex_str): + raise ValueError( + f"Cannot truncate to {char_count} chars, hash only has {len(hex_str)}" + ) + hex_str = hex_str[:char_count] + if prefix_hasher_id: + hex_str = self.get_hasher_id() + "@" + hex_str + return hex_str + + def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: + """ + Hash an object to an integer. + + Args: + obj (Any): The object to hash. + hexdigits (int): Number of hexadecimal digits to use for the hash. + + Returns: + int: The integer representation of the hash. + """ + hex_hash = self.hash_to_hex(obj, char_count=hexdigits) + return int(hex_hash, 16) + + def hash_to_uuid( + self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID + ) -> uuid.UUID: + """Convert hash to proper UUID5.""" + return uuid.uuid5(namespace, self.hash(obj)) -class BasicObjectHasher(ObjectHasher): +class BasicObjectHasher(ObjectHasherBase): """ Default object hasher used throughout the codebase. 
""" @@ -34,7 +82,7 @@ def hash(self, obj: object) -> bytes: ) -class LegacyObjectHasher(ObjectHasher): +class LegacyObjectHasher(ObjectHasherBase): """ Legacy object hasher that returns the string representation of the object. diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py index 5be28b0..4508f95 100644 --- a/src/orcapod/hashing/semantic_type_hashers.py +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -1,4 +1,8 @@ -from orcapod.hashing.types import SemanticTypeHasher, FileContentHasher, StringCacher +from orcapod.protocols.hashing_protocols import ( + SemanticTypeHasher, + FileContentHasher, + StringCacher, +) import os import hashlib import pyarrow as pa diff --git a/src/orcapod/hashing/string_cachers.py b/src/orcapod/hashing/string_cachers.py index 620dece..bb09eff 100644 --- a/src/orcapod/hashing/string_cachers.py +++ b/src/orcapod/hashing/string_cachers.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, TYPE_CHECKING -from orcapod.hashing.types import StringCacher +from orcapod.protocols.hashing_protocols import StringCacher logger = logging.getLogger(__name__) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index d2fec4d..c5c1919 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -1,6 +1,6 @@ # A collection of versioned hashers that provide a "default" implementation of hashers. from .arrow_hashers import SemanticArrowHasher -from .types import ObjectHasher +from orcapod.protocols.hashing_protocols import ObjectHasher import importlib from typing import Any diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 22491ae..98b49b8 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -2,6 +2,8 @@ import os from collections.abc import Collection, Mapping +import logging + DataType: TypeAlias = type @@ -9,6 +11,17 @@ str, DataType ] # Mapping of parameter names to their types +logger = logging.getLogger(__name__) + + +# class TypeSpec(dict[str, DataType]): +# def __init__(self, *args, **kwargs): +# """ +# TypeSpec is a mapping of parameter names to their types. +# It can be used to define the expected types of parameters in a function or a pod. +# """ +# super().__init__(*args, **kwargs) + # Convenience alias for anything pathlike PathLike = str | os.PathLike @@ -27,29 +40,27 @@ # Simple data types that we support (with clear Polars correspondence) SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes -ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike +ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathSet # Extended data values that can be stored in packets # Either the original PathSet or one of our supported simple data types -DataValue: TypeAlias = ( - PathSet - | SupportedNativePythonData - | None - | Collection["DataValue"] - | Mapping[str, "DataValue"] -) +DataValue: TypeAlias = ExtendedSupportedPythonData | Collection["DataValue"] | None +StoreValue: TypeAlias = SupportedNativePythonData | Collection["StoreValue"] | None -class PodFunction(Protocol): - """ - A function suitable to be used in a FunctionPod. 
- It takes one or more named arguments, each corresponding to either: - - A path to a file or directory (PathSet) - for backward compatibility - - A simple data value (str, int, float, bool, bytes, Path) - and returns either None, a single value, or a list of values - """ +PacketLike: TypeAlias = Mapping[str, DataValue] + + +# class PodFunction(Protocol): +# """ +# A function suitable to be used in a FunctionPod. +# It takes one or more named arguments, each corresponding to either: +# - A path to a file or directory (PathSet) - for backward compatibility +# - A simple data value (str, int, float, bool, bytes, Path) +# and returns either None, a single value, or a list of values +# """ - def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... +# def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... class TypeHandler(Protocol): diff --git a/src/orcapod/types/packet_converter.py b/src/orcapod/types/packet_converter.py deleted file mode 100644 index 6edea00..0000000 --- a/src/orcapod/types/packet_converter.py +++ /dev/null @@ -1,182 +0,0 @@ -# from orcapod.types.core import TypeSpec, TypeHandler -# from orcapod.types.packets import Packet, PacketLike -# from orcapod.types.semantic_type_registry import ( -# SemanticTypeRegistry, -# TypeInfo, -# get_metadata_from_schema, -# arrow_to_dicts, -# ) -# from typing import Any -# from collections.abc import Mapping, Sequence -# import pyarrow as pa -# import logging - -# logger = logging.getLogger(__name__) - - -# def is_packet_supported( -# python_type_info: TypeSpec, -# registry: SemanticTypeRegistry, -# type_lut: dict | None = None, -# ) -> bool: -# """Check if all types in the packet are supported by the registry or known to the default lut.""" -# if type_lut is None: -# type_lut = {} -# return all( -# python_type in registry or python_type in type_lut -# for python_type in python_type_info.values() -# ) - - -# class PacketConverter: -# def __init__(self, python_type_spec: TypeSpec, registry: SemanticTypeRegistry): -# self.python_type_spec = python_type_spec -# self.registry = registry - -# # Lookup handlers and type info for fast access -# self.handlers: dict[str, TypeHandler] = {} -# self.storage_type_info: dict[str, TypeInfo] = {} - -# self.expected_key_set = set(python_type_spec.keys()) - -# # prepare the corresponding arrow table schema with metadata -# self.keys_with_handlers, self.schema = create_schema_from_python_type_info( -# python_type_spec, registry -# ) - -# self.semantic_type_lut = get_metadata_from_schema(self.schema, b"semantic_type") - -# def _check_key_consistency(self, keys): -# """Check if the provided keys match the expected keys.""" -# keys_set = set(keys) -# if keys_set != self.expected_key_set: -# missing_keys = self.expected_key_set - keys_set -# extra_keys = keys_set - self.expected_key_set -# error_parts = [] -# if missing_keys: -# error_parts.append(f"Missing keys: {missing_keys}") -# if extra_keys: -# error_parts.append(f"Extra keys: {extra_keys}") - -# raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") - -# def _to_storage_packet(self, packet: PacketLike) -> dict[str, Any]: -# """Convert packet to storage representation. 
- -# Args: -# packet: Dictionary mapping parameter names to Python values - -# Returns: -# Dictionary with same keys but values converted to storage format - -# Raises: -# KeyError: If packet keys don't match the expected type_info keys -# TypeError: If value type doesn't match expected type -# ValueError: If conversion fails -# """ -# # Validate packet keys -# packet_keys = set(packet.keys()) - -# self._check_key_consistency(packet_keys) - -# # Convert each value -# storage_packet: dict[str, Any] = dict(packet) # Start with a copy of the packet - -# for key, handler in self.keys_with_handlers: -# try: -# storage_packet[key] = handler.python_to_storage(storage_packet[key]) -# except Exception as e: -# raise ValueError(f"Failed to convert value for '{key}': {e}") from e - -# return storage_packet - -# def _from_storage_packet(self, storage_packet: Mapping[str, Any]) -> PacketLike: -# """Convert storage packet back to Python packet. - -# Args: -# storage_packet: Dictionary with values in storage format - -# Returns: -# Packet with values converted back to Python types - -# Raises: -# KeyError: If storage packet keys don't match the expected type_info keys -# TypeError: If value type doesn't match expected type -# ValueError: If conversion fails -# """ -# # Validate storage packet keys -# storage_keys = set(storage_packet.keys()) - -# self._check_key_consistency(storage_keys) - -# # Convert each value back to Python type -# packet: PacketLike = dict(storage_packet) - -# for key, handler in self.keys_with_handlers: -# try: -# packet[key] = handler.storage_to_python(storage_packet[key]) -# except Exception as e: -# raise ValueError(f"Failed to convert value for '{key}': {e}") from e - -# return packet - -# def to_arrow_table(self, packet: PacketLike | Sequence[PacketLike]) -> pa.Table: -# """Convert packet to PyArrow Table with field metadata. - -# Args: -# packet: Dictionary mapping parameter names to Python values - -# Returns: -# PyArrow Table with the packet data as a single row -# """ -# # Convert packet to storage format -# if not isinstance(packet, Sequence): -# packets = [packet] -# else: -# packets = packet - -# storage_packets = [self._to_storage_packet(p) for p in packets] - -# # Create arrays -# arrays = [] -# for field in self.schema: -# values = [p[field.name] for p in storage_packets] -# array = pa.array(values, type=field.type) -# arrays.append(array) - -# return pa.Table.from_arrays(arrays, schema=self.schema) - -# def from_arrow_table( -# self, table: pa.Table, verify_semantic_equivalence: bool = True -# ) -> list[Packet]: -# """Convert Arrow table to packet with field metadata. - -# Args: -# table: PyArrow Table with metadata - -# Returns: -# List of packets converted from the Arrow table -# """ -# # Check for consistency in the semantic type mapping: -# semantic_type_info = get_metadata_from_schema(table.schema, b"semantic_type") - -# if semantic_type_info != self.semantic_type_lut: -# if not verify_semantic_equivalence: -# logger.warning( -# "Arrow table semantic types do not match expected type registry. " -# f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" -# ) -# else: -# raise ValueError( -# "Arrow table semantic types do not match expected type registry. 
" -# f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" -# ) - -# # Create packets from the Arrow table -# # TODO: make this more efficient -# storage_packets: list[Packet] = arrow_to_dicts(table) # type: ignore -# if not self.keys_with_handlers: -# # no special handling required -# return storage_packets - -# return [Packet(self._from_storage_packet(packet)) for packet in storage_packets] From 5fc78f888fb25402fdd35f335d704a06b468dc52 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 10 Jul 2025 23:49:06 +0000 Subject: [PATCH 061/224] refactor: temporarily stop top level import while refactoring --- src/orcapod/__init__.py | 66 ++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index ad00035..01cd5db 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,36 +1,36 @@ -from .core import operators, sources, streams -from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator -from . import hashing, stores -from .core.operators import Join, MapPackets, MapTags, packet, tag -from .core.pod import FunctionPod, function_pod -from .core.sources import GlobSource -from .stores import DirDataStore, SafeDirDataStore -from .core.tracker import GraphTracker -from .pipeline import Pipeline +# from .core import operators, sources, streams +# from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator +# from . import hashing, stores +# from .core.operators import Join, MapPackets, MapTags, packet, tag +# from .core.pod import FunctionPod, function_pod +# from .core.sources import GlobSource +# from .stores import DirDataStore, SafeDirDataStore +# from .core.tracker import GraphTracker +# from .pipeline import Pipeline -DEFAULT_TRACKER = GraphTracker() -DEFAULT_TRACKER.activate() +# DEFAULT_TRACKER = GraphTracker() +# DEFAULT_TRACKER.activate() -__all__ = [ - "hashing", - "stores", - "pod", - "operators", - "streams", - "sources", - "MapTags", - "MapPackets", - "Join", - "tag", - "packet", - "FunctionPod", - "function_pod", - "GlobSource", - "DirDataStore", - "SafeDirDataStore", - "DEFAULT_TRACKER", - "SyncStreamFromLists", - "SyncStreamFromGenerator", - "Pipeline", -] +# __all__ = [ +# "hashing", +# "stores", +# "pod", +# "operators", +# "streams", +# "sources", +# "MapTags", +# "MapPackets", +# "Join", +# "tag", +# "packet", +# "FunctionPod", +# "function_pod", +# "GlobSource", +# "DirDataStore", +# "SafeDirDataStore", +# "DEFAULT_TRACKER", +# "SyncStreamFromLists", +# "SyncStreamFromGenerator", +# "Pipeline", +# ] From 93beb0f37430e4492883b6120c11b032bb9de90e Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 10 Jul 2025 23:50:42 +0000 Subject: [PATCH 062/224] refactor: remove protocol-relevant definitions --- src/orcapod/types/__init__.py | 10 ++++----- src/orcapod/types/schemas.py | 18 ++++++++++----- src/orcapod/types/typespec_utils.py | 35 ++++++++++++----------------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index 03a3b4b..179a253 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,10 +1,11 @@ -from .core import Tag, PathLike, PathSet, PodFunction, TypeSpec -from .packets import Packet, PacketLike +from .core import Tag, PathLike, PathSet, TypeSpec, DataValue, StoreValue from .semantic_type_registry import SemanticTypeRegistry from .semantic_type_handlers import PathHandler, UUIDHandler, DateTimeHandler from . import semantic_type_handlers from . import typespec_utils +Packet = dict[str, str] +PacketLike = Packet # Create default registry and register handlers default_registry = SemanticTypeRegistry() @@ -19,12 +20,11 @@ __all__ = [ "default_registry", "Tag", - "Packet", - "PacketLike", "TypeSpec", "PathLike", "PathSet", - "PodFunction", "semantic_type_handlers", "typespec_utils", + "DataValue", + "StoreValue", ] diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py index 35cc4f0..31e56d5 100644 --- a/src/orcapod/types/schemas.py +++ b/src/orcapod/types/schemas.py @@ -1,4 +1,4 @@ -from orcapod.types import TypeSpec +from orcapod.types.core import DataType, TypeSpec from orcapod.types.semantic_type_registry import SemanticTypeRegistry import pyarrow as pa import datetime @@ -38,7 +38,7 @@ def arrow_to_python_type(arrow_type: pa.DataType) -> type: raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") -class PythonSchema(dict[str, type]): +class PythonSchema(dict[str, DataType]): """ A schema for Python data types, mapping string keys to Python types. 
@@ -70,6 +70,9 @@ def with_source_info(self) -> dict[str, type]: """ return {**self, **{f"_source_info_{k}": str for k in self.keys()}} + def copy(self) -> "PythonSchema": + return PythonSchema(self) + class SemanticSchema(dict[str, tuple[type, str | None]]): """ @@ -299,11 +302,16 @@ def from_arrow_schema_to_semantic_schema( """ semantic_schema = {} for field in arrow_schema: - if field.metadata.get(b"field_type", b"") == b"source_info": + if field.name.startswith("_source_info_") or ( + field.metadata and field.metadata.get(b"field_type", b"") == b"source_info" + ): # Skip source info fields continue - semantic_type = field.metadata.get(b"semantic_type", None) - semantic_type = semantic_type.decode() if semantic_type else None + + semantic_type = None + if field.metadata is not None: + semantic_type = field.metadata.get(b"semantic_type", None) + semantic_type = semantic_type.decode() if semantic_type else None python_type = arrow_to_python_type(field.type) semantic_schema[field.name] = (python_type, semantic_type) return SemanticSchema(semantic_schema) diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index a0a3c58..71318aa 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -55,8 +55,8 @@ def check_typespec_compatibility( def extract_function_typespecs( func: Callable, output_keys: Collection[str], - input_types: TypeSpec | None = None, - output_types: TypeSpec | Sequence[type] | None = None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | Sequence[type] | None = None, ) -> tuple[TypeSpec, TypeSpec]: """ Extract input and output data types from a function signature. @@ -137,23 +137,23 @@ def extract_function_typespecs( {'count': , 'total': , 'repr': } """ verified_output_types: TypeSpec = {} - if output_types is not None: - if isinstance(output_types, dict): - verified_output_types = output_types - elif isinstance(output_types, Sequence): + if output_typespec is not None: + if isinstance(output_typespec, dict): + verified_output_types = output_typespec + elif isinstance(output_typespec, Sequence): # If output_types is a collection, convert it to a dict with keys from return_keys - if len(output_types) != len(output_keys): + if len(output_typespec) != len(output_keys): raise ValueError( - f"Output types collection length {len(output_types)} does not match return keys length {len(output_keys)}." + f"Output types collection length {len(output_typespec)} does not match return keys length {len(output_keys)}." 
) - verified_output_types = {k: v for k, v in zip(output_keys, output_types)} + verified_output_types = {k: v for k, v in zip(output_keys, output_typespec)} signature = inspect.signature(func) param_info: TypeSpec = {} for name, param in signature.parameters.items(): - if input_types and name in input_types: - param_info[name] = input_types[name] + if input_typespec and name in input_typespec: + param_info[name] = input_typespec[name] else: # check if the parameter has annotation if param.annotation is not inspect.Signature.empty: @@ -232,11 +232,7 @@ def get_compatible_type(type1: Any, type2: Any) -> Any: raise TypeError(f"Types {type1} and {type2} are not compatible") -def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: - if left is None: - return right - if right is None: - return left +def union_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: # Merge the two TypeSpecs but raise an error if conflicts in types are found merged = dict(left) for key, right_type in right.items(): @@ -248,15 +244,12 @@ def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | return merged -def intersection_typespecs( - left: TypeSpec | None, right: TypeSpec | None -) -> TypeSpec | None: +def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: """ Returns the intersection of two TypeSpecs, only returning keys that are present in both. If a key is present in both TypeSpecs, the type must be the same. """ - if left is None or right is None: - return None + # Find common keys and ensure types match common_keys = set(left.keys()).intersection(set(right.keys())) intersection = {} From 4d7761f9aa80b9dba64e2f95a3a824f864b47c72 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 10 Jul 2025 23:51:46 +0000 Subject: [PATCH 063/224] refactor: add concrete component implementation in data package --- src/orcapod/data/__init__.py | 0 src/orcapod/data/base.py | 149 +++++++++ src/orcapod/data/datagrams.py | 608 ++++++++++++++++++++++++++++++++++ src/orcapod/data/kernels.py | 104 ++++++ src/orcapod/data/operators.py | 156 +++++++++ src/orcapod/data/pods.py | 340 +++++++++++++++++++ src/orcapod/data/streams.py | 487 +++++++++++++++++++++++++++ src/orcapod/data/trackers.py | 150 +++++++++ 8 files changed, 1994 insertions(+) create mode 100644 src/orcapod/data/__init__.py create mode 100644 src/orcapod/data/base.py create mode 100644 src/orcapod/data/datagrams.py create mode 100644 src/orcapod/data/kernels.py create mode 100644 src/orcapod/data/operators.py create mode 100644 src/orcapod/data/pods.py create mode 100644 src/orcapod/data/streams.py create mode 100644 src/orcapod/data/trackers.py diff --git a/src/orcapod/data/__init__.py b/src/orcapod/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py new file mode 100644 index 0000000..5082c9b --- /dev/null +++ b/src/orcapod/data/base.py @@ -0,0 +1,149 @@ +from abc import ABC, abstractmethod +from typing import Any +from orcapod.protocols import hashing_protocols as hp +from orcapod.types import TypeSpec +from orcapod.hashing.defaults import get_default_object_hasher +import pyarrow as pa +import logging + + +logger = logging.getLogger(__name__) + + +class DatagramBase(ABC): + """ + Base class for data packets that can be processed in a pipeline. + This class provides a common interface for data packets, allowing them to be processed + and transformed in a consistent manner. 
+ """ + + @property + @abstractmethod + def typespec(self) -> TypeSpec: + """Return the type specification of the data packet.""" + pass + + @abstractmethod + def keys(self) -> tuple[str, ...]: + """Return the keys of the data packet.""" + pass + + @abstractmethod + def as_table(self) -> pa.Table: + """Convert the data packet to a PyArrow Table.""" + pass + + @abstractmethod + def as_dict(self) -> dict[str, Any]: + """Convert the data packet to a dictionary.""" + pass + + +class LabeledContentIdentifiableBase: + """ + Base class for content-identifiable objects. + This class provides a way to define objects that can be uniquely identified + based on their content rather than their identity in memory. Specifically, the identity of the + object is determined by the structure returned by the `identity_structure` method. + The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, + which defaults to the one returned by `get_default_object_hasher`. + Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. + """ + + def __init__( + self, + identity_structure_hasher: hp.ObjectHasher | None = None, + label: str | None = None, + ) -> None: + """ + Initialize the ContentHashable with an optional ObjectHasher. + + Args: + identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. + """ + self.identity_structure_hasher = ( + identity_structure_hasher or get_default_object_hasher() + ) + self._label = label + + @property + def has_assigned_label(self) -> bool: + """ + Check if the label is explicitly set for this object. + + Returns: + bool: True if the label is explicitly set, False otherwise. + """ + return self._label is not None + + @property + def label(self) -> str: + """ + Get the label of this object. + + Returns: + str | None: The label of the object, or None if not set. + """ + return self._label or self.computed_label() or self.__class__.__name__ + + @label.setter + def label(self, label: str | None) -> None: + """ + Set the label of this object. + + Args: + label (str | None): The label to set for this object. + """ + self._label = label + + def computed_label(self) -> str | None: + """ + Compute a label for this object based on its content. If label is not explicitly set for this object + and computed_label returns a valid value, it will be used as label of this object. + """ + return None + + def identity_structure(self) -> Any: + """ + Return a structure that represents the identity of this object. + + Override this method in your subclass to provide a stable representation + of your object's content. The structure should contain all fields that + determine the object's identity. + + Returns: + Any: A structure representing this object's content, or None to use default hash + """ + # TODO: come up with a way to signify non-determinate identity structure + return None + + def __hash__(self) -> int: + """ + Hash implementation that uses the identity structure if provided, + otherwise falls back to the superclass's hash method. 
+ + Returns: + int: A hash value based on either content or identity + """ + # Get the identity structure + structure = self.identity_structure() + if structure is None: + # If no identity structure is provided, use the default hash + return super().__hash__() + + return self.identity_structure_hasher.hash_to_int(structure) + + def __eq__(self, other: object) -> bool: + """ + Equality check that compares the identity structures of two objects. + + Args: + other (object): The object to compare against. + + Returns: + bool: True if both objects have the same identity structure, False otherwise. + """ + if not isinstance(other, LabeledContentIdentifiableBase): + return NotImplemented + + return self.identity_structure() == other.identity_structure() diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py new file mode 100644 index 0000000..c21c46b --- /dev/null +++ b/src/orcapod/data/datagrams.py @@ -0,0 +1,608 @@ +from orcapod.types.core import DataValue, StoreValue +from typing import TypeAlias, cast +from collections.abc import Callable, Mapping, Collection +from orcapod.types import TypeSpec, default_registry +from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types.core import TypeHandler +from orcapod.types import schemas +from orcapod.types.typespec_utils import get_typespec_from_dict +import pyarrow as pa + +from orcapod.hashing.defaults import get_default_arrow_hasher + + +# A conveniece packet-like type that defines a value that can be +# converted to a packet. It's broader than Packet and a simple mapping +# from string keys to DataValue (e.g., int, float, str) can be regarded +# as PacketLike, allowing for more flexible interfaces. +# Anything that requires Packet-like data but without the strict features +# of a Packet should accept PacketLike. +# One should be careful when using PacketLike as a return type as it does not +# enforce the typespec or source_info, which are important for packet integrity. +PacketLike: TypeAlias = Mapping[str, DataValue] + +SemanticStore: TypeAlias = Mapping[str, StoreValue] +PythonStore: TypeAlias = Mapping[str, DataValue] + + +def check_arrow_schema_compatibility( + incoming_schema: pa.Schema, current_schema: pa.Schema +) -> tuple[bool, list[str]]: + """ + Check if incoming schema is compatible with current schema. 
+ + Args: + incoming_schema: Schema to validate + current_schema: Expected schema to match against + + Returns: + Tuple of (is_compatible, list_of_errors) + """ + errors = [] + + # Create lookup dictionaries for efficient access + incoming_fields = {field.name: field for field in incoming_schema} + current_fields = {field.name: field for field in current_schema} + + # Check each field in current_schema + for field_name, current_field in current_fields.items(): + if field_name not in incoming_fields: + errors.append(f"Missing field '{field_name}' in incoming schema") + continue + + incoming_field = incoming_fields[field_name] + + # Check data type compatibility + if not current_field.type.equals(incoming_field.type): + errors.append( + f"Type mismatch for field '{field_name}': " + f"expected {current_field.type}, got {incoming_field.type}" + ) + + # Check semantic_type metadata if present in current schema + current_metadata = current_field.metadata or {} + incoming_metadata = incoming_field.metadata or {} + + if b"semantic_type" in current_metadata: + expected_semantic_type = current_metadata[b"semantic_type"] + + if b"semantic_type" not in incoming_metadata: + errors.append( + f"Missing 'semantic_type' metadata for field '{field_name}'" + ) + elif incoming_metadata[b"semantic_type"] != expected_semantic_type: + errors.append( + f"Semantic type mismatch for field '{field_name}': " + f"expected {expected_semantic_type.decode()}, " + f"got {incoming_metadata[b'semantic_type'].decode()}" + ) + elif b"semantic_type" in incoming_metadata: + errors.append( + f"Unexpected 'semantic_type' metadata for field '{field_name}': " + f"{incoming_metadata[b'semantic_type'].decode()}" + ) + + return len(errors) == 0, errors + + +class SemanticConverter: + @staticmethod + def prepare_handler( + semantic_schema: schemas.SemanticSchema, + semantic_type_registry: SemanticTypeRegistry, + ) -> dict[str, TypeHandler]: + handler_lut = {} + for key, (_, semantic_type) in semantic_schema.items(): + if semantic_type is None: + continue # Skip keys without semantic type + handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( + semantic_type + ) + return handler_lut + + @classmethod + def from_typespec( + cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry + ) -> "SemanticConverter": + semantic_schema = schemas.from_typespec_to_semantic_schema( + typespec, semantic_type_registry + ) + python_schema = schemas.PythonSchema(typespec) + handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) + return cls(python_schema, semantic_schema, handler_lut) + + @classmethod + def from_arrow_schema( + cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry + ) -> "SemanticConverter": + semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) + python_schema = schemas.from_semantic_schema_to_python_schema( + semantic_schema, semantic_type_registry=semantic_type_registry + ) + handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) + return cls(python_schema, semantic_schema, handler_lut) + + def __init__( + self, + python_schema: schemas.PythonSchema, + semantic_schema: schemas.SemanticSchema, + handler_lut: dict[str, TypeHandler] | None = None, + ): + self.python_schema = python_schema + self.semantic_schema = semantic_schema + self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( + semantic_schema, include_source_info=False + ) + if handler_lut is None: + handler_lut = {} + self.handler_lut = handler_lut + + 
def from_semantic_store_to_python_store( + self, semantic_store: SemanticStore + ) -> PythonStore: + python_store = dict(semantic_store) + for key, handler in self.handler_lut.items(): + python_store[key] = handler.storage_to_python(semantic_store[key]) + return python_store + + def from_python_store_to_semantic_store( + self, python_store: PythonStore + ) -> SemanticStore: + semantic_store = dict(python_store) + for key, handler in self.handler_lut.items(): + semantic_store[key] = handler.python_to_storage(python_store[key]) + return semantic_store # type: ignore[return-value] + + def from_semantic_store_to_arrow_table( + self, semantic_store: SemanticStore + ) -> pa.Table: + """Convert a semantic store to an Arrow table.""" + return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) + + def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: + """Convert a Python store to an Arrow table.""" + semantic_store = self.from_python_store_to_semantic_store(python_store) + return self.from_semantic_store_to_arrow_table(semantic_store) + + def from_arrow_table_to_semantic_stores( + self, arrow_table: pa.Table + ) -> list[SemanticStore]: + """Convert an Arrow table to a list of semantic stores.""" + self.verify_compatible_arrow_schema(arrow_table.schema) + return arrow_table.to_pylist() # Ensure the table is materialized + + def from_arrow_table_to_python_stores( + self, arrow_table: pa.Table + ) -> list[PythonStore]: + """Convert an Arrow table to a Python store.""" + return [ + self.from_semantic_store_to_python_store(semantic_store) + for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) + ] + + def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): + compatible, errors = check_arrow_schema_compatibility( + arrow_schema, self.arrow_schema + ) + if not compatible: + raise ValueError( + "Arrow table schema is not compatible with the expected schema: " + + ", ".join(errors) + ) + + +class PythonDictTag(dict[str, DataValue]): + def as_dict(self) -> dict[str, DataValue]: + return dict(self) + + def as_table(self) -> pa.Table: + return pa.Table.from_pylist([self]) + + @property + def typespec(self) -> schemas.PythonSchema: + # TODO: provide correct implementation + return schemas.PythonSchema({k: str for k in self.keys()}) + + +class ArrowTag: + def __init__(self, table: pa.Table) -> None: + self.table = table + if len(table) != 1: + raise ValueError( + "ArrowTag should only contain a single row, " + "as it represents a single tag." 
+ ) + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_dict: dict[str, DataValue] | None = None + + def keys(self) -> tuple[str, ...]: + return tuple(self.table.column_names) + + @property + def typespec(self) -> schemas.PythonSchema: + if self._cached_python_schema is None: + self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema( + self.table.schema + ).storage_schema + return self._cached_python_schema.copy() + + def as_dict(self) -> dict[str, DataValue]: + if self._cached_python_dict is None: + self._cached_python_dict = cast( + dict[str, DataValue], self.table.to_pylist()[0] + ) + return self._cached_python_dict + + def as_table(self) -> pa.Table: + return self.table + + def clear_cache(self) -> None: + self._cached_python_schema = None + self._cached_python_dict = None + + def __repr__(self) -> str: + return f"{self.as_dict()}" + + +class PythonDictPacket(dict[str, DataValue]): + @classmethod + def create_from( + cls, + object: dp.Packet, + finger_print: str | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + post_hash_callback: Callable[[str, str], None] | None = None, + ) -> "PythonDictPacket": + if isinstance(object, PythonDictPacket): + return object.copy() + + new_packet = PythonDictPacket( + object.as_dict(include_source=False), + object.source_info(), + dict(object.typespec), + finger_print=finger_print, + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + arrow_hasher=arrow_hasher, + post_hash_callback=post_hash_callback, + ) + return new_packet + + def __init__( + self, + data: dict[str, DataValue], + source_info: dict[str, str | None] | None = None, + typespec: TypeSpec | None = None, + finger_print: str | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + post_hash_callback: Callable[[str, str], None] | None = None, + ) -> None: + # normalize the data content and remove any source info keys + data = {k: v for k, v in data.items() if not k.startswith("_source_info_")} + contained_source_info = { + k.removeprefix("_source_info_"): v + for k, v in data.items() + if k.startswith("_source_info_") + } + super().__init__(data) + + self._source_info = {**contained_source_info, **(source_info or {})} + + verified_typespec = {} + if typespec is not None: + verified_typespec = dict(typespec) + inferred_typespec = get_typespec_from_dict(self) + for key in self: + if key not in verified_typespec: + verified_typespec[key] = inferred_typespec[key] + self._typespec = verified_typespec + + self._python_schema = schemas.PythonSchema(self._typespec) + + if semantic_converter is not None: + if semantic_converter.python_schema != self._python_schema.with_source_info: + raise ValueError( + "Incompatible Python schema between packet and semantic converter: " + + str(self._python_schema.with_source_info) + + " vs " + + str(semantic_converter.python_schema) + ) + else: + semantic_converter = SemanticConverter.from_typespec( + self._python_schema.with_source_info, + semantic_type_registry or default_registry, + ) + self.semantic_converter = semantic_converter + + self._finger_print = finger_print + self._post_hash_callback = post_hash_callback + self._cached_table: pa.Table | None = None + self._cached_content_hash: str | None = None + + if arrow_hasher is 
None: + arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = arrow_hasher + + def as_table(self, include_source: bool = False) -> pa.Table: + """Convert the packet to an Arrow table.""" + if self._cached_table is None: + self._cached_table = ( + self.semantic_converter.from_python_store_to_arrow_table( + self.as_dict(include_source=True) + ) + ) + assert self._cached_table is not None, "Cached table should not be None" + if include_source: + return self._cached_table + else: + # drop source info columns if not needed + return self._cached_table.select(list(self.keys())) + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + dict_copy = self.copy() + if include_source: + for key, value in self.source_info().items(): + dict_copy[f"_source_info_{key}"] = value + return dict_copy + + def content_hash(self) -> str: + if self._cached_content_hash is None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self.as_table(include_source=False), prefix_hasher_id=True + ) + if self._post_hash_callback is not None and self._finger_print is not None: + self._post_hash_callback(self._finger_print, self._cached_content_hash) + return self._cached_content_hash + + @property + def typespec(self) -> schemas.PythonSchema: + return self._python_schema.copy() + + def source_info(self) -> dict[str, str | None]: + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> "PythonDictPacket": + """Return a shallow copy of the packet.""" + new_packet = PythonDictPacket(self, self.source_info()) + new_packet._finger_print = self._finger_print + new_packet._cached_table = self._cached_table + new_packet._cached_content_hash = self._cached_content_hash + new_packet._python_schema = self._python_schema.copy() + new_packet.semantic_converter = self.semantic_converter + new_packet.arrow_hasher = self.arrow_hasher + new_packet._post_hash_callback = self._post_hash_callback + return new_packet + + +def process_table_with_source_info( + table: pa.Table, source_info: dict[str, str | None] | None = None +) -> tuple[tuple[str, ...], pa.Table]: + """ + Process a table to ensure proper source_info columns. + + Args: + table: Input PyArrow table + source_info: optional dictionary mapping column names to source info values. If present, + it will take precedence over existing source_info columns in the table. 
+ + Returns: + Processed table with source_info columns + """ + if source_info is None: + source_info = {} + + # Step 1: Separate source_info columns from regular columns + regular_columns = [] + regular_names = [] + existing_source_info = {} + + for i, name in enumerate(table.column_names): + if name.startswith("_source_info_"): + # Extract the base column name + base_name = name.removeprefix("_source_info_") + existing_source_info[base_name] = table.column(i) + else: + regular_columns.append(table.column(i)) + regular_names.append(name) + + # Step 2: Create source_info columns for each regular column + final_columns = [] + final_names = [] + + # Add all regular columns first + final_columns.extend(regular_columns) + final_names.extend(regular_names) + + # Create source_info columns for each regular column + num_rows = table.num_rows + + for col_name in regular_names: + source_info_col_name = f"_source_info_{col_name}" + + # if col_name is in source_info, use that value + if col_name in source_info: + # Use value from source_info dictionary + source_value = source_info[col_name] + source_values = pa.array([source_value] * num_rows, type=pa.large_string()) + # if col_name is in existing_source_info, use that column + elif col_name in existing_source_info: + # Use existing source_info column, but convert to large_string + existing_col = existing_source_info[col_name] + if existing_col.type == pa.large_string(): + source_values = existing_col + else: + # Convert to large_string + source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + + else: + # Use null values + source_values = pa.array([None] * num_rows, type=pa.large_string()) + + final_columns.append(source_values) + final_names.append(source_info_col_name) + + # Step 3: Create the final table + result: pa.Table = pa.Table.from_arrays(final_columns, names=final_names) + return tuple(regular_names), result + + +class ArrowPacket: + @classmethod + def create_from( + cls, + object: dp.Packet, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + finger_print: str | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + post_hash_callback: Callable[[str, str], None] | None = None, + ) -> "ArrowPacket": + if isinstance(object, ArrowPacket): + return object.copy() + + new_packet = ArrowPacket( + object.as_table(include_source=True), + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + finger_print=finger_print, + arrow_hasher=arrow_hasher, + post_hash_callback=post_hash_callback, + skip_source_info_extraction=True, + ) + return new_packet + + def __init__( + self, + table: pa.Table, + source_info: dict[str, str | None] | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + finger_print: str | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + post_hash_callback: Callable[[str, str], None] | None = None, + skip_source_info_extraction: bool = False, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowPacket should only contain a single row, " + "as it represents a single packet." + ) + if source_info is None: + source_info = {} + + if not skip_source_info_extraction: + # normalize the table to ensure it has the expected source_info columns + self._keys, self._arrow_table = process_table_with_source_info( + table, source_info + ) + else: + self._keys: tuple[str, ...] 
= tuple( + [c for c in table.column_names if not c.startswith("_source_info_")] + ) + for k in self._keys: + if f"_source_info_{k}" not in table.column_names: + raise ValueError( + f"Source info column '_source_info_{k}' is missing in the table." + ) + self._arrow_table = table + + self._finger_print = finger_print + self._post_hash_callback = post_hash_callback + + if semantic_converter is not None: + check_arrow_schema_compatibility( + semantic_converter.arrow_schema, self._arrow_table.schema + ) + else: + semantic_converter = SemanticConverter.from_arrow_schema( + self._arrow_table.schema, semantic_type_registry or default_registry + ) + self.semantic_converter = semantic_converter + + if arrow_hasher is None: + arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = arrow_hasher + + self._cached_python_packet: PythonStore | None = None + self._cached_content_hash: str | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_source_info: dict[str, str | None] | None = None + + def as_table(self, include_source: bool = False) -> pa.Table: + """Return the Arrow table representation of the packet.""" + base_table = self._arrow_table + if not include_source: + # Select only the keys that are not source info + base_table = base_table.select(self._keys) + return base_table + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + if self._cached_python_packet is None: + self._cached_python_packet = ( + self.semantic_converter.from_arrow_table_to_python_stores( + self._arrow_table + )[0] + ) + if include_source: + return dict(self._cached_python_packet) + + return {k: self._cached_python_packet[k] for k in self._keys} + + def content_hash(self) -> str: + if self._cached_content_hash is None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self._arrow_table, prefix_hasher_id=True + ) + if self._post_hash_callback is not None and self._finger_print is not None: + self._post_hash_callback(self._finger_print, self._cached_content_hash) + return self._cached_content_hash + + @property + def typespec(self) -> schemas.PythonSchema: + return self.semantic_converter.python_schema.copy() + + def keys(self) -> tuple[str, ...]: + """Return the keys of the packet.""" + return tuple(self._keys) + + def source_info(self) -> dict[str, str | None]: + if self._cached_source_info is None: + self._cached_source_info = { + k: self._arrow_table[f"_source_info_{k}"][0].as_py() for k in self._keys + } + return self._cached_source_info.copy() + + def copy(self) -> "ArrowPacket": + """Return a shallow copy of the packet.""" + new_packet = ArrowPacket( + self._arrow_table, + semantic_converter=self.semantic_converter, + finger_print=self._finger_print, + arrow_hasher=self.arrow_hasher, + post_hash_callback=self._post_hash_callback, + skip_source_info_extraction=True, + ) + new_packet._cached_content_hash = self._cached_content_hash + new_packet._cached_source_info = ( + self._cached_source_info.copy() + if self._cached_source_info is not None + else None + ) + new_packet._cached_python_packet = ( + dict(self._cached_python_packet) + if self._cached_python_packet is not None + else None + ) + return new_packet + + def __repr__(self) -> str: + return f"{self.as_dict(include_source=False)}" + + +# a batch is a tuple of a tag and a list of packets +Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py new file mode 100644 index 0000000..0695715 --- /dev/null +++ 
b/src/orcapod/data/kernels.py @@ -0,0 +1,104 @@ +from abc import ABC, abstractmethod +from typing import Any +from orcapod.protocols import data_protocols as dp +import logging +from orcapod.data.streams import KernelStream +from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER +from orcapod.types import TypeSpec + +logger = logging.getLogger(__name__) + + +def get_tracker_manager() -> dp.TrackerManager: ... + + +class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): + """ + Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. + It is the base class for all computations and transformations that can be performed on a collection of streams + (including an empty collection). + A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input + and returns a new stream as output (note that output stream is always singular). + Each "invocation" of the kernel on a collection of streams is assigned a unique ID. + The corresponding invocation information is stored as Invocation object and attached to the output stream + for computational graph tracking. + """ + + def __init__( + self, + label: str | None = None, + skip_tracking: bool = False, + tracker_manager: dp.TrackerManager | None = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self._label = label + self._skip_tracking = skip_tracking + self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + + def __call__( + self, *streams: dp.Stream, label: str | None = None, **kwargs + ) -> dp.Stream: + output_stream = self.forward(*streams, **kwargs) + + kernel_stream: dp.Stream + if output_stream.source is not None: + kernel_stream = KernelStream(output_stream, label=label) + else: + logger.warning( + "Output stream does not have a source. " + "This may lead to unexpected behavior when tracking the kernel invocation." + ) + kernel_stream = KernelStream(source=self, upstreams=streams, label=label) + + # TODO: consider the logic around tracker manager more carefully + if not self._skip_tracking and self._tracker_manager is not None: + # register the invocation to all active trackers + active_trackers = self._tracker_manager.get_active_trackers() + for tracker in active_trackers: + tracker.record(kernel_stream) + + return kernel_stream + + @abstractmethod + def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + + @abstractmethod + def validate_inputs(self, *streams: dp.Stream) -> None: ... + + @abstractmethod + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Trigger the main computation of the kernel on a collection of streams. + This method is called when the kernel is invoked with a collection of streams. + Subclasses should override this method to provide the kernel with its unique behavior + """ + + def __repr__(self): + return self.__class__.__name__ + + def __str__(self): + if self._label is not None: + return f"{self.__class__.__name__}({self._label})" + return self.__class__.__name__ + + def identity_structure(self, *streams: dp.Stream) -> Any: + # Default implementation of identity_structure for the kernel only + # concerns the kernel class and the streams if present. Subclasses of + # Kernels should override this method to provide a more meaningful + # representation of the kernel. 
Note that kernel must provide the notion + # of identity under possibly two distinct contexts: + # 1) identity of the kernel in itself when invoked without any stream + # 2) identity of the specific invocation of the kernel with a collection of streams + # While the latter technically corresponds to the identity of the invocation and not + # the kernel, only kernel can provide meaningful information as to the uniqueness of + # the invocation as only kernel would know if / how the input stream(s) alter the identity + # of the invocation. For example, if the kernel corresponds to an commutative computation + # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the + # equivalence of the two by returning the same identity structure for both invocations. + # This can be achieved, for example, by returning a set over the streams instead of a tuple. + logger.warning( + f"Identity structure not implemented for {self.__class__.__name__}" + ) + return (self.__class__.__name__,) + streams diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py new file mode 100644 index 0000000..15d255a --- /dev/null +++ b/src/orcapod/data/operators.py @@ -0,0 +1,156 @@ +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp +from orcapod.data.streams import ImmutableTableStream +from orcapod.types import TypeSpec +from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs +from abc import abstractmethod +from typing import Any + + +class InputValidationError(Exception): + """ + Exception raised when the inputs are not valid. + This is used to indicate that the inputs do not meet the requirements of the operator. + """ + + +class BinaryOperator(TrackedKernelBase): + """ + Base class for all operators. + """ + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.check_binary_inputs(*streams) + left_stream, right_stream = streams + return self.op_validate_inputs(left_stream, right_stream) + + @abstractmethod + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... + + def check_binary_inputs( + self, *streams: dp.Stream, allow_zero: bool = False + ) -> None: + """ + Check that the inputs to the binary operator are valid. + This method is called before the forward method to ensure that the inputs are valid. + """ + if not (allow_zero and len(streams) == 0) and len(streams) != 2: + raise ValueError("BinaryOperator requires exactly two input streams.") + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for binary operators. + It expects exactly two streams as input. + """ + self.check_binary_inputs(*streams) + left_stream, right_stream = streams + return self.op_forward(left_stream, right_stream) + + def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + self.check_binary_inputs(*streams) + left_stream, right_stream = streams + return self.op_types(left_stream, right_stream) + + def identity_structure(self, *streams: dp.Stream) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. 
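# Illustrative sketch of the commutativity note above, using the Join operator
# defined just below: Join collects its two input streams into a set, so the
# identity structure (and any hash derived from it) is order-insensitive.
# `stream_a` and `stream_b` are assumed placeholders for hashable dp.Stream objects.
join = Join()
assert join.identity_structure(stream_a, stream_b) == join.identity_structure(
    stream_b, stream_a
)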
+ """ + self.check_binary_inputs(*streams, allow_zero=True) + return self.op_identity_structure(*streams) + + @abstractmethod + def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the binary operator. + It takes two streams as input and returns a new stream as output. + """ + ... + + @abstractmethod + def op_types( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes two streams as input and returns a tuple of typespecs. + """ + ... + + @abstractmethod + def op_identity_structure(self, *streams: dp.Stream) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes two streams as input and returns a tuple containing the operator name and a set of streams. + """ + ... + + +class Join(BinaryOperator): + def op_identity_structure(self, *streams: dp.Stream) -> Any: + # Join does not depend on the order of the streams -- convert it onto a set + id_struct = (self.__class__.__name__,) + if len(streams) == 2: + id_struct += (set(streams),) + return id_struct + + def op_forward( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> ImmutableTableStream: + """ + Joins two streams together based on their tags. + The resulting stream will contain all the tags from both streams. + """ + + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + + common_tag_keys = tuple( + intersection_typespecs(left_tag_typespec, right_tag_typespec).keys() + ) + joined_tag_keys = tuple( + union_typespecs(left_tag_typespec, right_tag_typespec).keys() + ) + + # performing a check to ensure that packets are compatible + union_typespecs(left_packet_typespec, right_packet_typespec) + + joined_table = left_stream.as_table().join( + right_stream.as_table(), + keys=common_tag_keys, + join_type="inner", + ) + + return ImmutableTableStream( + joined_table, + tag_columns=tuple(joined_tag_keys), + source=self, + upstreams=(left_stream, right_stream), + ) + + def op_types(self, left_stream, right_stream) -> tuple[TypeSpec, TypeSpec]: + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) + joined_packet_typespec = union_typespecs( + left_packet_typespec, right_packet_typespec + ) + return joined_tag_typespec, joined_packet_typespec + + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + try: + self.op_types(left_stream, right_stream) + except Exception as e: + raise InputValidationError(f"Input streams are not compatible: {e}") + + def __repr__(self) -> str: + return "Join()" diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py new file mode 100644 index 0000000..256c34b --- /dev/null +++ b/src/orcapod/data/pods.py @@ -0,0 +1,340 @@ +from orcapod.data.datagrams import PythonDictPacket +from orcapod.data.streams import PodStream +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.types import SemanticTypeRegistry, default_registry +from orcapod.types import typespec_utils as tsutils +from abc 
import abstractmethod + +import logging +import sys +from collections.abc import Callable, Collection, Iterable, Sequence +from typing import Any, Literal, cast + + +from orcapod.types.typespec_utils import ( + extract_function_typespecs, + check_typespec_compatibility, +) +from orcapod.types import TypeSpec + +from orcapod.hashing.legacy_core import get_function_signature +from orcapod.data.operators import Join + + +logger = logging.getLogger(__name__) + +error_handling_options = Literal["raise", "ignore", "warn"] + + +class PodBase(TrackedKernelBase): + """ + FunctionPod is a specialized kernel that encapsulates a function to be executed on data streams. + It allows for the execution of a function with a specific label and can be tracked by the system. + """ + + def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Return the input and output typespecs for the pod. + This is used to validate the input and output streams. + """ + input_stream = self.process_and_verify_streams(*streams) + tag_typespec, _ = input_stream.types() + return tag_typespec, self.output_typespec + + @property + @abstractmethod + def input_typespec(self) -> TypeSpec: + """ + Return the input typespec for the pod. This is used to validate the input streams. + """ + ... + + @property + @abstractmethod + def output_typespec(self) -> TypeSpec: + """ + Return the output typespec for the pod. This is used to validate the output streams. + """ + ... + + @abstractmethod + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[dp.Tag, dp.Packet | None]: ... + + def __init__( + self, + error_handling: error_handling_options = "raise", + label: str | None = None, + **kwargs, + ) -> None: + super().__init__(label=label, **kwargs) + self._active = True + self.error_handling = error_handling + + def is_active(self) -> bool: + """ + Check if the pod is active. If not, it will not process any packets. + """ + return self._active + + def set_active(self, active: bool) -> None: + """ + Set the active state of the pod. If set to False, the pod will not process any packets. + """ + self._active = active + + def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: + """ + Prepare the incoming streams for execution in the pod. This default implementation + joins all the input streams together. 
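# Rough sketch of the default stream preparation described above, assuming `pod`
# is a concrete PodBase subclass and `stream_a`/`stream_b` are dp.Stream inputs:
# multiple streams are reduced pairwise with Join() before type checking.
prepared = pod.process_and_verify_streams(stream_a, stream_b)
# ...which is roughly equivalent to:
joined = Join()(stream_a, stream_b)
# followed by a compatibility check of the joined packet types against
# pod.input_typespec (a ValueError is raised on mismatch).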
+ """ + # if multiple streams are provided, join them + # otherwise, return as is + combined_streams = list(streams) + if len(streams) > 1: + stream = streams[0] + for next_stream in streams[1:]: + stream = Join()(stream, next_stream) + combined_streams = [stream] + input_stream = combined_streams[0] + _, input_typespec = input_stream.types() + if not tsutils.check_typespec_compatibility( + input_typespec, self.input_typespec + ): + raise ValueError( + f"Input typespec {input_typespec} is not compatible with expected input typespec {self.input_typespec}" + ) + return input_stream + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.process_and_verify_streams(*streams) + + def forward(self, *streams: dp.Stream) -> PodStream: + input_stream = self.process_and_verify_streams(*streams) + # at this point, streams should have been joined into one + + return PodStream( + self, + input_stream, + error_handling=cast(error_handling_options, self.error_handling), + ) + + +def function_pod( + output_keys: str | Collection[str] | None = None, + function_name: str | None = None, + label: str | None = None, + **kwargs, +) -> Callable[..., "FunctionPod"]: + """ + Decorator that wraps a function in a FunctionPod instance. + + Args: + output_keys: Keys for the function output(s) + function_name: Name of the function pod; if None, defaults to the function name + **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details. + + Returns: + FunctionPod instance wrapping the decorated function + """ + + def decorator(func) -> FunctionPod: + if func.__name__ == "": + raise ValueError("Lambda functions cannot be used with function_pod") + + if not hasattr(func, "__module__") or func.__module__ is None: + raise ValueError( + f"Function {func.__name__} must be defined at module level" + ) + + # Store the original function in the module for pickling purposes + # and make sure to change the name of the function + module = sys.modules[func.__module__] + base_function_name = func.__name__ + new_function_name = f"_original_{func.__name__}" + setattr(module, new_function_name, func) + # rename the function to be consistent and make it pickleable + setattr(func, "__name__", new_function_name) + setattr(func, "__qualname__", new_function_name) + + # Create a simple typed function pod + pod = FunctionPod( + function=func, + output_keys=output_keys, + function_name=function_name or base_function_name, + label=label, + **kwargs, + ) + return pod + + return decorator + + +class FunctionPod(PodBase): + def __init__( + self, + function: dp.PodFunction, + output_keys: str | Collection[str] | None = None, + function_name=None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | Sequence[type] | None = None, + label: str | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + function_info_extractor: hp.FunctionInfoExtractor | None = None, + **kwargs, + ) -> None: + self.function = function + if output_keys is None: + output_keys = [] + if isinstance(output_keys, str): + output_keys = [output_keys] + self.output_keys = output_keys + if function_name is None: + if hasattr(self.function, "__name__"): + function_name = getattr(self.function, "__name__") + else: + raise ValueError( + "function_name must be provided if function has no __name__ attribute" + ) + self.function_name = function_name + super().__init__(label=label or self.function_name, **kwargs) + + if semantic_type_registry is None: + # TODO: 
reconsider the use of default registry here + semantic_type_registry = default_registry + + self.semantic_type_registry = semantic_type_registry + self.function_info_extractor = function_info_extractor + + # extract input and output types from the function signature + self._input_typespec, self._output_typespec = extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_typespec, + output_typespec=output_typespec, + ) + + @property + def input_typespec(self) -> TypeSpec: + """ + Return the input typespec for the function pod. + This is used to validate the input streams. + """ + return self._input_typespec + + @property + def output_typespec(self) -> TypeSpec: + """ + Return the output typespec for the function pod. + This is used to validate the output streams. + """ + return self._output_typespec + + def __repr__(self) -> str: + return f"FunctionPod:{self.function!r}" + + def __str__(self) -> str: + include_module = self.function.__module__ != "__main__" + func_sig = get_function_signature( + self.function, + name_override=self.function_name, + include_module=include_module, + ) + return f"FunctionPod:{func_sig}" + + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[dp.Tag, PythonDictPacket | None]: + if not self.is_active(): + logger.info( + f"Pod is not active: skipping computation on input packet {packet}" + ) + return tag, None + output_values = [] + + values = self.function(**packet.as_dict(include_source=False)) + + if len(self.output_keys) == 0: + output_values = [] + elif len(self.output_keys) == 1: + output_values = [values] # type: ignore + elif isinstance(values, Iterable): + output_values = list(values) # type: ignore + elif len(self.output_keys) > 1: + raise ValueError( + "Values returned by function must be a pathlike or a sequence of pathlikes" + ) + + if len(output_values) != len(self.output_keys): + raise ValueError( + f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" + ) + + # TODO: add source info based on this function call + output_packet = PythonDictPacket( + {k: v for k, v in zip(self.output_keys, output_values)} + ) + return tag, output_packet + + def identity_structure(self, *streams: dp.Stream) -> Any: + # construct identity structure for the function + # if function_info_extractor is available, use that but substitute the function_name + if self.function_info_extractor is not None: + function_info = self.function_info_extractor.extract_function_info( + self.function, + function_name=self.function_name, + input_typespec=self.input_typespec, + output_typespec=self.output_typespec, + ) + else: + # use basic information only + function_info = { + "name": self.function_name, + "input_typespec": self.input_typespec, + "output_typespec": self.output_typespec, + } + function_info["output_keys"] = tuple(self.output_keys) + + return ( + self.__class__.__name__, + function_info, + ) + streams + + +class StoredPod(PodBase): + def __init__(self, pod: dp.Pod, label: str | None = None, **kwargs) -> None: + super().__init__(**kwargs) + self.pod = pod + + def computed_label(self) -> str | None: + return self.pod.label + + @property + def input_typespec(self) -> TypeSpec: + """ + Return the input typespec for the stored pod. + This is used to validate the input streams. + """ + return self.pod.input_typespec + + @property + def output_typespec(self) -> TypeSpec: + """ + Return the output typespec for the stored pod. 
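# Hypothetical usage of the @function_pod decorator defined above; the function
# name `add_one` and the output key "result" are illustrative assumptions. Any
# module-level, non-lambda function with type hints should behave the same way.
@function_pod(output_keys="result")
def add_one(x: int) -> int:
    return x + 1

# `add_one` is now a FunctionPod whose input/output typespecs are inferred from
# the annotations; invoking it on a stream yields a tracked output stream whose
# packets carry the key "result".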
+ This is used to validate the output streams. + """ + return self.pod.output_typespec + + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: + return self.pod.call(tag, packet) + + def identity_structure(self, *streams: dp.Stream) -> Any: + return self.pod.identity_structure(*streams) + + def __repr__(self) -> str: + return f"StoredPod({self.pod!r})" + + def __str__(self) -> str: + return f"StoredPod:{self.pod!s}" diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py new file mode 100644 index 0000000..9389949 --- /dev/null +++ b/src/orcapod/data/streams.py @@ -0,0 +1,487 @@ +from orcapod.protocols import data_protocols as dp +from orcapod.types import SemanticTypeRegistry, default_registry, schemas, TypeSpec +from orcapod.data.datagrams import ArrowPacket, ArrowTag, SemanticConverter +from orcapod.data.base import LabeledContentIdentifiableBase +import pyarrow as pa +from collections.abc import Iterator, Collection +from abc import ABC, abstractmethod +from datetime import timezone, datetime +from typing import Any, Literal +import logging +import warnings + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class StreamBase(ABC, LabeledContentIdentifiableBase): + """ + A stream is a collection of tagged-packets that are generated by an operation. + The stream is iterable and can be used to access the packets in the stream. + + A stream has property `invocation` that is an instance of Invocation that generated the stream. + This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). + """ + + def __init__( + self, + source: dp.Kernel | None = None, + upstreams: tuple[dp.Stream, ...] = (), + **kwargs, + ) -> None: + super().__init__(**kwargs) + self._source = source + self._upstreams = upstreams + self._last_modified: datetime | None = None + self._update_modified_time() + + @abstractmethod + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... + + @abstractmethod + def types(self) -> tuple[TypeSpec, TypeSpec]: ... + + @abstractmethod + def as_table(self) -> pa.Table: ... + + @abstractmethod + def iter_packets( + self, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... + + def _update_modified_time( + self, timestamp: datetime | None = None, invalidate: bool = False + ) -> None: + if invalidate: + self._last_modified = None + return + + if timestamp is not None: + self._last_modified = timestamp + else: + self._last_modified = datetime.now(timezone.utc) + + @property + def last_modified(self) -> datetime | None: + """ + Returns when the stream's content was last modified. + This is used to track the time when the stream was last accessed. + Returns None if the stream has not been accessed yet. + """ + return self._last_modified + + @property + def is_current(self) -> bool: + """ + Returns whether the stream is current. + A stream is current if the content is up-to-date with respect to its source. + This can be used to determine if a stream with non-None last_modified is up-to-date. + Note that for asynchronous streams, this status is not applicable and always returns False. 
+ """ + if self.last_modified is None: + # If there is no last_modified timestamp, we cannot determine if the stream is current + return False + + for upstream in self.upstreams: + if ( + not upstream.is_current + or upstream.last_modified is None + or upstream.last_modified > self.last_modified + ): + return False + return True + + @property + def source(self) -> dp.Kernel | None: + """ + The source of the stream, which is the kernel that generated the stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._source + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + """ + The upstream streams that are used to generate this stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._upstreams + + def computed_label(self) -> str | None: + if self.source is not None: + # use the invocation operation label + return self.source.label + return None + + def __iter__( + self, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self.iter_packets() + + def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + """ + Flow everything through the stream, returning the entire collection of + (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. + """ + return [e for e in self] + + # --------------------- Recursive methods --------------------------- + # These methods form a step in the multi-class recursive invocation that follows the pattern of + # Stream -> Invocation -> Kernel -> Stream ... -> Invocation -> Kernel + # Most of the method logic would be found in Kernel's implementation of the method with + # Stream and Invocation simply serving as recursive steps + + def identity_structure(self) -> Any: + """ + Identity structure of a stream is deferred to the identity structure + of the associated invocation, if present. + A bare stream without invocation has no well-defined identity structure. + Specialized stream subclasses should override this method to provide more meaningful identity structure + """ + if self.source is not None: + # if the stream is generated by an operation, use the identity structure from the invocation + return self.source.identity_structure(*self.upstreams) + return super().identity_structure() + + +class KernelStream(StreamBase): + """ + Recomputable stream that wraps a streams produced by a kernel to provide + an abstraction over the stream, taking the stream's source and upstreams as the basis of + recomputing the stream. + + This stream is used to represent the output of a kernel invocation. + """ + + def __init__( + self, + output_stream: dp.Stream | None = None, + source: dp.Kernel | None = None, + upstreams: tuple[ + dp.Stream, ... + ] = (), # if provided, this will override the upstreams of the output_stream + **kwargs, + ) -> None: + if (output_stream is None or output_stream.source is None) and source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + if source is None: + if output_stream is None or output_stream.source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." 
+ ) + source = output_stream.source + upstreams = upstreams or output_stream.upstreams + + super().__init__(source=source, upstreams=upstreams, **kwargs) + self._cached_stream = output_stream + + def clear_cache(self) -> None: + """ + Clears the cached stream. + This is useful for re-processing the stream with the same kernel. + """ + self._cached_stream = None + self._update_modified_time(invalidate=True) + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + self.update_stream() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + self.update_stream() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.types() + + @property + def is_current(self) -> bool: + if self._cached_stream is None or not super().is_current: + status = self.update_stream() + if not status: # if it failed to update for whatever reason + return False + return True + + def update_stream(self, force: bool = False) -> bool: + updated = False + if force or (self._cached_stream is not None and not super().is_current): + self.clear_cache() + + if self._cached_stream is None: + assert self.source is not None, ( + "Stream source must be set to recompute the stream." + ) + self._cached_stream = self.source.forward(*self.upstreams) + self._update_modified_time() + updated = True + + if self._cached_stream is None: + # TODO: use beter error type + raise ValueError( + "Stream could not be updated. Ensure that the source is valid and upstreams are correct." + ) + + return updated + + @property + def last_modified(self) -> datetime | None: + if self._cached_stream is None: + return None + return self._cached_stream.last_modified + + def as_table(self) -> pa.Table: + self.update_stream() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.as_table() + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self.update_stream() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + yield from self._cached_stream.iter_packets() + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" + + +class ImmutableTableStream(StreamBase): + """ + An immutable stream based on a PyArrow Table. + This stream is designed to be used with data that is already in a tabular format, + such as data loaded from a file or database. The columns to be treated as tags are + specified at initialization, and the rest of the columns are treated as packets. + The stream is immutable, meaning that once it is created, it cannot be modified. + This is useful for ensuring that the data in the stream remains consistent and unchanging. + + The types of the tag and packet columns are inferred from the PyArrow Table schema. + """ + + def __init__( + self, + table: pa.Table, + tag_columns: Collection[str] = (), + source: dp.Kernel | None = None, + upstreams: tuple[dp.Stream, ...] 
= (), + semantic_type_registry: SemanticTypeRegistry | None = None, + **kwargs, + ) -> None: + super().__init__(source=source, upstreams=upstreams, **kwargs) + + self._table = table + + self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) + self._packet_columns = tuple( + c for c in table.column_names if c not in tag_columns + ) + + semantic_type_registry = semantic_type_registry or default_registry + tag_schema = pa.schema( + f for f in self._table.schema if f.name in self._tag_columns + ) + packet_schema = pa.schema( + f for f in self._table.schema if f.name in self._packet_columns + ) + self._tag_converter = SemanticConverter.from_arrow_schema( + tag_schema, semantic_type_registry + ) + self._packet_converter = SemanticConverter.from_arrow_schema( + packet_schema, semantic_type_registry + ) + + self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None + self._update_modified_time() # set modified time to now + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + return self._tag_columns, self._packet_columns + + def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + # TODO: consider using MappingProxyType to avoid copying the dicts + return ( + self._tag_converter.python_schema.copy(), + self._packet_converter.python_schema.copy(), + ) + + def as_table(self) -> pa.Table: + """ + Returns the underlying table representation of the stream. + This is useful for converting the stream to a table format. + """ + return self._table + + def clear_cache(self) -> None: + """ + Resets the cached elements of the stream. + This is useful for re-iterating over the stream. + """ + self._cached_elements = None + + def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: + """ + Iterates over the packets in the stream. + Each packet is represented as a tuple of (Tag, Packet). 
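# Minimal sketch of constructing and iterating an ImmutableTableStream, assuming
# the datagram classes behave as documented; the column names and values below
# are illustrative only.
import pyarrow as pa

table = pa.table({"sample_id": ["a", "b"], "path": ["/tmp/a.bin", "/tmp/b.bin"]})
stream = ImmutableTableStream(table, tag_columns=("sample_id",))
for tag, packet in stream:
    # tag wraps the "sample_id" column; packet wraps the remaining columns
    print(tag.as_dict(), packet.as_dict())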
+ """ + if self._cached_elements is None: + self._cached_elements = [] + tags = self._table.select(self._tag_columns) + packets = self._table.select(self._packet_columns) + for tag_batch, packet_batch in zip(tags.to_batches(), packets.to_batches()): + for i in range(len(tag_batch)): + self._cached_elements.append( + ( + ArrowTag(tag_batch.slice(i, 1)), + ArrowPacket(packet_batch.slice(i, 1)), + ) + ) + yield from self._cached_elements + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(table={self._table.column_names}, " + f"tag_columns={self._tag_columns})" + ) + + +class PodStream(StreamBase): + def __init__( + self, + pod: dp.Pod, + input_stream: dp.Stream, + error_handling: Literal["raise", "ignore", "warn"] = "raise", + **kwargs, + ) -> None: + super().__init__(upstreams=(input_stream,), **kwargs) + self.pod = pod + self.input_stream = input_stream + self.error_handling = error_handling + self._source = pod + + # Cache for processed packets + # This is a dictionary mapping the index of the packet in the input stream to a tuple of (Tag, Packet) + # This allows us to efficiently access the processed packets without re-processing them + self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet]] = {} + self._computation_complete: bool = False + self._cached_output_table: pa.Table | None = None + + @property + def source(self) -> dp.Pod | None: + """ + The source of the stream, which is the pod that generated the stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._source + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + tag_keys, _ = self.input_stream.keys() + packet_keys = tuple(self.pod.output_typespec.keys()) + return tag_keys, packet_keys + + def types(self) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, _ = self.input_stream.types() + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_typespec) + return tag_typespec, packet_typespec + + def clear_cache(self) -> None: + """ + Clears the cached results of the processed stream. + This is useful for re-processing the stream with the same processor. + """ + self._cached_output_packets = {} + self._computation_complete = False + self._cached_output_table = None + + def validate_cache(self) -> None: + if not self.is_current: + self.clear_cache() + self._update_modified_time(invalidate=True) + + def as_table(self) -> pa.Table: + self.validate_cache() + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + for tag, packet in self.iter_packets(): + # TODO: evaluate handling efficiency here + all_tags.append(tag.as_dict()) + all_packets.append(packet.as_dict()) + all_tags: pa.Table = pa.Table.from_pylist(all_tags) + all_packets: pa.Table = pa.Table.from_pylist(all_packets) + # assert that column names do not overlap + overlapping_columns = set(all_tags.column_names) & set( + all_packets.column_names + ) + if overlapping_columns: + raise ValueError( + f"Column names overlap between tags and packets: {overlapping_columns}. Overlapping tag and packet columns are not supported yet." 
+ ) + self._cached_output_table = pa.Table.from_arrays( + all_tags.columns + all_packets.columns, + names=all_tags.column_names + all_packets.column_names, + ) + + return self._cached_output_table + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self.validate_cache() + if not self._computation_complete or self._cached_output_packets is None: + for i, (tag, packet) in enumerate(self.input_stream.iter_packets()): + if i not in self._cached_output_packets: + try: + processed_tag, processed_packet = self.pod.call(tag, packet) + except Exception as e: + logger.error(f"Error processing packet {packet}: {e}") + if self.error_handling == "raise": + raise e + elif self.error_handling == "warn": + warnings.warn(f"Error processing packet {packet}: {e}") + continue + elif self.error_handling == "ignore": + continue + else: + raise ValueError( + f"Unknown error handling mode: {self.error_handling} encountered while handling error:" + ) from e + if processed_packet is None: + # call returning None means the packet should be skipped + logger.debug( + f"Packet {packet} with tag {tag} was processed but returned None, skipping." + ) + continue + self._cached_output_packets[i] = (processed_tag, processed_packet) + yield processed_tag, processed_packet + self._computation_complete = True + self._update_modified_time() + + else: + for i in range(len(self._cached_output_packets)): + yield self._cached_output_packets[i] diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py new file mode 100644 index 0000000..fab481a --- /dev/null +++ b/src/orcapod/data/trackers.py @@ -0,0 +1,150 @@ +from orcapod.protocols import data_protocols as dp +from collections import defaultdict +from abc import ABC, abstractmethod + + +class BasicTrackerManager: + def __init__(self) -> None: + self._active_trackers: list[dp.Tracker] = [] + + def register_tracker(self, tracker: dp.Tracker) -> None: + """ + Register a new tracker in the system. + This is used to add a new tracker to the list of active trackers. + """ + if tracker not in self._active_trackers: + self._active_trackers.append(tracker) + + def deregister_tracker(self, tracker: dp.Tracker) -> None: + """ + Remove a tracker from the system. + This is used to deactivate a tracker and remove it from the list of active trackers. + """ + if tracker in self._active_trackers: + self._active_trackers.remove(tracker) + + def get_active_trackers(self) -> list[dp.Tracker]: + """ + Get the list of active trackers. + This is used to retrieve the currently active trackers in the system. + """ + return [t for t in self._active_trackers if t.is_active()] + + def record(self, stream: dp.Stream) -> None: + """ + Record the output stream of a kernel invocation in the tracker. + This is used to track the computational graph and the invocations of kernels. + """ + for tracker in self.get_active_trackers(): + tracker.record(stream) + + +class AutoRegisteringContextBasedTracker(ABC): + def __init__(self, tracker_manager: dp.TrackerManager | None = None) -> None: + self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + self._active = False + + def set_active(self, active: bool = True) -> None: + if active: + self._tracker_manager.register_tracker(self) + else: + self._tracker_manager.deregister_tracker(self) + self._active = active + + def is_active(self) -> bool: + return self._active + + @abstractmethod + def record(self, stream: dp.Stream) -> None: ... 
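# Sketch of the context-manager workflow enabled by set_active/__enter__/__exit__
# below: while active, the tracker is registered with the shared tracker manager,
# so every kernel invocation records its output stream. GraphTracker and Join are
# defined elsewhere in this package; `stream_a`/`stream_b` are assumed inputs.
with GraphTracker() as tracker:
    joined = Join()(stream_a, stream_b)

graph = tracker.generate_graph()  # networkx DiGraph of streams and their upstreams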
+ + def __enter__(self): + self.set_active(True) + return self + + def __exit__(self, exc_type, exc_val, ext_tb): + self.set_active(False) + + +class GraphTracker(AutoRegisteringContextBasedTracker): + """ + A tracker that records the invocations of operations and generates a graph + of the invocations and their dependencies. + """ + + # Thread-local storage to track active trackers + + def __init__(self, tracker_manager: dp.TrackerManager | None = None) -> None: + super().__init__(tracker_manager=tracker_manager) + self.kernel_to_invoked_stream_lut: dict[dp.Kernel, list[dp.Stream]] = ( + defaultdict(list) + ) + + def record(self, stream: dp.Stream) -> None: + assert stream.source is not None, ( + "Stream must have a source kernel when recording." + ) + stream_list = self.kernel_to_invoked_stream_lut[stream.source] + if stream not in stream_list: + stream_list.append(stream) + + def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: + """ + Reset the tracker and return the recorded invocations. + """ + recorded_streams = self.kernel_to_invoked_stream_lut + self.kernel_to_invoked_stream_lut = defaultdict(list) + return recorded_streams + + def generate_graph(self): + import networkx as nx + + G = nx.DiGraph() + + # Add edges for each invocation + for _, streams in self.kernel_to_invoked_stream_lut.items(): + for stream in streams: + if stream not in G: + G.add_node(stream) + for upstream in stream.upstreams: + G.add_edge(upstream, stream) + return G + + # def generate_namemap(self) -> dict[Invocation, str]: + # namemap = {} + # for kernel, invocations in self.invocation_lut.items(): + # # if only one entry present, use the kernel name alone + # if kernel.label is not None: + # node_label = kernel.label + # else: + # node_label = str(kernel) + # if len(invocations) == 1: + # namemap[invocations[0]] = node_label + # continue + # # if multiple entries, use the kernel name and index + # for idx, invocation in enumerate(invocations): + # namemap[invocation] = f"{node_label}_{idx}" + # return namemap + + # def draw_graph(self): + # import networkx as nx + # import matplotlib.pyplot as plt + + # G = self.generate_graph() + # labels = self.generate_namemap() + + # pos = nx.drawing.nx_agraph.graphviz_layout(G, prog="dot") + # nx.draw( + # G, + # pos, + # labels=labels, + # node_size=2000, + # node_color="lightblue", + # with_labels=True, + # font_size=10, + # font_weight="bold", + # arrowsize=20, + # ) + # plt.tight_layout() + + +DEFAULT_TRACKER_MANAGER = BasicTrackerManager() From cac1855e139a2840b0a9ee5db31ff8b93d2ae297 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 11 Jul 2025 01:42:18 +0000 Subject: [PATCH 064/224] refactor: cleanup protocols --- src/orcapod/data/datagrams.py | 16 +- src/orcapod/data/kernels.py | 5 +- src/orcapod/data/operators.py | 10 +- src/orcapod/data/pods.py | 56 +- src/orcapod/data/streams.py | 135 ++-- src/orcapod/protocols/data_protocols.py | 803 +++++++++++++++++++++--- 6 files changed, 822 insertions(+), 203 deletions(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index c21c46b..717b928 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -199,8 +199,7 @@ def as_dict(self) -> dict[str, DataValue]: def as_table(self) -> pa.Table: return pa.Table.from_pylist([self]) - @property - def typespec(self) -> schemas.PythonSchema: + def types(self) -> schemas.PythonSchema: # TODO: provide correct implementation return schemas.PythonSchema({k: str for k in self.keys()}) @@ -219,8 +218,7 @@ def __init__(self, table: pa.Table) -> None: def keys(self) -> tuple[str, ...]: return tuple(self.table.column_names) - @property - def typespec(self) -> schemas.PythonSchema: + def types(self) -> schemas.PythonSchema: if self._cached_python_schema is None: self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema( self.table.schema @@ -262,7 +260,7 @@ def create_from( new_packet = PythonDictPacket( object.as_dict(include_source=False), object.source_info(), - dict(object.typespec), + dict(object.types()), finger_print=finger_print, semantic_converter=semantic_converter, semantic_type_registry=semantic_type_registry, @@ -359,8 +357,9 @@ def content_hash(self) -> str: self._post_hash_callback(self._finger_print, self._cached_content_hash) return self._cached_content_hash - @property - def typespec(self) -> schemas.PythonSchema: + # use keys() implementation from dict + + def types(self) -> schemas.PythonSchema: return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: @@ -562,8 +561,7 @@ def content_hash(self) -> str: self._post_hash_callback(self._finger_print, self._cached_content_hash) return self._cached_content_hash - @property - def typespec(self) -> schemas.PythonSchema: + def types(self) -> schemas.PythonSchema: return self.semantic_converter.python_schema.copy() def keys(self) -> tuple[str, ...]: diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 0695715..acccf4e 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -39,10 +39,9 @@ def __init__( def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs - ) -> dp.Stream: + ) -> dp.LiveStream: output_stream = self.forward(*streams, **kwargs) - kernel_stream: dp.Stream if output_stream.source is not None: kernel_stream = KernelStream(output_stream, label=label) else: @@ -62,7 +61,7 @@ def __call__( return kernel_stream @abstractmethod - def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... @abstractmethod def validate_inputs(self, *streams: dp.Stream) -> None: ... 
diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 15d255a..3db4949 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -53,10 +53,10 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: left_stream, right_stream = streams return self.op_forward(left_stream, right_stream) - def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: self.check_binary_inputs(*streams) left_stream, right_stream = streams - return self.op_types(left_stream, right_stream) + return self.op_output_types(left_stream, right_stream) def identity_structure(self, *streams: dp.Stream) -> Any: """ @@ -75,7 +75,7 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre ... @abstractmethod - def op_types( + def op_output_types( self, left_stream: dp.Stream, right_stream: dp.Stream ) -> tuple[TypeSpec, TypeSpec]: """ @@ -135,7 +135,7 @@ def op_forward( upstreams=(left_stream, right_stream), ) - def op_types(self, left_stream, right_stream) -> tuple[TypeSpec, TypeSpec]: + def op_output_types(self, left_stream, right_stream) -> tuple[TypeSpec, TypeSpec]: left_tag_typespec, left_packet_typespec = left_stream.types() right_tag_typespec, right_packet_typespec = right_stream.types() joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) @@ -148,7 +148,7 @@ def op_validate_inputs( self, left_stream: dp.Stream, right_stream: dp.Stream ) -> None: try: - self.op_types(left_stream, right_stream) + self.op_output_types(left_stream, right_stream) except Exception as e: raise InputValidationError(f"Input streams are not compatible: {e}") diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 256c34b..7e2ce48 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -33,26 +33,24 @@ class PodBase(TrackedKernelBase): It allows for the execution of a function with a specific label and can be tracked by the system. """ - def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ Return the input and output typespecs for the pod. This is used to validate the input and output streams. """ input_stream = self.process_and_verify_streams(*streams) tag_typespec, _ = input_stream.types() - return tag_typespec, self.output_typespec + return tag_typespec, self.output_packet_types() - @property @abstractmethod - def input_typespec(self) -> TypeSpec: + def input_packet_types(self) -> TypeSpec: """ Return the input typespec for the pod. This is used to validate the input streams. """ ... - @property @abstractmethod - def output_typespec(self) -> TypeSpec: + def output_packet_types(self) -> TypeSpec: """ Return the output typespec for the pod. This is used to validate the output streams. 
""" @@ -99,12 +97,12 @@ def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: stream = Join()(stream, next_stream) combined_streams = [stream] input_stream = combined_streams[0] - _, input_typespec = input_stream.types() + _, incoming_packet_types = input_stream.types() if not tsutils.check_typespec_compatibility( - input_typespec, self.input_typespec + incoming_packet_types, self.input_packet_types() ): raise ValueError( - f"Input typespec {input_typespec} is not compatible with expected input typespec {self.input_typespec}" + f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" ) return input_stream @@ -209,28 +207,28 @@ def __init__( self.function_info_extractor = function_info_extractor # extract input and output types from the function signature - self._input_typespec, self._output_typespec = extract_function_typespecs( - self.function, - self.output_keys, - input_typespec=input_typespec, - output_typespec=output_typespec, + self._input_packet_types, self._output_packet_types = ( + extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_typespec, + output_typespec=output_typespec, + ) ) - @property - def input_typespec(self) -> TypeSpec: + def input_packet_types(self) -> TypeSpec: """ Return the input typespec for the function pod. This is used to validate the input streams. """ - return self._input_typespec + return self._input_packet_types - @property - def output_typespec(self) -> TypeSpec: + def output_packet_types(self) -> TypeSpec: """ Return the output typespec for the function pod. This is used to validate the output streams. """ - return self._output_typespec + return self._output_packet_types def __repr__(self) -> str: return f"FunctionPod:{self.function!r}" @@ -285,15 +283,15 @@ def identity_structure(self, *streams: dp.Stream) -> Any: function_info = self.function_info_extractor.extract_function_info( self.function, function_name=self.function_name, - input_typespec=self.input_typespec, - output_typespec=self.output_typespec, + input_typespec=self.input_packet_types(), + output_typespec=self.output_packet_types(), ) else: # use basic information only function_info = { "name": self.function_name, - "input_typespec": self.input_typespec, - "output_typespec": self.output_typespec, + "input_packet_types": self.input_packet_types, + "output_packet_types": self.output_packet_types, } function_info["output_keys"] = tuple(self.output_keys) @@ -311,21 +309,19 @@ def __init__(self, pod: dp.Pod, label: str | None = None, **kwargs) -> None: def computed_label(self) -> str | None: return self.pod.label - @property - def input_typespec(self) -> TypeSpec: + def input_packet_types(self) -> TypeSpec: """ Return the input typespec for the stored pod. This is used to validate the input streams. """ - return self.pod.input_typespec + return self.pod.input_packet_types() - @property - def output_typespec(self) -> TypeSpec: + def output_packet_types(self) -> TypeSpec: """ Return the output typespec for the stored pod. This is used to validate the output streams. 
""" - return self.pod.output_typespec + return self.pod.output_packet_types() def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: return self.pod.call(tag, packet) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 9389949..2454f85 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -35,33 +35,35 @@ def __init__( self._source = source self._upstreams = upstreams self._last_modified: datetime | None = None - self._update_modified_time() + self._set_modified_time() - @abstractmethod - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... + @property + def source(self) -> dp.Kernel | None: + """ + The source of the stream, which is the kernel that generated the stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._source - @abstractmethod - def types(self) -> tuple[TypeSpec, TypeSpec]: ... + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + """ + The upstream streams that are used to generate this stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._upstreams - @abstractmethod - def as_table(self) -> pa.Table: ... + def computed_label(self) -> str | None: + if self.source is not None: + # use the invocation operation label + return self.source.label + return None @abstractmethod - def iter_packets( - self, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... - - def _update_modified_time( - self, timestamp: datetime | None = None, invalidate: bool = False - ) -> None: - if invalidate: - self._last_modified = None - return + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... - if timestamp is not None: - self._last_modified = timestamp - else: - self._last_modified = datetime.now(timezone.utc) + @abstractmethod + def types(self) -> tuple[TypeSpec, TypeSpec]: ... @property def last_modified(self) -> datetime | None: @@ -93,33 +95,31 @@ def is_current(self) -> bool: return False return True - @property - def source(self) -> dp.Kernel | None: - """ - The source of the stream, which is the kernel that generated the stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._source - - @property - def upstreams(self) -> tuple[dp.Stream, ...]: - """ - The upstream streams that are used to generate this stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._upstreams + def _set_modified_time( + self, timestamp: datetime | None = None, invalidate: bool = False + ) -> None: + if invalidate: + self._last_modified = None + return - def computed_label(self) -> str | None: - if self.source is not None: - # use the invocation operation label - return self.source.label - return None + if timestamp is not None: + self._last_modified = timestamp + else: + self._last_modified = datetime.now(timezone.utc) def __iter__( self, ) -> Iterator[tuple[dp.Tag, dp.Packet]]: return self.iter_packets() + @abstractmethod + def iter_packets( + self, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... + + @abstractmethod + def as_table(self) -> pa.Table: ... + def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ Flow everything through the stream, returning the entire collection of @@ -185,14 +185,14 @@ def clear_cache(self) -> None: This is useful for re-processing the stream with the same kernel. 
""" self._cached_stream = None - self._update_modified_time(invalidate=True) + self._set_modified_time(invalidate=True) def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "_cached_stream should not be None here." ) @@ -203,7 +203,7 @@ def types(self) -> tuple[TypeSpec, TypeSpec]: Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. """ - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "_cached_stream should not be None here." ) @@ -212,12 +212,12 @@ def types(self) -> tuple[TypeSpec, TypeSpec]: @property def is_current(self) -> bool: if self._cached_stream is None or not super().is_current: - status = self.update_stream() + status = self.refresh() if not status: # if it failed to update for whatever reason return False return True - def update_stream(self, force: bool = False) -> bool: + def refresh(self, force: bool = False) -> bool: updated = False if force or (self._cached_stream is not None and not super().is_current): self.clear_cache() @@ -227,7 +227,7 @@ def update_stream(self, force: bool = False) -> bool: "Stream source must be set to recompute the stream." ) self._cached_stream = self.source.forward(*self.upstreams) - self._update_modified_time() + self._set_modified_time() updated = True if self._cached_stream is None: @@ -238,6 +238,14 @@ def update_stream(self, force: bool = False) -> bool: return updated + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) + @property def last_modified(self) -> datetime | None: if self._cached_stream is None: @@ -245,14 +253,14 @@ def last_modified(self) -> datetime | None: return self._cached_stream.last_modified def as_table(self) -> pa.Table: - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) return self._cached_stream.as_table() def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) @@ -307,7 +315,7 @@ def __init__( ) self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None - self._update_modified_time() # set modified time to now + self._set_modified_time() # set modified time to now def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ @@ -402,13 +410,13 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: This is useful for accessing the columns in the stream. 
""" tag_keys, _ = self.input_stream.keys() - packet_keys = tuple(self.pod.output_typespec.keys()) + packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys def types(self) -> tuple[TypeSpec, TypeSpec]: tag_typespec, _ = self.input_stream.types() # TODO: check if copying can be avoided - packet_typespec = dict(self.pod.output_typespec) + packet_typespec = dict(self.pod.output_packet_types()) return tag_typespec, packet_typespec def clear_cache(self) -> None: @@ -420,13 +428,22 @@ def clear_cache(self) -> None: self._computation_complete = False self._cached_output_table = None - def validate_cache(self) -> None: - if not self.is_current: - self.clear_cache() - self._update_modified_time(invalidate=True) + def refresh(self, force: bool = False) -> bool: + if not self.is_current or force: + self.invalidate() + return True + return False + + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) def as_table(self) -> pa.Table: - self.validate_cache() + self.refresh() if self._cached_output_table is None: all_tags = [] all_packets = [] @@ -452,7 +469,7 @@ def as_table(self) -> pa.Table: return self._cached_output_table def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.validate_cache() + self.refresh() if not self._computation_complete or self._cached_output_packets is None: for i, (tag, packet) in enumerate(self.input_stream.iter_packets()): if i not in self._cached_output_packets: @@ -480,7 +497,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self._cached_output_packets[i] = (processed_tag, processed_packet) yield processed_tag, processed_packet self._computation_complete = True - self._update_modified_time() + self._set_modified_time() else: for i in range(len(self._cached_output_packets)): diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 677aab6..a997302 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -7,37 +7,163 @@ class Datagram(Protocol): - @property - def typespec(self) -> TypeSpec: ... + """ + Base protocol for all data containers in Orcapod. - def keys(self) -> Collection[str]: ... + Datagrams are the fundamental units of data that flow through the system. + They provide a unified interface for data access and conversion, ensuring + consistent behavior across different data types and sources. + + TypeSpec is a dict[str, type] mapping field names to their Python types, + enabling type checking and validation throughout the computational graph. + """ - def as_table(self) -> pa.Table: ... + def types(self) -> TypeSpec: + """ + Return the type specification for this datagram. - def as_dict(self) -> dict[str, DataValue]: ... + The TypeSpec maps field names to their Python types, enabling + type checking and validation throughout the system. + Returns: + TypeSpec: Dictionary mapping field names to Python types + """ + ... + + def keys(self) -> Collection[str]: + """ + Return the available keys/fields in this datagram. + + This provides a way to inspect the structure of the datagram + without accessing the actual data values. + + Returns: + Collection[str]: Available field names + """ + ... -class Tag(Datagram, Protocol): ... + def as_table(self) -> pa.Table: + """ + Convert to PyArrow Table format. 
+ + Provides a standardized way to convert datagram content to + a columnar format suitable for analysis and processing. + + Returns: + pa.Table: PyArrow table representation + """ + ... + + def as_dict(self) -> dict[str, DataValue]: + """ + Convert to dictionary format. + + Provides a simple key-value representation of the datagram + content, useful for debugging and simple data access. + + Returns: + dict[str, DataValue]: Dictionary representation + """ + ... + + +class Tag(Datagram, Protocol): + """ + Metadata associated with each data item in a stream. + + Tags carry contextual information about data packets as they flow through + the computational graph. They are immutable and provide metadata that + helps with: + - Data lineage tracking + - Grouping and aggregation operations + - Temporal information (timestamps) + - Source identification + - Processing context + + Common examples include: + - Timestamps indicating when data was created/processed + - Source identifiers showing data origin + - Processing metadata like batch IDs or session information + - Grouping keys for aggregation operations + - Quality indicators or confidence scores + """ + + pass class Packet(Datagram, Protocol): + """ + The actual data payload in a stream. + + Packets represent the core data being processed through the computational + graph. Unlike Tags (which are metadata), Packets contain the actual + information that computations operate on. + + Packets extend Datagram with additional capabilities for: + - Source tracking and lineage + - Content-based hashing for caching + - Metadata inclusion for debugging + + The distinction between Tag and Packet is crucial for understanding + data flow: Tags provide context, Packets provide content. + """ + def as_table(self, include_source: bool = False) -> pa.Table: """ Convert the packet to a PyArrow Table. - If include_source is True, the source information is included in the table. + + Args: + include_source: If True, source information is included in the table + for debugging and lineage tracking + + Returns: + pa.Table: PyArrow table representation of packet data """ ... def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: """ Convert the packet to a dictionary. - If include_source is True, the source information is included in the dictionary. + + Args: + include_source: If True, source information is included in the dictionary + for debugging and lineage tracking + + Returns: + dict[str, DataValue]: Dictionary representation of packet data + """ + ... + + def content_hash(self) -> str: + """ + Return a hash of the packet content for caching/comparison. + + This hash should be deterministic and based only on the packet content, + not on source information or metadata. Used for: + - Caching computation results + - Detecting data changes + - Deduplication operations + + Returns: + str: Deterministic hash of packet content """ ... - def content_hash(self) -> str: ... + def source_info(self) -> dict[str, str | None]: + """ + Return metadata about the packet's source/origin. - def source_info(self) -> dict[str, str | None]: ... + Provides debugging and lineage information about where the packet + originated. May include information like: + - File paths for file-based sources + - Database connection strings + - API endpoints + - Processing pipeline information + + Returns: + dict[str, str | None]: Source metadata for debugging/lineage + """ + ... # def join(self, other: "Packet") -> "Packet": ... 
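For orientation while reading the protocol hunks above: the Datagram/Packet surface is small enough that a toy in-memory implementation fits in a few lines. The sketch below is illustrative only — DictPacket, its constructor, and the SHA-256-over-JSON hashing scheme are assumptions made for this example and are not part of orcapod; only the method names and signatures mirror the protocol.

import hashlib
import json
from typing import Any

import pyarrow as pa


class DictPacket:
    """Illustrative dict-backed packet; not part of orcapod."""

    def __init__(self, data: dict[str, Any], source: dict[str, str | None] | None = None):
        self._data = dict(data)
        self._source = source or {}

    def types(self) -> dict[str, type]:
        # TypeSpec: field name -> Python type, inferred here from the stored values
        return {k: type(v) for k, v in self._data.items()}

    def keys(self):
        return self._data.keys()

    def as_dict(self, include_source: bool = False) -> dict[str, Any]:
        out = dict(self._data)
        if include_source:
            out.update({f"_source_{k}": v for k, v in self._source.items()})
        return out

    def as_table(self, include_source: bool = False) -> pa.Table:
        # single-row table; each field becomes a column
        return pa.table({k: [v] for k, v in self.as_dict(include_source).items()})

    def content_hash(self) -> str:
        # deterministic hash over content only; source info is deliberately excluded
        payload = json.dumps(self._data, sort_keys=True, default=str).encode()
        return hashlib.sha256(payload).hexdigest()

    def source_info(self) -> dict[str, str | None]:
        return dict(self._source)


packet = DictPacket({"path": "data/run_001.csv", "n_rows": 42}, source={"origin": "local"})
print(packet.content_hash()[:12], packet.as_table().num_rows)

Note that content_hash() ignores source_info(), matching the protocol's requirement that the hash depend on packet content only.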
@@ -46,202 +172,685 @@ def source_info(self) -> dict[str, str | None]: ... class PodFunction(Protocol): """ - A function suitable to be used in a FunctionPod. - It takes one or more named arguments, each corresponding to either: - - A path to a file or directory (PathSet) - for backward compatibility - - A simple data value (str, int, float, bool, bytes, Path) - and returns either None, a single value, or a list of values + A function suitable for use in a FunctionPod. + + PodFunctions define the computational logic that operates on individual + packets within a Pod. They represent pure functions that transform + data values without side effects. + + These functions are designed to be: + - Stateless: No dependency on external state + - Deterministic: Same inputs always produce same outputs + - Serializable: Can be cached and distributed + - Type-safe: Clear input/output contracts + + PodFunctions accept named arguments corresponding to packet fields + and return transformed data values. """ - def __call__(self, **kwargs: DataValue) -> None | DataValue: ... + def __call__(self, **kwargs: DataValue) -> None | DataValue: + """ + Execute the pod function with the given arguments. + + The function receives packet data as named arguments and returns + either transformed data or None (for filtering operations). + + Args: + **kwargs: Named arguments mapping packet fields to data values + + Returns: + None: Filter out this packet (don't include in output) + DataValue: Single transformed value + + Raises: + TypeError: If required arguments are missing + ValueError: If argument values are invalid + """ + ... class Labelable(Protocol): """ - A protocol for objects that can have a label. - This is used to provide a human-readable name for the object. + Protocol for objects that can have a human-readable label. + + Labels provide meaningful names for objects in the computational graph, + making debugging, visualization, and monitoring much easier. They serve + as human-friendly identifiers that complement the technical identifiers + used internally. + + Labels are optional but highly recommended for: + - Debugging complex computational graphs + - Visualization and monitoring tools + - Error messages and logging + - User interfaces and dashboards """ @property def label(self) -> str | None: """ - Return the label of the object. - If no label is set, return None. + Return the human-readable label for this object. + + Labels should be descriptive and help users understand the purpose + or role of the object in the computational graph. + + Returns: + str: Human-readable label for this object + None: No label is set (will use default naming) """ ... -class Kernel(ContentIdentifiable, Labelable, Protocol): +class Stream(ContentIdentifiable, Labelable, Protocol): """ - Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. - It is the base class for all computations and transformations that can be performed on a collection of streams - (including an empty collection). - A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input - and returns a new stream as output (note that output stream is always singular). - Each "invocation" of the kernel on a collection of streams is assigned a unique ID. - The corresponding invocation information is stored as Invocation object and attached to the output stream - for computational graph tracking. + Base protocol for all streams in Orcapod. 
+ + Streams represent sequences of (Tag, Packet) pairs flowing through the + computational graph. They are the fundamental data structure connecting + kernels and carrying both data and metadata. + + Streams can be either: + - Static: Immutable snapshots created at a specific point in time + - Live: Dynamic streams that stay current with upstream dependencies + + All streams provide: + - Iteration over (tag, packet) pairs + - Type information and schema access + - Lineage information (source kernel and upstream streams) + - Basic caching and freshness tracking + - Conversion to common formats (tables, dictionaries) """ - def __call__( - self, *streams: "Stream", label: str | None = None, **kwargs - ) -> "Stream": + @property + def source(self) -> "Kernel | None": + """ + The kernel that produced this stream. + + This provides lineage information for tracking data flow through + the computational graph. Root streams (like file sources) may + have no source kernel. + + Returns: + Kernel: The source kernel that created this stream + None: This is a root stream with no source kernel """ - This is the main interface for invoking the kernel and perform any side-effects such as registering the invocation with the computational graph. - This method should be called with a collection of streams, which can be empty, and is expected to trigger - the call to the forward method of the kernel. + ... + + @property + def upstreams(self) -> tuple["Stream", ...]: + """ + Input streams used to produce this stream. + + These are the streams that were provided as input to the source + kernel when this stream was created. Used for dependency tracking + and cache invalidation. + + Returns: + tuple[Stream, ...]: Upstream dependency streams (empty for sources) """ ... - def forward(self, *streams: "Stream") -> "Stream": + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ - Trigger the main computation of the kernel on a collection of streams. - This method is called when the kernel is invoked with a collection of streams. - Subclasses should override this method to provide the kernel with its unique behavior. - The method should return a new stream that represents the output of the kernel, but should not register the invocation - with the computational graph, allowing for the computation to be performed without side effects. + Available keys/fields in the stream content. + + Returns the field names present in both tags and packets. + This provides schema information without requiring type details, + useful for: + - Schema inspection and exploration + - Query planning and optimization + - Field validation and mapping + + Returns: + tuple[tuple[str, ...], tuple[str, ...]]: (tag_keys, packet_keys) """ ... - def types(self, *streams: "Stream") -> tuple[TypeSpec, TypeSpec]: ... + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Type specifications for the stream content. - def validate_inputs(self, *streams: "Stream") -> None: ... + Returns the type schema for both tags and packets in this stream. + This information is used for: + - Type checking and validation + - Schema inference and planning + - Compatibility checking between kernels + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) + """ + ... -class Pod(Kernel, Protocol): @property - def input_typespec(self) -> TypeSpec: ... + def last_modified(self) -> datetime | None: + """ + When the stream's content was last modified. 
+ + This property is crucial for caching decisions and dependency tracking: + - datetime: Content was last modified at this time (cacheable) + - None: Content is never stable, always recompute (some dynamic streams) + + Both static and live streams typically return datetime values, but + live streams update this timestamp whenever their content changes. + + Returns: + datetime: Timestamp of last modification for most streams + None: Stream content is never stable (some special dynamic streams) + """ + ... @property - def output_typespec(self) -> TypeSpec: ... + def is_current(self) -> bool: + """ + Whether the stream is up-to-date with its dependencies. - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + A stream is current if its content reflects the latest state of its + source kernel and upstream streams. This is used for cache validation + and determining when refresh is needed. + + For live streams, this should always return True since they stay + current automatically. For static streams, this indicates whether + the cached content is still valid. + + Returns: + bool: True if stream is up-to-date, False if refresh needed """ - Call the function pod with a single input packet. - This is used to invoke the function pod with a single packet. + ... + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + """ + Iterate over (tag, packet) pairs in the stream. + + This is the primary way to access stream data. The behavior depends + on the stream type: + - Static streams: Return cached/precomputed data + - Live streams: May trigger computation and always reflect current state + + Yields: + tuple[Tag, Packet]: Sequential (tag, packet) pairs """ ... + def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: + """ + Alias for __iter__ for explicit packet iteration. -class Stream(ContentIdentifiable, Labelable, Protocol): + Provides a more explicit method name when the intent is to iterate + over packets specifically, improving code readability. + + Yields: + tuple[Tag, Packet]: Sequential (tag, packet) pairs + """ + ... + + def as_table(self) -> pa.Table: + """ + Convert the entire stream to a PyArrow Table. + + Materializes all (tag, packet) pairs into a single table for + analysis and processing. This operation may be expensive for + large streams or live streams that need computation. + + Tag fields are prefixed with "_tag_" to avoid naming conflicts + with packet fields. + + Returns: + pa.Table: Complete stream data as a PyArrow Table + """ + ... + + +class LiveStream(Stream, Protocol): """ - A stream that is generated by an invocation of a kernel. - This stream is used to represent the output of a kernel invocation. - It is a concrete implementation of the SyncStream that has an associated - invocation that generated the stream. + A stream that automatically stays up-to-date with its upstream dependencies. + + LiveStream extends the base Stream protocol with capabilities for "up-to-date" + data flow and reactive computation. Unlike static streams which represent + snapshots, LiveStreams provide the guarantee that their content always + reflects the current state of their dependencies. + + Key characteristics: + - Automatically refresh the stream if changes in the upstreams are detected + - Track last_modified timestamp when content changes + - Support manual refresh triggering and invalidation + - By design, LiveStream would return True for is_current except when auto-update fails. 
+ + LiveStreams are always returned by Kernel.__call__() methods, ensuring + that normal kernel usage produces live, up-to-date results. + + Caching behavior: + - last_modified updates whenever content changes + - Can be cached based on dependency timestamps + - Invalidation happens automatically when upstreams change + + Use cases: + - Real-time data processing pipelines + - Reactive user interfaces + - Monitoring and alerting systems + - Dynamic dashboard updates + - Any scenario requiring current data """ - @property - def source(self) -> Kernel | None: ... + def refresh(self, force: bool = False) -> bool: + """ + Manually trigger a refresh of this stream's content. - @property - def upstreams(self) -> tuple["Stream", ...]: ... + Forces the stream to check its upstream dependencies and update + its content if necessary. This is useful when: + - You want to ensure the latest data before a critical operation + - You need to force computation at a specific time + - You're debugging data flow issues + - You want to pre-compute results for performance + Args: + force: If True, always refresh even if the stream is current. + If False, only refresh if the stream is not current. - @property - def last_modified(self) -> datetime | None: + Returns: + bool: True if the stream was refreshed, False if it was already current. + Note: LiveStream refreshes automatically on access, so this + method may be a no-op for some implementations. However, it's + always safe to call if you need to control when the cache is refreshed. + """ + ... + + def invalidate(self) -> None: """ - Returns when the stream's content was last modified. + Mark this stream as invalid, forcing a refresh on next access. + + This method is typically called when: + - Upstream dependencies have changed + - The source kernel has been modified + - External data sources have been updated + - Manual cache invalidation is needed + + The stream will automatically refresh its content the next time + it's accessed (via iteration, as_table(), etc.). + + This is more efficient than immediate refresh when you know the + data will be accessed later. + """ + ... + + +class Kernel(ContentIdentifiable, Labelable, Protocol): + """ + The fundamental unit of computation in Orcapod. + + Kernels are the building blocks of computational graphs, transforming + zero, one, or more input streams into a single output stream. They + encapsulate computation logic while providing consistent interfaces + for validation, type checking, and execution. + + Key design principles: + - Immutable: Kernels don't change after creation + - Deterministic: Same inputs always produce same outputs + - Composable: Kernels can be chained and combined + - Trackable: All invocations are recorded for lineage + - Type-safe: Strong typing and validation throughout + + Execution modes: + - __call__(): Full-featured execution with tracking, returns LiveStream + - forward(): Pure computation without side effects, returns Stream + + The distinction between these modes enables both production use (with + full tracking) and testing/debugging (without side effects). + """ + + def __call__( + self, *streams: Stream, label: str | None = None, **kwargs + ) -> LiveStream: + """ + Main interface for kernel invocation with full tracking and guarantees. + + This is the primary way to invoke kernels in production. It provides + a complete execution pipeline: + 1. Validates input streams against kernel requirements + 2. Registers the invocation with the computational graph + 3. 
Calls forward() to perform the actual computation + 4. Ensures the result is a LiveStream that stays current + + The returned LiveStream automatically stays up-to-date with its + upstream dependencies, making it suitable for real-time processing + and reactive applications. + + Args: + *streams: Input streams to process (can be empty for source kernels) + label: Optional label for this invocation (overrides kernel.label) + **kwargs: Additional arguments for kernel configuration Returns: - datetime: Timestamp of last modification (cacheable streams) - None: Content is never stable - always recompute - (async streams, dynamic streams, etc.) + LiveStream: Live stream that stays up-to-date with upstreams + + Raises: + ValidationError: If input streams are invalid for this kernel + TypeMismatchError: If stream types are incompatible + ValueError: If required arguments are missing """ ... - @property - def is_current(self) -> bool: + def forward(self, *streams: Stream) -> Stream: """ - Returns whether the stream is current. - A stream is current if the content is up-to-date with respect to its source. - This can be used to determine if a stream with non-None last_modified is up-to-date. - Note that for asynchronous streams, this status is not applicable and always returns False. + Perform the actual computation without side effects. + + This method contains the core computation logic and should be + overridden by subclasses. It performs pure computation without: + - Registering with the computational graph + - Performing validation (caller's responsibility) + - Guaranteeing result type (may return static or live streams) + + The returned stream must be accurate at the time of invocation but + need not stay up-to-date with upstream changes. This makes forward() + suitable for: + - Testing and debugging + - Batch processing where currency isn't required + - Internal implementation details + + Args: + *streams: Input streams to process + + Returns: + Stream: Result of the computation (may be static or live) """ ... - def as_table(self) -> pa.Table: + def output_types(self, *streams: Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Determine output types without triggering computation. + + This method performs type inference based on input stream types, + enabling efficient type checking and stream property queries. + It should be fast and not trigger any expensive computation. + + Used for: + - Pre-execution type validation + - Query planning and optimization + - Schema inference in complex pipelines + - IDE support and developer tooling + + Args: + *streams: Input streams to analyze + + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output + + Raises: + ValidationError: If input types are incompatible + TypeError: If stream types cannot be processed + """ + ... + + def validate_inputs(self, *streams: Stream) -> None: """ - Convert the stream to a PyArrow Table. - To avoid collision, tags should be prefixed with "_tag_". + Validate input streams, raising exceptions if incompatible. + + This method is called automatically by __call__ before computation + to provide fail-fast behavior. It should check: + - Number of input streams + - Stream types and schemas + - Any kernel-specific requirements + - Business logic constraints + + The goal is to catch errors early, before expensive computation + begins, and provide clear error messages for debugging. 
+ + Args: + *streams: Input streams to validate + + Raises: + ValidationError: If streams are invalid for this kernel + TypeError: If stream types are incompatible + ValueError: If stream content violates business rules """ ... - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: ... - def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: ... +class Pod(Kernel, Protocol): + """ + Specialized kernel for packet-level processing with advanced caching. + + Pods represent a different computational model from regular kernels: + - Process data one packet at a time (enabling fine-grained parallelism) + - Support just-in-time evaluation (computation deferred until needed) + - Provide stricter type contracts (clear input/output schemas) + - Enable advanced caching strategies (packet-level caching) + + The Pod abstraction is ideal for: + - Expensive computations that benefit from caching + - Operations that can be parallelized at the packet level + - Transformations with strict type contracts + - Processing that needs to be deferred until access time + - Functions that operate on individual data items + + Pods use a different execution model where computation is deferred + until results are actually needed, enabling efficient resource usage + and fine-grained caching. + """ + + def input_packet_types(self) -> TypeSpec: + """ + TypeSpec for input packets that this Pod can process. + + Defines the exact schema that input packets must conform to. + Pods are typically much stricter about input types than regular + kernels, requiring precise type matching for their packet-level + processing functions. - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + This specification is used for: + - Runtime type validation + - Compile-time type checking + - Schema inference and documentation + - Input validation and error reporting + + Returns: + TypeSpec: Dictionary mapping field names to required packet types + """ + ... + + def output_packet_types(self) -> TypeSpec: """ - Return the keys of the pipeline property. - This is used to define the keys of the pipeline property. + TypeSpec for output packets that this Pod produces. + + Defines the schema of packets that will be produced by this Pod. + This is typically determined by the Pod's computational function + and is used for: + - Type checking downstream kernels + - Schema inference in complex pipelines + - Query planning and optimization + - Documentation and developer tooling + + Returns: + TypeSpec: Dictionary mapping field names to output packet types """ ... - def types(self) -> tuple[TypeSpec, TypeSpec]: + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: """ - Return the types of the pipeline property. - This is used to define the types of the graph property. + Process a single packet with its associated tag. + + This is the core method that defines the Pod's computational behavior. 
+ It processes one (tag, packet) pair at a time, enabling: + - Fine-grained caching at the packet level + - Parallelization opportunities + - Just-in-time evaluation + - Filtering operations (by returning None) + + The method signature supports: + - Tag transformation (modify metadata) + - Packet transformation (modify content) + - Filtering (return None to exclude packet) + - Pass-through (return inputs unchanged) + + Args: + tag: Metadata associated with the packet + packet: The data payload to process + + Returns: + tuple[Tag, Packet | None]: + - Tag: Output tag (may be modified from input) + - Packet: Processed packet, or None to filter it out + + Raises: + TypeError: If packet doesn't match input_packet_types + ValueError: If packet data is invalid for processing """ ... class Source(Kernel, Stream, Protocol): """ - A source is a special type of kernel that produces a stream of data. - It is the entry point for data into the computational graph. - Sources are typically used to read data from external sources such as files, databases, etc. + Entry point for data into the computational graph. + + Sources are special objects that serve dual roles: + - As Kernels: Can be invoked to produce streams + - As Streams: Directly provide data without upstream dependencies + + Sources represent the roots of computational graphs and typically + interface with external data sources. They bridge the gap between + the outside world and the Orcapod computational model. + + Common source types: + - File readers (CSV, JSON, Parquet, etc.) + - Database connections and queries + - API endpoints and web services + - Generated data sources (synthetic data) + - Manual data input and user interfaces + - Message queues and event streams + + Sources have unique properties: + - No upstream dependencies (upstreams is empty) + - Can be both invoked and iterated + - Serve as the starting point for data lineage + - May have their own refresh/update mechanisms """ + pass + class Tracker(Protocol): + """ + Records kernel invocations and stream creation for computational graph tracking. + + Trackers are responsible for maintaining the computational graph by recording + relationships between kernels, streams, and invocations. They enable: + - Lineage tracking and data provenance + - Caching and memoization strategies + - Debugging and error analysis + - Performance monitoring and optimization + - Reproducibility and auditing + + Multiple trackers can be active simultaneously, each serving different + purposes (e.g., one for caching, another for debugging, another for + monitoring). This allows for flexible and composable tracking strategies. + + Trackers can be selectively activated/deactivated to control overhead + and focus on specific aspects of the computational graph. + """ + def set_active(self, active: bool = True) -> None: """ Set the active state of the tracker. - This is used to activate or deactivate the tracker. - If the tracker is active, it will record the invocations of kernels. + + When active, the tracker will record all kernel invocations and + stream creations. When inactive, no recording occurs, reducing + overhead for performance-critical sections. + + Args: + active: True to activate recording, False to deactivate """ ... def is_active(self) -> bool: """ - Check if the tracker is active. - This is used to determine if the tracker is currently recording invocations. + Check if the tracker is currently recording invocations. 
+ + Returns: + bool: True if tracker is active and recording, False otherwise """ ... def record(self, stream: Stream) -> None: """ - Record the output stream of a kernel invocation in the tracker. - This is used to track the computational graph and the invocations of kernels. + Record a stream in the computational graph. + + This method is called whenever a kernel produces a new stream. + The tracker should record: + - The stream and its properties + - The source kernel that created it + - The upstream streams that were used as input + - Timing and performance information + - Any relevant metadata + + Args: + stream: The stream to record in the computational graph """ ... class TrackerManager(Protocol): + """ + Manages multiple trackers and coordinates their activity. + + The TrackerManager provides a centralized way to: + - Register and manage multiple trackers + - Coordinate recording across all active trackers + - Provide a single interface for graph recording + - Enable dynamic tracker registration/deregistration + + This design allows for: + - Multiple concurrent tracking strategies + - Pluggable tracking implementations + - Easy testing and debugging (mock trackers) + - Performance optimization (selective tracking) + """ + def get_active_trackers(self) -> list[Tracker]: """ - Get the list of active trackers. - This is used to retrieve the currently active trackers in the system. + Get all currently active trackers. + + Returns only trackers that are both registered and active, + providing the list of trackers that will receive recording events. + + Returns: + list[Tracker]: List of trackers that are currently recording """ ... def register_tracker(self, tracker: Tracker) -> None: """ Register a new tracker in the system. - This is used to add a new tracker to the list of active trackers. + + The tracker will be included in future recording operations + if it is active. Registration is separate from activation + to allow for dynamic control of tracking overhead. + + Args: + tracker: The tracker to register """ ... def deregister_tracker(self, tracker: Tracker) -> None: """ - Deregister a tracker from the system. - This is used to remove a tracker from the list of active trackers. + Remove a tracker from the system. + + The tracker will no longer receive recording notifications + even if it is still active. This is useful for: + - Cleaning up temporary trackers + - Removing failed or problematic trackers + - Dynamic tracker management + + Args: + tracker: The tracker to remove """ ... def record(self, stream: Stream) -> None: """ - Record the output stream of a kernel invocation in the tracker. - This is used to track the computational graph and the invocations of kernels. + Record a stream in all active trackers. + + This method broadcasts the stream recording to all currently + active and registered trackers. It provides a single point + of entry for recording events, simplifying kernel implementations. + + Args: + stream: The stream to record in all active trackers """ ... From 5a178b55a4411112dc980f517dbfabdb5a5fef6f Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 11 Jul 2025 02:02:49 +0000 Subject: [PATCH 065/224] refactor: further refinement of tracker protocols --- src/orcapod/data/kernels.py | 29 +++++--------- src/orcapod/data/pods.py | 19 ++++++++++ src/orcapod/data/trackers.py | 25 +++++++++++-- src/orcapod/protocols/data_protocols.py | 50 +++++++++++++++++++++---- 4 files changed, 93 insertions(+), 30 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index acccf4e..831a51e 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -37,28 +37,19 @@ def __init__( self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + def record_kernel_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: + """ + Register the pod with the upstream streams. This is used to track the pod in the system. + """ + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_kernel_invocation(self, upstreams) + def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - output_stream = self.forward(*streams, **kwargs) - - if output_stream.source is not None: - kernel_stream = KernelStream(output_stream, label=label) - else: - logger.warning( - "Output stream does not have a source. " - "This may lead to unexpected behavior when tracking the kernel invocation." - ) - kernel_stream = KernelStream(source=self, upstreams=streams, label=label) - - # TODO: consider the logic around tracker manager more carefully - if not self._skip_tracking and self._tracker_manager is not None: - # register the invocation to all active trackers - active_trackers = self._tracker_manager.get_active_trackers() - for tracker in active_trackers: - tracker.record(kernel_stream) - - return kernel_stream + output_stream = KernelStream(source=self, upstreams=streams, label=label) + self.record_kernel_invocation(streams) + return output_stream @abstractmethod def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 7e2ce48..7e7ba7b 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -109,6 +109,13 @@ def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: def validate_inputs(self, *streams: dp.Stream) -> None: self.process_and_verify_streams(*streams) + def record_pod_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: + """ + Register the pod with the upstream streams. This is used to track the pod in the system. + """ + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_pod_invocation(self, upstreams) + def forward(self, *streams: dp.Stream) -> PodStream: input_stream = self.process_and_verify_streams(*streams) # at this point, streams should have been joined into one @@ -119,6 +126,18 @@ def forward(self, *streams: dp.Stream) -> PodStream: error_handling=cast(error_handling_options, self.error_handling), ) + def __call__( + self, *streams: dp.Stream, label: str | None = None, **kwargs + ) -> PodStream: + """ + Invoke the pod with a collection of streams. This will process the streams and return a PodStream. 
+ """ + output_stream = self.forward(*streams, **kwargs) + + self.record_pod_invocation(output_stream.upstreams) + + return output_stream + def function_pod( output_keys: str | Collection[str] | None = None, diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index fab481a..456711d 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -30,13 +30,25 @@ def get_active_trackers(self) -> list[dp.Tracker]: """ return [t for t in self._active_trackers if t.is_active()] - def record(self, stream: dp.Stream) -> None: + def record_kernel_invocation( + self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + ) -> None: """ Record the output stream of a kernel invocation in the tracker. This is used to track the computational graph and the invocations of kernels. """ for tracker in self.get_active_trackers(): - tracker.record(stream) + tracker.record_kernel_invocation(kernel, upstreams) + + def record_pod_invocation( + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + ) -> None: + """ + Record the output stream of a pod invocation in the tracker. + This is used to track the computational graph and the invocations of pods. + """ + for tracker in self.get_active_trackers(): + tracker.record_pod_invocation(pod, upstreams) class AutoRegisteringContextBasedTracker(ABC): @@ -55,7 +67,14 @@ def is_active(self) -> bool: return self._active @abstractmethod - def record(self, stream: dp.Stream) -> None: ... + def record_kernel_invocation( + self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + ) -> None: ... + + @abstractmethod + def record_pod_invocation( + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + ) -> None: ... def __enter__(self): self.set_active(True) diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index a997302..24b6861 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -767,20 +767,39 @@ def is_active(self) -> bool: """ ... - def record(self, stream: Stream) -> None: + def record_kernel_invocation( + self, kernel: Kernel, upstreams: tuple[Stream, ...] + ) -> None: """ - Record a stream in the computational graph. + Record a kernel invocation in the computational graph. - This method is called whenever a kernel produces a new stream. - The tracker should record: - - The stream and its properties - - The source kernel that created it + This method is called whenever a kernel is invoked. The tracker + should record: + - The kernel and its properties + - The input streams that were used as input + - Timing and performance information + - Any relevant metadata + + Args: + kernel: The kernel that was invoked + upstreams: The input streams used for this invocation + """ + ... + + def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + """ + Record a pod invocation in the computational graph. + + This method is called whenever a pod is invoked. The tracker + should record: + - The pod and its properties - The upstream streams that were used as input - Timing and performance information - Any relevant metadata Args: - stream: The stream to record in the computational graph + pod: The pod that was invoked + upstreams: The input streams used for this invocation """ ... @@ -842,7 +861,9 @@ def deregister_tracker(self, tracker: Tracker) -> None: """ ... - def record(self, stream: Stream) -> None: + def record_kernel_invocation( + self, kernel: Kernel, upstreams: tuple[Stream, ...] 
+ ) -> None: """ Record a stream in all active trackers. @@ -854,3 +875,16 @@ def record(self, stream: Stream) -> None: stream: The stream to record in all active trackers """ ... + + def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + """ + Record a stream in all active trackers. + + This method broadcasts the stream recording to all currently` + active and registered trackers. It provides a single point + of entry for recording events, simplifying kernel implementations. + + Args: + stream: The stream to record in all active trackers + """ + ... From 53527b1078869fc45f84fa412c1a7836266f8c50 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 11 Jul 2025 07:16:03 +0000 Subject: [PATCH 066/224] feat: refine kernel and pod interaction with tracker --- src/orcapod/__init__.py | 9 ++ src/orcapod/data/__init__.py | 1 + src/orcapod/data/kernels.py | 61 +++++++-- src/orcapod/data/pods.py | 162 +++++++++++++----------- src/orcapod/data/trackers.py | 136 ++++++++++++++++++-- src/orcapod/hashing/hash_utils.py | 56 +++++++- src/orcapod/hashing/legacy_core.py | 1 + src/orcapod/protocols/data_protocols.py | 37 +++++- 8 files changed, 362 insertions(+), 101 deletions(-) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 01cd5db..b4de8e1 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,3 +1,12 @@ +from .data import DEFAULT_TRACKER_MANAGER + +no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking + +__all__ = [ + "DEFAULT_TRACKER_MANAGER", + "no_tracking", +] + # from .core import operators, sources, streams # from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator # from . import hashing, stores diff --git a/src/orcapod/data/__init__.py b/src/orcapod/data/__init__.py index e69de29..6d7e206 100644 --- a/src/orcapod/data/__init__.py +++ b/src/orcapod/data/__init__.py @@ -0,0 +1 @@ +from .trackers import DEFAULT_TRACKER_MANAGER diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 831a51e..538cf11 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -27,6 +27,7 @@ class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): def __init__( self, + fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, skip_tracking: bool = False, tracker_manager: dp.TrackerManager | None = None, @@ -36,27 +37,62 @@ def __init__( self._label = label self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + self.fixed_input_streams = fixed_input_streams - def record_kernel_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: + def resolve_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Register the pod with the upstream streams. This is used to track the pod in the system. + Resolve the input streams for the kernel. If the kernel has fixed input streams, + it returns those. Otherwise, it returns the provided streams. + """ + if self.fixed_input_streams is not None: + if len(streams) != 0: + raise ValueError( + f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted." + ) + return self.fixed_input_streams + return streams + + def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + """ + Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing + on the input streams before the main computation. 
This is useful if you need to modify the input streams + or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the + pre-processing step will be tracked separately from the main computation in forward. + By default, it returns the input streams unchanged. + """ + return streams + + @abstractmethod + def validate_inputs(self, *streams: dp.Stream) -> None: ... + + def prepare_output_stream( + self, *streams: dp.Stream, label: str | None = None + ) -> dp.LiveStream: + """ + Prepare the output stream for the kernel invocation. + This method is called after the main computation is performed. + It creates a KernelStream with the provided streams and label. + """ + return KernelStream(source=self, upstreams=streams, label=label) + + def track_invocation(self, *streams: dp.Stream) -> None: + """ + Track the invocation of the kernel with the provided streams. + This is a convenience method that calls record_kernel_invocation. """ if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_kernel_invocation(self, upstreams) + self._tracker_manager.record_kernel_invocation(self, streams) def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - output_stream = KernelStream(source=self, upstreams=streams, label=label) - self.record_kernel_invocation(streams) + streams = self.resolve_input_streams(*streams) + processed_streams = self.pre_processing_step(*streams) + self.validate_inputs(*processed_streams) + output_stream = self.prepare_output_stream(*processed_streams, label=label) + self.track_invocation(*processed_streams) return output_stream - @abstractmethod - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... - - @abstractmethod - def validate_inputs(self, *streams: dp.Stream) -> None: ... - @abstractmethod def forward(self, *streams: dp.Stream) -> dp.Stream: """ @@ -65,6 +101,9 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: Subclasses should override this method to provide the kernel with its unique behavior """ + @abstractmethod + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... 
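Read as a whole, the refactored __call__ is a template method: resolve fixed or supplied inputs, pre-process, validate, build the output KernelStream, then record the invocation; forward() itself is not called here but later, when the KernelStream refreshes. The standalone toy below models that control flow under simplifying assumptions — MiniKernel, Doubler, and the list-based "streams" are invented for the example, and it calls forward() eagerly instead of deferring it through a stream.

from abc import ABC, abstractmethod


class MiniKernel(ABC):
    """Toy model of the __call__ template in TrackedKernelBase (illustrative only)."""

    def __init__(self, fixed_inputs: tuple | None = None, skip_tracking: bool = False):
        self.fixed_inputs = fixed_inputs
        self.skip_tracking = skip_tracking
        self.invocations: list[tuple] = []  # stands in for the tracker manager

    def resolve_inputs(self, *streams) -> tuple:
        # mirrors resolve_input_streams: fixed inputs win, extra streams are rejected
        if self.fixed_inputs is not None:
            if streams:
                raise ValueError("fixed-input kernel does not accept extra streams")
            return self.fixed_inputs
        return streams

    def pre_process(self, *streams) -> tuple:
        return streams  # subclasses may join/normalize here

    @abstractmethod
    def validate(self, *streams) -> None: ...

    @abstractmethod
    def forward(self, *streams): ...

    def __call__(self, *streams):
        streams = self.resolve_inputs(*streams)
        streams = self.pre_process(*streams)
        self.validate(*streams)
        output = self.forward(*streams)       # real code defers this via a KernelStream
        if not self.skip_tracking:
            self.invocations.append(streams)  # side effect: record the invocation
        return output


class Doubler(MiniKernel):
    def validate(self, *streams) -> None:
        if len(streams) != 1:
            raise ValueError("Doubler expects exactly one input stream")

    def forward(self, *streams):
        return [(tag, x * 2) for tag, x in streams[0]]


print(Doubler()([("a", 1), ("b", 2)]))  # [('a', 2), ('b', 4)]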
+ def __repr__(self): return self.__class__.__name__ diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 7e7ba7b..6b1d730 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -1,47 +1,30 @@ -from orcapod.data.datagrams import PythonDictPacket -from orcapod.data.streams import PodStream -from orcapod.data.kernels import TrackedKernelBase -from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types import SemanticTypeRegistry, default_registry -from orcapod.types import typespec_utils as tsutils -from abc import abstractmethod - import logging import sys +from abc import abstractmethod from collections.abc import Callable, Collection, Iterable, Sequence from typing import Any, Literal, cast - -from orcapod.types.typespec_utils import ( - extract_function_typespecs, - check_typespec_compatibility, -) -from orcapod.types import TypeSpec - -from orcapod.hashing.legacy_core import get_function_signature +from orcapod.data.datagrams import PythonDictPacket +from orcapod.data.kernels import TrackedKernelBase from orcapod.data.operators import Join - +from orcapod.data.streams import PodStream +from orcapod.hashing.hash_utils import get_function_signature +from orcapod.protocols import data_protocols as dp +from orcapod.protocols import hashing_protocols as hp +from orcapod.types import SemanticTypeRegistry, TypeSpec, default_registry +from orcapod.types import typespec_utils as tsutils logger = logging.getLogger(__name__) error_handling_options = Literal["raise", "ignore", "warn"] -class PodBase(TrackedKernelBase): +class ActivatablePodBase(TrackedKernelBase): """ FunctionPod is a specialized kernel that encapsulates a function to be executed on data streams. It allows for the execution of a function with a specific label and can be tracked by the system. """ - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - """ - Return the input and output typespecs for the pod. - This is used to validate the input and output streams. - """ - input_stream = self.process_and_verify_streams(*streams) - tag_typespec, _ = input_stream.types() - return tag_typespec, self.output_packet_types() - @abstractmethod def input_packet_types(self) -> TypeSpec: """ @@ -63,14 +46,25 @@ def call( def __init__( self, + fixed_input_streams: tuple[dp.Stream, ...] | None = None, error_handling: error_handling_options = "raise", label: str | None = None, **kwargs, ) -> None: - super().__init__(label=label, **kwargs) + super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self._active = True self.error_handling = error_handling + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Return the input and output typespecs for the pod. + This is used to validate the input and output streams. + """ + input_streams = self.pre_processing_step(*streams) + self.validate_inputs(*input_streams) + tag_typespec, _ = input_streams[0].types() + return tag_typespec, self.output_packet_types() + def is_active(self) -> bool: """ Check if the pod is active. If not, it will not process any packets. 
@@ -83,7 +77,22 @@ def set_active(self, active: bool) -> None: """ self._active = active - def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: + def validate_inputs(self, *streams: dp.Stream) -> None: + if len(streams) != 1: + raise ValueError( + f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" + ) + input_stream = streams[0] + _, incoming_packet_types = input_stream.types() + if not tsutils.check_typespec_compatibility( + incoming_packet_types, self.input_packet_types() + ): + # TODO: use custom exception type for better error handling + raise ValueError( + f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" + ) + + def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Prepare the incoming streams for execution in the pod. This default implementation joins all the input streams together. @@ -96,29 +105,16 @@ def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: for next_stream in streams[1:]: stream = Join()(stream, next_stream) combined_streams = [stream] - input_stream = combined_streams[0] - _, incoming_packet_types = input_stream.types() - if not tsutils.check_typespec_compatibility( - incoming_packet_types, self.input_packet_types() - ): - raise ValueError( - f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" - ) - return input_stream - def validate_inputs(self, *streams: dp.Stream) -> None: - self.process_and_verify_streams(*streams) + return tuple(combined_streams) - def record_pod_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: - """ - Register the pod with the upstream streams. This is used to track the pod in the system. - """ + def track_invocation(self, *streams: dp.Stream) -> None: if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_pod_invocation(self, upstreams) + self._tracker_manager.record_pod_invocation(self, streams) def forward(self, *streams: dp.Stream) -> PodStream: - input_stream = self.process_and_verify_streams(*streams) - # at this point, streams should have been joined into one + assert len(streams) == 1, "PodBase.forward expects exactly one input stream" + input_stream = streams[0] return PodStream( self, @@ -126,18 +122,6 @@ def forward(self, *streams: dp.Stream) -> PodStream: error_handling=cast(error_handling_options, self.error_handling), ) - def __call__( - self, *streams: dp.Stream, label: str | None = None, **kwargs - ) -> PodStream: - """ - Invoke the pod with a collection of streams. This will process the streams and return a PodStream. 
- """ - output_stream = self.forward(*streams, **kwargs) - - self.record_pod_invocation(output_stream.upstreams) - - return output_stream - def function_pod( output_keys: str | Collection[str] | None = None, @@ -189,7 +173,7 @@ def decorator(func) -> FunctionPod: return decorator -class FunctionPod(PodBase): +class FunctionPod(ActivatablePodBase): def __init__( self, function: dp.PodFunction, @@ -227,7 +211,7 @@ def __init__( # extract input and output types from the function signature self._input_packet_types, self._output_packet_types = ( - extract_function_typespecs( + tsutils.extract_function_typespecs( self.function, self.output_keys, input_typespec=input_typespec, @@ -250,7 +234,7 @@ def output_packet_types(self) -> TypeSpec: return self._output_packet_types def __repr__(self) -> str: - return f"FunctionPod:{self.function!r}" + return f"FunctionPod:{self.function_name}" def __str__(self) -> str: include_module = self.function.__module__ != "__main__" @@ -271,7 +255,9 @@ def call( return tag, None output_values = [] - values = self.function(**packet.as_dict(include_source=False)) + # any kernel/pod invocation happening inside the function will NOT be tracked + with self._tracker_manager.no_tracking(): + values = self.function(**packet.as_dict(include_source=False)) if len(self.output_keys) == 0: output_values = [] @@ -297,6 +283,7 @@ def call( def identity_structure(self, *streams: dp.Stream) -> Any: # construct identity structure for the function + # if function_info_extractor is available, use that but substitute the function_name if self.function_info_extractor is not None: function_info = self.function_info_extractor.extract_function_info( @@ -309,20 +296,39 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # use basic information only function_info = { "name": self.function_name, - "input_packet_types": self.input_packet_types, - "output_packet_types": self.output_packet_types, + "input_packet_types": self.input_packet_types(), + "output_packet_types": self.output_packet_types(), } function_info["output_keys"] = tuple(self.output_keys) - return ( + id_struct = ( self.__class__.__name__, function_info, - ) + streams + ) + # if streams are provided, perform pre-processing step, validate, and add the + # resulting single stream to the identity structure + if len(streams) > 0: + processed_streams = self.pre_processing_step(*streams) + self.validate_inputs(*processed_streams) + id_struct += (processed_streams[0],) + return id_struct -class StoredPod(PodBase): - def __init__(self, pod: dp.Pod, label: str | None = None, **kwargs) -> None: - super().__init__(**kwargs) + +class WrappedPod(ActivatablePodBase): + """ + A wrapper for a pod that allows it to be used as a kernel. + This class is meant to serve as a base class for other pods that need to wrap existing pods. + """ + + def __init__( + self, + pod: dp.Pod, + fixed_input_streams: tuple[dp.Stream, ...] | None = None, + label: str | None = None, + **kwargs, + ) -> None: + super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self.pod = pod def computed_label(self) -> str | None: @@ -349,7 +355,19 @@ def identity_structure(self, *streams: dp.Stream) -> Any: return self.pod.identity_structure(*streams) def __repr__(self) -> str: - return f"StoredPod({self.pod!r})" + return f"WrappedPod({self.pod!r})" def __str__(self) -> str: - return f"StoredPod:{self.pod!s}" + return f"WrappedPod:{self.pod!s}" + + +class CachedPod(WrappedPod): + """ + A pod that caches the results of the wrapped pod. 
+ This is useful for pods that are expensive to compute and can benefit from caching. + """ + + def __init__(self, pod: dp.Pod, cache_key: str, **kwargs): + super().__init__(pod, **kwargs) + self.cache_key = cache_key + self.cache: dict[str, dp.Packet] = {} diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 456711d..5ad2a55 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,11 +1,22 @@ -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict from abc import ABC, abstractmethod +from typing import Any, ContextManager, Generator +from contextlib import contextmanager class BasicTrackerManager: def __init__(self) -> None: self._active_trackers: list[dp.Tracker] = [] + self._active = True + + def set_active(self, active: bool = True) -> None: + """ + Set the active state of the tracker manager. + This is used to enable or disable the tracker manager. + """ + self._active = active def register_tracker(self, tracker: dp.Tracker) -> None: """ @@ -28,27 +39,43 @@ def get_active_trackers(self) -> list[dp.Tracker]: Get the list of active trackers. This is used to retrieve the currently active trackers in the system. """ + if not self._active: + return [] + # Filter out inactive trackers + # This is to ensure that we only return trackers that are currently active return [t for t in self._active_trackers if t.is_active()] def record_kernel_invocation( - self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, ) -> None: """ Record the output stream of a kernel invocation in the tracker. This is used to track the computational graph and the invocations of kernels. """ for tracker in self.get_active_trackers(): - tracker.record_kernel_invocation(kernel, upstreams) + tracker.record_kernel_invocation(kernel, upstreams, label=label) def record_pod_invocation( - self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None ) -> None: """ Record the output stream of a pod invocation in the tracker. This is used to track the computational graph and the invocations of pods. """ for tracker in self.get_active_trackers(): - tracker.record_pod_invocation(pod, upstreams) + tracker.record_pod_invocation(pod, upstreams, label=label) + + @contextmanager + def no_tracking(self) -> Generator[None, Any, None]: + original_state = self._active + self.set_active(False) + try: + yield + finally: + self.set_active(original_state) class AutoRegisteringContextBasedTracker(ABC): @@ -68,12 +95,15 @@ def is_active(self) -> bool: @abstractmethod def record_kernel_invocation( - self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, ) -> None: ... @abstractmethod def record_pod_invocation( - self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None ) -> None: ... 
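# Illustrative sketch (not part of the patch) of the new no_tracking() context
# manager on BasicTrackerManager: while the block is active, get_active_trackers()
# returns an empty list, so no invocations are recorded; the previous active state
# is restored on exit. This mirrors how FunctionPod.call now wraps the user
# function (see the pods.py hunk above). `my_kernel` and `input_stream` are
# placeholders, not names from this repository.
from orcapod.data.trackers import BasicTrackerManager

manager = BasicTrackerManager()

# tracked = my_kernel(input_stream)        # recorded by any registered, active trackers

with manager.no_tracking():
    # untracked = my_kernel(input_stream)  # invisible to all trackers
    pass
# the manager's previous active state is restored here, even if an exception was raised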
def __enter__(self): @@ -84,6 +114,44 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) +class Invocation: + def __init__( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + """ + Represents an invocation of a kernel with its upstream streams. + This is used to track the computational graph and the invocations of kernels. + """ + self.kernel = kernel + self.upstreams = upstreams + self._label = label + + def parents(self) -> tuple["Invocation", ...]: + parent_invoctions = [] + for stream in self.upstreams: + if stream.source is not None: + parent_invoctions.append(Invocation(stream.source, stream.upstreams)) + return tuple(parent_invoctions) + + @property + def label(self) -> str | None: + """ + Return the label of the kernel invocation. + This is used to identify the invocation in the tracker. + """ + return self._label or self.kernel.label or self.kernel.__class__.__name__ + + def identity_structure(self) -> Any: + """ + Return a structure that represents the identity of this invocation. + This is used to uniquely identify the invocation in the tracker. + """ + return self.kernel.identity_structure(*self.upstreams) + + class GraphTracker(AutoRegisteringContextBasedTracker): """ A tracker that records the invocations of operations and generates a graph @@ -92,11 +160,20 @@ class GraphTracker(AutoRegisteringContextBasedTracker): # Thread-local storage to track active trackers - def __init__(self, tracker_manager: dp.TrackerManager | None = None) -> None: + def __init__( + self, + tracker_manager: dp.TrackerManager | None = None, + object_hasher: hp.ObjectHasher | None = None, + ) -> None: super().__init__(tracker_manager=tracker_manager) - self.kernel_to_invoked_stream_lut: dict[dp.Kernel, list[dp.Stream]] = ( - defaultdict(list) - ) + if object_hasher is None: + object_hasher = get_default_object_hasher() + self.object_hasher = object_hasher + # Dictionary to map kernels to the streams they have invoked + # This is used to track the computational graph and the invocations of kernels + self.id_to_invocation_lut: dict[str, Invocation] = {} + self.id_to_label_lut: dict[str, list[str]] = defaultdict(list) + self.id_to_pod_lut: dict[str, dp.Pod] = {} def record(self, stream: dp.Stream) -> None: assert stream.source is not None, ( @@ -106,6 +183,43 @@ def record(self, stream: dp.Stream) -> None: if stream not in stream_list: stream_list.append(stream) + def _record_kernel_and_get_id( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> str: + invocation = Invocation(kernel, upstreams, label=label) + invocation_id = self.object_hasher.hash_to_hex(invocation) + if invocation_id not in self.id_to_invocation_lut: + self.id_to_invocation_lut[invocation_id] = invocation + label = label or kernel.label or kernel.__class__.__name__ + existing_labels = self.id_to_label_lut[invocation_id] + if label not in existing_labels: + existing_labels.append(label) + return invocation_id + + def record_kernel_invocation( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + """ + Record the output stream of a kernel invocation in the tracker. + This is used to track the computational graph and the invocations of kernels. 
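# Illustrative sketch (not part of the patch) of what the new Invocation object
# captures. The stand-in classes provide only the attributes Invocation touches
# (kernel.label, kernel.identity_structure, stream.source, stream.upstreams);
# they are throwaway assumptions, not real orcapod types.
from orcapod.data.trackers import Invocation


class _StubStream:
    source = None          # no producing kernel, so no parent invocation
    upstreams: tuple = ()


class _StubKernel:
    label = None

    def identity_structure(self, *streams):
        return (self.__class__.__name__,) + streams


inv = Invocation(_StubKernel(), (_StubStream(),))
print(inv.label)                 # "_StubKernel" -- falls back to the kernel class name
print(inv.parents())             # ()            -- the upstream stream has no source
print(inv.identity_structure())  # delegates to kernel.identity_structure(*upstreams)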
+ """ + self._record_kernel_and_get_id(kernel, upstreams, label) + + def record_pod_invocation( + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None + ) -> None: + """ + Record the output stream of a pod invocation in the tracker. + """ + invocation_id = self._record_kernel_and_get_id(pod, upstreams, label) + self.id_to_pod_lut[invocation_id] = pod + def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: """ Reset the tracker and return the recorded invocations. diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 476b0a0..790b49f 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -5,10 +5,11 @@ import json from uuid import UUID from pathlib import Path -from collections.abc import Mapping, Collection +from collections.abc import Mapping, Collection, Callable import hashlib import xxhash import zlib +import inspect logger = logging.getLogger(__name__) @@ -171,7 +172,7 @@ def process_structure( # handle data types if isinstance(obj, type): logger.debug(f"Processing class/type: {obj.__name__}") - return f"type:{obj.__class__.__module__}.{obj.__class__.__name__}" + return f"type:{obj.__name__}" # For other objects, attempt to create deterministic representation only if force_hash=True class_name = obj.__class__.__name__ @@ -310,3 +311,54 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: hasher.update(data) return hasher.digest() + + +def get_function_signature( + func: Callable, + name_override: str | None = None, + include_defaults: bool = True, + include_module: bool = True, + output_names: Collection[str] | None = None, +) -> str: + """ + Get a stable string representation of a function's signature. + + Args: + func: The function to process + include_defaults: Whether to include default values + include_module: Whether to include the module name + + Returns: + A string representation of the function signature + """ + sig = inspect.signature(func) + + # Build the signature string + parts = {} + + # Add module if requested + if include_module and hasattr(func, "__module__"): + parts["module"] = func.__module__ + + # Add function name + parts["name"] = name_override or func.__name__ + + # Add parameters + param_strs = [] + for name, param in sig.parameters.items(): + param_str = str(param) + if not include_defaults and "=" in param_str: + param_str = param_str.split("=")[0].strip() + param_strs.append(param_str) + + parts["params"] = f"({', '.join(param_strs)})" + + # Add return annotation if present + if sig.return_annotation is not inspect.Signature.empty: + parts["returns"] = sig.return_annotation + + # TODO: fix return handling + fn_string = f"{parts['module'] + '.' if 'module' in parts else ''}{parts['name']}{parts['params']}" + if "returns" in parts: + fn_string = fn_string + f"-> {str(parts['returns'])}" + return fn_string diff --git a/src/orcapod/hashing/legacy_core.py b/src/orcapod/hashing/legacy_core.py index e338a89..83d172b 100644 --- a/src/orcapod/hashing/legacy_core.py +++ b/src/orcapod/hashing/legacy_core.py @@ -884,6 +884,7 @@ def get_function_signature( if sig.return_annotation is not inspect.Signature.empty: parts["returns"] = sig.return_annotation + # TODO: fix return handling fn_string = f"{parts['module'] + '.' 
if 'module' in parts else ''}{parts['name']}{parts['params']}" if "returns" in parts: fn_string = fn_string + f"-> {str(parts['returns'])}" diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 24b6861..266797b 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1,4 +1,4 @@ -from typing import Protocol +from typing import Protocol, Any, ContextManager from orcapod.types import DataValue, TypeSpec from orcapod.protocols.hashing_protocols import ContentIdentifiable from collections.abc import Iterator, Collection @@ -602,6 +602,27 @@ def validate_inputs(self, *streams: Stream) -> None: """ ... + def identity_structure(self, *streams: Stream) -> Any: + """ + Generate a unique identity structure for this kernel and/or kernel invocation. + When invoked without streams, it should return a structure + that uniquely identifies the kernel itself (e.g., class name, parameters). + When invoked with streams, it should include the identity of the streams + to distinguish different invocations of the same kernel. + + This structure is used for: + - Caching and memoization + - Debugging and error reporting + - Tracking kernel invocations in computational graphs + + Args: + *streams: Optional input streams for this invocation + + Returns: + Any: Unique identity structure (e.g., tuple of class name and stream identities) + """ + ... + class Pod(Kernel, Protocol): """ @@ -768,7 +789,7 @@ def is_active(self) -> bool: ... def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...] + self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: """ Record a kernel invocation in the computational graph. @@ -786,7 +807,9 @@ def record_kernel_invocation( """ ... - def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + def record_pod_invocation( + self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None + ) -> None: """ Record a pod invocation in the computational graph. @@ -862,7 +885,7 @@ def deregister_tracker(self, tracker: Tracker) -> None: ... def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...] + self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: """ Record a stream in all active trackers. @@ -876,7 +899,9 @@ def record_kernel_invocation( """ ... - def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + def record_pod_invocation( + self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None + ) -> None: """ Record a stream in all active trackers. @@ -888,3 +913,5 @@ def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None stream: The stream to record in all active trackers """ ... + + def no_tracking(self) -> ContextManager[None]: ... From 6e2bdd7d5b657decff8a8ab473e6fc91ed9bf96e Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 12 Jul 2025 02:18:45 +0000 Subject: [PATCH 067/224] feat: implement pure immutable datagram --- src/orcapod/data/datagrams.py | 271 ++++++++++++++++++++++++++++++++-- 1 file changed, 259 insertions(+), 12 deletions(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 717b928..139e22a 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -1,5 +1,5 @@ from orcapod.types.core import DataValue, StoreValue -from typing import TypeAlias, cast +from typing import TypeAlias, cast, Self from collections.abc import Callable, Mapping, Collection from orcapod.types import TypeSpec, default_registry from orcapod.protocols import data_protocols as dp, hashing_protocols as hp @@ -8,10 +8,45 @@ from orcapod.types import schemas from orcapod.types.typespec_utils import get_typespec_from_dict import pyarrow as pa +import logging from orcapod.hashing.defaults import get_default_arrow_hasher +# Constants used for source info keys +SOURCE_INFO_PREFIX = "_source_info_" + +# TODO: move this to a separate module +def hstack_tables(*tables: pa.Table) -> pa.Table: + if len(tables) == 0: + raise ValueError("At least one table is required for horizontal stacking.") + if len(tables) == 1: + return tables[0] + + N = len(tables[0]) + for table in tables[1:]: + if len(table) != N: + raise ValueError( + "All tables must have the same number of rows for horizontal stacking." + ) + + # create combined column names + all_column_names = [] + all_columns = [] + all_names = set() + for i, table in enumerate(tables): + if overlap := set(table.column_names).intersection(all_names): + raise ValueError( + f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" + ) + all_names.update(table.column_names) + all_column_names += table.column_names + all_columns += table.columns + + return pa.Table.from_arrays(all_columns, names=all_column_names) + + +logger = logging.getLogger(__name__) # A conveniece packet-like type that defines a value that can be # converted to a packet. 
It's broader than Packet and a simple mapping # from string keys to DataValue (e.g., int, float, str) can be regarded @@ -192,6 +227,124 @@ def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): ) +class ImmutableDict(Mapping[str, DataValue]): + def __init__(self, data: Mapping[str, DataValue]): + self._data = dict(data) + + def __getitem__(self, key: str) -> DataValue: + return self._data[key] + + def __iter__(self): + return iter(self._data) + + def __len__(self) -> int: + return len(self._data) + + def __repr__(self) -> str: + return self._data.__repr__() + + def __str__(self) -> str: + return self._data.__str__() + + +# TODO: Inherit from Mapping instead to provide immutable datagram +class DictDatagram(ImmutableDict): + def __init__( + self, + data: Mapping[str, DataValue], + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + # normalize the data content and remove any source info keys + super().__init__(data) + + # combine provided typespec info with inferred typespec from content + verified_typespec = {} + if typespec is not None: + verified_typespec = dict(typespec) + # TODO: enhance get_typespec_from_dict to also use info from supplied typespec dict + inferred_typespec = get_typespec_from_dict(self) + for key in self: + if key not in verified_typespec: + verified_typespec[key] = inferred_typespec[key] + self._python_schema = schemas.PythonSchema(verified_typespec) + + # create semantic converter + if semantic_converter is not None: + if semantic_converter.python_schema != self._python_schema: + raise ValueError( + "Incompatible Python schema between packet and semantic converter: " + + str(self._python_schema) + + " vs " + + str(semantic_converter.python_schema) + ) + else: + semantic_converter = SemanticConverter.from_typespec( + self._python_schema, + semantic_type_registry or default_registry, + ) + self.semantic_converter = semantic_converter + + if arrow_hasher is None: + arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = arrow_hasher + + self._cached_table: pa.Table | None = None + self._cached_content_hash: str | None = None + + def as_table( + self, + keep_columns: Collection[str] | None = None, + drop_columns: Collection[str] | None = None, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + if keep_columns is not None and drop_columns is not None: + logger.warning( + "It is not recommended to provide both keep_columns and drop_columns. The resulting behavior may not be as expected." 
+ ) + if self._cached_table is None: + self._cached_table = ( + self.semantic_converter.from_python_store_to_arrow_table(self.as_dict()) + ) + assert self._cached_table is not None, "Cached table should not be None" + processed_table = self._cached_table + if keep_columns is not None: + processed_table = processed_table.select(list(keep_columns)) + + if drop_columns is not None: + processed_table = processed_table.drop(list(drop_columns)) + + return processed_table + + def as_dict(self) -> dict[str, DataValue]: + return dict(self) + + def content_hash( + self, + ) -> str: + if self._cached_content_hash is None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self.as_table(), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # use keys() implementation from dict + + def types(self) -> schemas.PythonSchema: + return self._python_schema.copy() + + def copy(self) -> Self: + return self.__class__( + self, + typespec=self.types(), + semantic_converter=self.semantic_converter, + arrow_hasher=self.arrow_hasher, + ) + + class PythonDictTag(dict[str, DataValue]): def as_dict(self) -> dict[str, DataValue]: return dict(self) @@ -243,6 +396,99 @@ def __repr__(self) -> str: return f"{self.as_dict()}" +class PythonDictPacket2(DictDatagram): + def __init__( + self, + data: Mapping[str, DataValue], + source_info: Mapping[str, str | None] | None = None, + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + # normalize the data content and remove any source info keys + data_only = { + k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX) + } + contained_source_info = { + k.removeprefix(SOURCE_INFO_PREFIX): v + for k, v in data.items() + if k.startswith(SOURCE_INFO_PREFIX) + } + + super().__init__( + data_only, + typespec=typespec, + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + arrow_hasher=arrow_hasher, + ) + + self._source_info = {**contained_source_info, **(source_info or {})} + self._cached_source_info_table: pa.Table | None = None + + def as_table( + self, + keep_columns: Collection[str] | None = None, + drop_columns: Collection[str] | None = None, + include_source: bool = False, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + table = super().as_table(keep_columns=keep_columns, drop_columns=drop_columns) + if include_source: + if self._cached_source_info_table is None: + source_info_data = { + f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items() + } + source_info_schema = pa.schema( + {k: pa.large_string() for k in source_info_data} + ) + self._cached_source_info_table = pa.Table.from_pylist( + [source_info_data], schema=source_info_schema + ) + assert self._cached_source_info_table is not None, ( + "Cached source info table should not be None" + ) + # subselect the corresponding _source_info as the columns present in the data table + source_info_table = self._cached_source_info_table.select( + [f"{SOURCE_INFO_PREFIX}{k}" for k in table.column_names] + ) + table = hstack_tables(table, source_info_table) + return table + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + dict_copy = dict(self) + if include_source: + for key, value in self.source_info().items(): + dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value + return dict_copy + + def content_hash(self) -> str: + if self._cached_content_hash is 
None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self.as_table(include_source=False), prefix_hasher_id=True + ) + return self._cached_content_hash + + # use keys() implementation from dict + + def types(self) -> schemas.PythonSchema: + return self._python_schema.copy() + + def source_info(self) -> dict[str, str | None]: + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> "PythonDictPacket2": + """Return a shallow copy of the packet.""" + new_packet = PythonDictPacket2(self, self.source_info()) + new_packet._cached_table = self._cached_table + new_packet._cached_content_hash = self._cached_content_hash + new_packet._python_schema = self._python_schema.copy() + new_packet.semantic_converter = self.semantic_converter + new_packet.arrow_hasher = self.arrow_hasher + return new_packet + + class PythonDictPacket(dict[str, DataValue]): @classmethod def create_from( @@ -281,11 +527,11 @@ def __init__( post_hash_callback: Callable[[str, str], None] | None = None, ) -> None: # normalize the data content and remove any source info keys - data = {k: v for k, v in data.items() if not k.startswith("_source_info_")} + data = {k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX)} contained_source_info = { - k.removeprefix("_source_info_"): v + k.removeprefix(SOURCE_INFO_PREFIX): v for k, v in data.items() - if k.startswith("_source_info_") + if k.startswith(SOURCE_INFO_PREFIX) } super().__init__(data) @@ -345,7 +591,7 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: dict_copy = self.copy() if include_source: for key, value in self.source_info().items(): - dict_copy[f"_source_info_{key}"] = value + dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value return dict_copy def content_hash(self) -> str: @@ -401,9 +647,9 @@ def process_table_with_source_info( existing_source_info = {} for i, name in enumerate(table.column_names): - if name.startswith("_source_info_"): + if name.startswith(SOURCE_INFO_PREFIX): # Extract the base column name - base_name = name.removeprefix("_source_info_") + base_name = name.removeprefix(SOURCE_INFO_PREFIX) existing_source_info[base_name] = table.column(i) else: regular_columns.append(table.column(i)) @@ -421,7 +667,7 @@ def process_table_with_source_info( num_rows = table.num_rows for col_name in regular_names: - source_info_col_name = f"_source_info_{col_name}" + source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" # if col_name is in source_info, use that value if col_name in source_info: @@ -501,12 +747,12 @@ def __init__( ) else: self._keys: tuple[str, ...] = tuple( - [c for c in table.column_names if not c.startswith("_source_info_")] + [c for c in table.column_names if not c.startswith(SOURCE_INFO_PREFIX)] ) for k in self._keys: - if f"_source_info_{k}" not in table.column_names: + if f"{SOURCE_INFO_PREFIX}{k}" not in table.column_names: raise ValueError( - f"Source info column '_source_info_{k}' is missing in the table." + f"Source info column '{SOURCE_INFO_PREFIX}{k}' is missing in the table." ) self._arrow_table = table @@ -571,7 +817,8 @@ def keys(self) -> tuple[str, ...]: def source_info(self) -> dict[str, str | None]: if self._cached_source_info is None: self._cached_source_info = { - k: self._arrow_table[f"_source_info_{k}"][0].as_py() for k in self._keys + k: self._arrow_table[f"{SOURCE_INFO_PREFIX}{k}"][0].as_py() + for k in self._keys } return self._cached_source_info.copy() From 72937494994b495237f75468f43970d4d2a8dcc3 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 12 Jul 2025 02:18:59 +0000 Subject: [PATCH 068/224] fix: preparation of output stream in pod --- src/orcapod/data/pods.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 6b1d730..3eb1346 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -108,6 +108,13 @@ def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: return tuple(combined_streams) + def prepare_output_stream( + self, *streams: dp.Stream, label: str | None = None + ) -> dp.LiveStream: + output_stream = self.forward(*streams) + output_stream.label = label + return output_stream + def track_invocation(self, *streams: dp.Stream) -> None: if not self._skip_tracking and self._tracker_manager is not None: self._tracker_manager.record_pod_invocation(self, streams) From 7f49de0622f39bda0d325ede1784d4cbed905328 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 12 Jul 2025 02:19:50 +0000 Subject: [PATCH 069/224] feat: add feature to include content hash in arrow table --- src/orcapod/data/streams.py | 40 +++++++++++++++++++++++-- src/orcapod/protocols/data_protocols.py | 36 +++++++++++++++------- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 2454f85..223011b 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -335,11 +335,20 @@ def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: self._packet_converter.python_schema.copy(), ) - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: """ Returns the underlying table representation of the stream. This is useful for converting the stream to a table format. """ + if not include_content_hash: + return self._table + hash_column_name = ( + "_content_hash" if include_content_hash is True else include_content_hash + ) + content_hashes = [packet.content_hash() for _, packet in self.iter_packets()] + self._table = self._table.append_column( + hash_column_name, pa.array(content_hashes, type=pa.large_string()) + ) return self._table def clear_cache(self) -> None: @@ -354,6 +363,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: Iterates over the packets in the stream. Each packet is represented as a tuple of (Tag, Packet). 
""" + # TODO: make it work with table batch stream if self._cached_elements is None: self._cached_elements = [] tags = self._table.select(self._tag_columns) @@ -395,6 +405,7 @@ def __init__( self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet]] = {} self._computation_complete: bool = False self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None @property def source(self) -> dp.Pod | None: @@ -427,6 +438,7 @@ def clear_cache(self) -> None: self._cached_output_packets = {} self._computation_complete = False self._cached_output_table = None + self._cached_content_hash_column = None def refresh(self, force: bool = False) -> bool: if not self.is_current or force: @@ -442,7 +454,8 @@ def invalidate(self) -> None: self.clear_cache() self._set_modified_time(invalidate=True) - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + # TODO: note that this is likely NOT multi-thread safe self.refresh() if self._cached_output_table is None: all_tags = [] @@ -450,7 +463,8 @@ def as_table(self) -> pa.Table: for tag, packet in self.iter_packets(): # TODO: evaluate handling efficiency here all_tags.append(tag.as_dict()) - all_packets.append(packet.as_dict()) + all_packets.append(packet.as_dict(include_source=True)) + all_tags: pa.Table = pa.Table.from_pylist(all_tags) all_packets: pa.Table = pa.Table.from_pylist(all_packets) # assert that column names do not overlap @@ -466,6 +480,26 @@ def as_table(self) -> pa.Table: names=all_tags.column_names + all_packets.column_names, ) + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + return self._cached_output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) return self._cached_output_table def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 266797b..767ea0e 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -66,6 +66,21 @@ def as_dict(self) -> dict[str, DataValue]: """ ... + def content_hash(self) -> str: + """ + Return a hash of the packet content for caching/comparison. + + This hash should be deterministic and based only on the packet content, + not on source information or metadata. Used for: + - Caching computation results + - Detecting data changes + - Deduplication operations + + Returns: + str: Deterministic hash of packet content + """ + ... + class Tag(Datagram, Protocol): """ @@ -134,18 +149,16 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: """ ... - def content_hash(self) -> str: + def as_datagram(self, include_source: bool = False) -> Datagram: """ - Return a hash of the packet content for caching/comparison. + Convert the packet to a Datagram. - This hash should be deterministic and based only on the packet content, - not on source information or metadata. 
Used for: - - Caching computation results - - Detecting data changes - - Deduplication operations + Args: + include_source: If True, source information is included in the datagram + for debugging and lineage tracking Returns: - str: Deterministic hash of packet content + Datagram: Datagram representation of packet data """ ... @@ -382,7 +395,7 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: """ ... - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: """ Convert the entire stream to a PyArrow Table. @@ -390,8 +403,9 @@ def as_table(self) -> pa.Table: analysis and processing. This operation may be expensive for large streams or live streams that need computation. - Tag fields are prefixed with "_tag_" to avoid naming conflicts - with packet fields. + If include_content_hash is True, an additional column called "_content_hash" + containing the content hash of each packet is included. If include_content_hash + is a string, it is used as the name of the content hash column. Returns: pa.Table: Complete stream data as a PyArrow Table From ff9949509e3032b77cd2ced12d273a5bddd0136f Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 12 Jul 2025 02:47:43 +0000 Subject: [PATCH 070/224] doc: add comprehensive documentation to datagrams --- src/orcapod/data/datagrams.py | 360 +++++++++++++++++++++++++++++++++- 1 file changed, 359 insertions(+), 1 deletion(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 139e22a..f32ee89 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -1,3 +1,21 @@ +""" +Data structures and utilities for working with datagrams in OrcaPod. + +This module provides classes and functions for handling packet-like data structures +that can represent data in various formats (Python dicts, Arrow tables, etc.) while +maintaining type information, source metadata, and semantic type conversion capability. + +Key classes: +- SemanticConverter: Converts between different data representations. Intended for internal use. +- DictDatagram: Immutable dict-based data structure +- PythonDictPacket: Python dict-based packet with source info +- ArrowPacket: Arrow table-based packet implementation +- PythonDictTag/ArrowTag: Tag implementations for data identification + +The module also provides utilities for schema validation, table operations, +and type conversions between semantic stores, Python stores, and Arrow tables. +""" + from orcapod.types.core import DataValue, StoreValue from typing import TypeAlias, cast, Self from collections.abc import Callable, Mapping, Collection @@ -18,6 +36,21 @@ # TODO: move this to a separate module def hstack_tables(*tables: pa.Table) -> pa.Table: + """ + Horizontally stack multiple PyArrow tables by concatenating their columns. + + All input tables must have the same number of rows and unique column names. + + Args: + *tables: Variable number of PyArrow tables to stack horizontally + + Returns: + Combined PyArrow table with all columns from input tables + + Raises: + ValueError: If no tables provided, tables have different row counts, + or duplicate column names are found + """ if len(tables) == 0: raise ValueError("At least one table is required for horizontal stacking.") if len(tables) == 1: @@ -122,11 +155,29 @@ def check_arrow_schema_compatibility( class SemanticConverter: + """ + Converts data between different representations (Python, semantic stores, Arrow tables). 
+ + This class handles the conversion between Python data structures, semantic stores + (which use storage-optimized types), and Arrow tables while maintaining type + information and semantic type metadata. + """ + @staticmethod def prepare_handler( semantic_schema: schemas.SemanticSchema, semantic_type_registry: SemanticTypeRegistry, ) -> dict[str, TypeHandler]: + """ + Prepare type handlers for semantic type conversion. + + Args: + semantic_schema: Schema containing semantic type information + semantic_type_registry: Registry for looking up type handlers + + Returns: + Dictionary mapping field names to their type handlers + """ handler_lut = {} for key, (_, semantic_type) in semantic_schema.items(): if semantic_type is None: @@ -140,6 +191,16 @@ def prepare_handler( def from_typespec( cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry ) -> "SemanticConverter": + """ + Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). + + Args: + typespec: Type specification dictionary + semantic_type_registry: Registry for semantic type lookup + + Returns: + New SemanticConverter instance + """ semantic_schema = schemas.from_typespec_to_semantic_schema( typespec, semantic_type_registry ) @@ -151,6 +212,16 @@ def from_typespec( def from_arrow_schema( cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry ) -> "SemanticConverter": + """ + Create a SemanticConverter from an Arrow schema. + + Args: + arrow_schema: PyArrow schema with semantic type metadata + semantic_type_registry: Registry for semantic type lookup + + Returns: + New SemanticConverter instance + """ semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) python_schema = schemas.from_semantic_schema_to_python_schema( semantic_schema, semantic_type_registry=semantic_type_registry @@ -164,6 +235,15 @@ def __init__( semantic_schema: schemas.SemanticSchema, handler_lut: dict[str, TypeHandler] | None = None, ): + """ + Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. + Use class methods like `from_arrow_schema` or `from_typespec` instead. + + Args: + python_schema: Schema for Python data types + semantic_schema: Schema for semantic types + handler_lut: Optional dictionary of type handlers for conversion + """ self.python_schema = python_schema self.semantic_schema = semantic_schema self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( @@ -176,6 +256,15 @@ def __init__( def from_semantic_store_to_python_store( self, semantic_store: SemanticStore ) -> PythonStore: + """ + Convert a semantic store to a Python store. + + Args: + semantic_store: Store (dict) with data stored in semantic (storage-optimized) types + + Returns: + Store with Python native types + """ python_store = dict(semantic_store) for key, handler in self.handler_lut.items(): python_store[key] = handler.storage_to_python(semantic_store[key]) @@ -184,6 +273,15 @@ def from_semantic_store_to_python_store( def from_python_store_to_semantic_store( self, python_store: PythonStore ) -> SemanticStore: + """ + Convert a Python store to a semantic store. 
+ + Args: + python_store: Store with Python native types + + Returns: + Store with semantic (storage-optimized) types + """ semantic_store = dict(python_store) for key, handler in self.handler_lut.items(): semantic_store[key] = handler.python_to_storage(python_store[key]) @@ -210,13 +308,22 @@ def from_arrow_table_to_semantic_stores( def from_arrow_table_to_python_stores( self, arrow_table: pa.Table ) -> list[PythonStore]: - """Convert an Arrow table to a Python store.""" + """Convert an Arrow table to a list of Python stores.""" return [ self.from_semantic_store_to_python_store(semantic_store) for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) ] def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): + """ + Verify that an Arrow schema is compatible with the expected schema. + + Args: + arrow_schema: Schema to verify + + Raises: + ValueError: If schemas are incompatible + """ compatible, errors = check_arrow_schema_compatibility( arrow_schema, self.arrow_schema ) @@ -228,6 +335,17 @@ def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): class ImmutableDict(Mapping[str, DataValue]): + """ + An immutable dictionary-like container for DataValues. + + Provides a read-only view of a dictionary mapping strings to DataValues, + implementing the Mapping protocol for compatibility with dict-like operations. + + Initialize with data from a mapping. + Args: + data: Source mapping to copy data from + """ + def __init__(self, data: Mapping[str, DataValue]): self._data = dict(data) @@ -249,6 +367,24 @@ def __str__(self) -> str: # TODO: Inherit from Mapping instead to provide immutable datagram class DictDatagram(ImmutableDict): + """ + An immutable datagram implementation using a dictionary backend. + + Extends ImmutableDict to provide additional functionality for type handling, + semantic conversion, and Arrow table representation while maintaining + immutability of the underlying data. + + + Initialize DictDatagram with data and optional type information. + + Args: + data: Source data mapping + typespec: Optional type specification for fields + semantic_converter: Optional converter for semantic types + semantic_type_registry: Registry for semantic type lookup + arrow_hasher: Optional hasher for Arrow table content + """ + def __init__( self, data: Mapping[str, DataValue], @@ -319,11 +455,18 @@ def as_table( return processed_table def as_dict(self) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" return dict(self) def content_hash( self, ) -> str: + """ + Calculate and return content hash of the datagram. + + Returns: + Hash string of the datagram content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self.as_table(), @@ -334,9 +477,11 @@ def content_hash( # use keys() implementation from dict def types(self) -> schemas.PythonSchema: + """Return copy of the Python schema.""" return self._python_schema.copy() def copy(self) -> Self: + """Return a copy of the datagram.""" return self.__class__( self, typespec=self.types(), @@ -346,18 +491,47 @@ def copy(self) -> Self: class PythonDictTag(dict[str, DataValue]): + """ + A simple tag implementation using Python dictionary. + + Represents a tag (metadata) as a dictionary that can be converted + to different representations like Arrow tables. 
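# Illustrative sketch (not part of the patch) tying together the SemanticConverter
# methods documented above. Treating pathlib.Path as a semantic type whose storage
# form is a string is an assumption for illustration; what actually converts depends
# on the handlers registered in the semantic type registry (default_registry here).
from pathlib import Path

from orcapod.data.datagrams import SemanticConverter
from orcapod.types import default_registry

converter = SemanticConverter.from_typespec({"path": Path}, default_registry)

python_store = {"path": Path("/data/sample.txt")}
semantic_store = converter.from_python_store_to_semantic_store(python_store)   # storage types
restored = converter.from_semantic_store_to_python_store(semantic_store)       # back to Path
arrow_table = converter.from_python_store_to_arrow_table(python_store)         # pa.Table row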
+ """ + def as_dict(self) -> dict[str, DataValue]: + """Return dictionary representation.""" return dict(self) def as_table(self) -> pa.Table: + """Convert to Arrow table representation.""" return pa.Table.from_pylist([self]) def types(self) -> schemas.PythonSchema: + """ + Return Python schema (basic implementation). + + Note: This is a simplified implementation that assumes all values are strings. + """ # TODO: provide correct implementation return schemas.PythonSchema({k: str for k in self.keys()}) class ArrowTag: + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. + + Args: + table: Single-row Arrow table representing the tag + + Raises: + ValueError: If table doesn't contain exactly one row + """ + def __init__(self, table: pa.Table) -> None: self.table = table if len(table) != 1: @@ -369,9 +543,16 @@ def __init__(self, table: pa.Table) -> None: self._cached_python_dict: dict[str, DataValue] | None = None def keys(self) -> tuple[str, ...]: + """Return column names as a tuple.""" return tuple(self.table.column_names) def types(self) -> schemas.PythonSchema: + """ + Return Python schema derived from Arrow schema. + + Returns: + TypeSpec information returned as PythonSchema. + """ if self._cached_python_schema is None: self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema( self.table.schema @@ -379,6 +560,12 @@ def types(self) -> schemas.PythonSchema: return self._cached_python_schema.copy() def as_dict(self) -> dict[str, DataValue]: + """ + Convert to Python dictionary representation. + + Returns: + Dictionary with tag data + """ if self._cached_python_dict is None: self._cached_python_dict = cast( dict[str, DataValue], self.table.to_pylist()[0] @@ -386,17 +573,38 @@ def as_dict(self) -> dict[str, DataValue]: return self._cached_python_dict def as_table(self) -> pa.Table: + """Return the underlying Arrow table.""" return self.table def clear_cache(self) -> None: + """Clear cached Python representations.""" self._cached_python_schema = None self._cached_python_dict = None def __repr__(self) -> str: + """Return string representation.""" return f"{self.as_dict()}" class PythonDictPacket2(DictDatagram): + """ + Enhanced packet implementation with source information support. + + Extends DictDatagram to include source information tracking and + enhanced table conversion capabilities that can include or exclude + source metadata. + + Initialize packet with data and optional source information. + + Args: + data: Primary data content + source_info: Optional mapping of field names to source information + typespec: Optional type specification + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types. Defaults to system default registry. + arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. + """ + def __init__( self, data: Mapping[str, DataValue], @@ -457,6 +665,15 @@ def as_table( return table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Return dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ dict_copy = dict(self) if include_source: for key, value in self.source_info().items(): @@ -464,6 +681,12 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: return dict_copy def content_hash(self) -> str: + """ + Calculate content hash excluding source information. + + Returns: + Hash string of the packet content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self.as_table(include_source=False), prefix_hasher_id=True @@ -473,9 +696,19 @@ def content_hash(self) -> str: # use keys() implementation from dict def types(self) -> schemas.PythonSchema: + """ + Returns: + Packet type information as PythonSchema (dict mapping field names to types). + """ return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Dictionary mapping field names to their source info + """ return {key: self._source_info.get(key, None) for key in self.keys()} def copy(self) -> "PythonDictPacket2": @@ -490,6 +723,27 @@ def copy(self) -> "PythonDictPacket2": class PythonDictPacket(dict[str, DataValue]): + """ + Dictionary-based Packet with source tracking and hashing. + + A dictionary-based packet that maintains source information, supports + type specifications, and provides content hashing with optional callbacks. + Includes comprehensive conversion capabilities to Arrow tables. + + Initialize packet with comprehensive configuration options. + + Args: + data: Primary packet data + source_info: Optional source information mapping + typespec: Optional type specification + finger_print: Optional fingerprint for tracking + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + + """ + @classmethod def create_from( cls, @@ -500,6 +754,20 @@ def create_from( arrow_hasher: hp.ArrowHasher | None = None, post_hash_callback: Callable[[str, str], None] | None = None, ) -> "PythonDictPacket": + """ + Create a PythonDictPacket from another packet object. + + Args: + object: Source packet to copy from + finger_print: Optional fingerprint identifier + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + + Returns: + New PythonDictPacket instance + """ if isinstance(object, PythonDictPacket): return object.copy() @@ -588,6 +856,15 @@ def as_table(self, include_source: bool = False) -> pa.Table: return self._cached_table.select(list(self.keys())) def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Return dictionary representation. + + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ dict_copy = self.copy() if include_source: for key, value in self.source_info().items(): @@ -595,6 +872,15 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: return dict_copy def content_hash(self) -> str: + """ + Calculate and return content hash. + + Computes hash of packet data content (thus excluding source info) and + optionally triggers post-hash callback if configured. 
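# Illustrative sketch (not part of the patch): because content_hash() is computed
# from as_table(include_source=False), two packets with identical data but different
# provenance are expected to hash identically. Field values are placeholders.
from orcapod.data.datagrams import PythonDictPacket

p1 = PythonDictPacket({"x": 1}, source_info={"x": "run_a/input.csv"})
p2 = PythonDictPacket({"x": 1}, source_info={"x": "run_b/input.csv"})
# p1.content_hash() == p2.content_hash()  -> expected True: source info is excluded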
+ + Returns: + Hash string of the packet content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self.as_table(include_source=False), prefix_hasher_id=True @@ -606,9 +892,16 @@ def content_hash(self) -> str: # use keys() implementation from dict def types(self) -> schemas.PythonSchema: + """Return packet data type information as PythonSchema (dict mapping field names to types).""" return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Dictionary mapping field names to their source info + """ return {key: self._source_info.get(key, None) for key in self.keys()} def copy(self) -> "PythonDictPacket": @@ -697,6 +990,30 @@ def process_table_with_source_info( class ArrowPacket: + """ + Arrow table-based packet implementation with comprehensive features. + + A packet implementation that uses Arrow tables as the primary storage format, + providing efficient memory usage and columnar data operations while supporting + source information tracking and content hashing. + + + Initialize ArrowPacket with Arrow table and configuration. + + Args: + table: Single-row Arrow table representing the packet + source_info: Optional source information mapping + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + finger_print: Optional fingerprint for tracking + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + skip_source_info_extraction: Whether to skip source info processing + + Raises: + ValueError: If table doesn't contain exactly one row + """ + @classmethod def create_from( cls, @@ -707,6 +1024,20 @@ def create_from( arrow_hasher: hp.ArrowHasher | None = None, post_hash_callback: Callable[[str, str], None] | None = None, ) -> "ArrowPacket": + """ + Create an ArrowPacket from another packet object. + + Args: + object: Source packet to copy from + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + finger_print: Optional fingerprint identifier + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + + Returns: + New ArrowPacket instance + """ if isinstance(object, ArrowPacket): return object.copy() @@ -787,6 +1118,15 @@ def as_table(self, include_source: bool = False) -> pa.Table: return base_table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Convert to dictionary representation. + + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ if self._cached_python_packet is None: self._cached_python_packet = ( self.semantic_converter.from_arrow_table_to_python_stores( @@ -799,6 +1139,15 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: return {k: self._cached_python_packet[k] for k in self._keys} def content_hash(self) -> str: + """ + Calculate and return content hash. + + Computes hash of the Arrow table content and optionally + triggers post-hash callback if configured. 
+ + Returns: + Hash string of the packet content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self._arrow_table, prefix_hasher_id=True @@ -808,6 +1157,7 @@ def content_hash(self) -> str: return self._cached_content_hash def types(self) -> schemas.PythonSchema: + """Return packet data type information as PythonSchema (dict mapping field names to types).""" return self.semantic_converter.python_schema.copy() def keys(self) -> tuple[str, ...]: @@ -815,6 +1165,12 @@ def keys(self) -> tuple[str, ...]: return tuple(self._keys) def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Copy of the dictionary mapping field names to their source info + """ if self._cached_source_info is None: self._cached_source_info = { k: self._arrow_table[f"{SOURCE_INFO_PREFIX}{k}"][0].as_py() @@ -846,8 +1202,10 @@ def copy(self) -> "ArrowPacket": return new_packet def __repr__(self) -> str: + """Return string representation.""" return f"{self.as_dict(include_source=False)}" # a batch is a tuple of a tag and a list of packets Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] +"""Type alias for a batch: a tuple containing a tag and collection of packets.""" From 5c8f85d2ed7c78f3abee9e941d4e3449daa63666 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:29:13 +0000 Subject: [PATCH 071/224] refactor: remove unused datagram base --- src/orcapod/data/base.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index 5082c9b..f8788e1 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -10,35 +10,6 @@ logger = logging.getLogger(__name__) -class DatagramBase(ABC): - """ - Base class for data packets that can be processed in a pipeline. - This class provides a common interface for data packets, allowing them to be processed - and transformed in a consistent manner. - """ - - @property - @abstractmethod - def typespec(self) -> TypeSpec: - """Return the type specification of the data packet.""" - pass - - @abstractmethod - def keys(self) -> tuple[str, ...]: - """Return the keys of the data packet.""" - pass - - @abstractmethod - def as_table(self) -> pa.Table: - """Convert the data packet to a PyArrow Table.""" - pass - - @abstractmethod - def as_dict(self) -> dict[str, Any]: - """Convert the data packet to a dictionary.""" - pass - - class LabeledContentIdentifiableBase: """ Base class for content-identifiable objects. From 3d3e946fa65a5c0a4bd3af1e6e33a15784c204bf Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:29:36 +0000 Subject: [PATCH 072/224] refactor: combine pre-foward step into one for simplicity --- src/orcapod/data/kernels.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 538cf11..e876916 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -39,10 +39,13 @@ def __init__( self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self.fixed_input_streams = fixed_input_streams - def resolve_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Resolve the input streams for the kernel. If the kernel has fixed input streams, - it returns those. Otherwise, it returns the provided streams. 
+ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing + on the input streams before the main computation. This is useful if you need to modify the input streams + or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the + pre-processing step will be tracked separately from the main computation in forward. + By default, it returns the input streams unchanged. """ if self.fixed_input_streams is not None: if len(streams) != 0: @@ -52,16 +55,6 @@ def resolve_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: return self.fixed_input_streams return streams - def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: - """ - Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing - on the input streams before the main computation. This is useful if you need to modify the input streams - or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the - pre-processing step will be tracked separately from the main computation in forward. - By default, it returns the input streams unchanged. - """ - return streams - @abstractmethod def validate_inputs(self, *streams: dp.Stream) -> None: ... @@ -86,8 +79,7 @@ def track_invocation(self, *streams: dp.Stream) -> None: def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - streams = self.resolve_input_streams(*streams) - processed_streams = self.pre_processing_step(*streams) + processed_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*processed_streams) output_stream = self.prepare_output_stream(*processed_streams, label=label) self.track_invocation(*processed_streams) From 0d8f7cb945e0c71dddf7ec4f7443ebce50d02943 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:30:04 +0000 Subject: [PATCH 073/224] refactor: adopt the new method signature for pre-forward step --- src/orcapod/data/pods.py | 43 +++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 3eb1346..a6b0d0f 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -60,7 +60,7 @@ def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: Return the input and output typespecs for the pod. This is used to validate the input and output streams. """ - input_streams = self.pre_processing_step(*streams) + input_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*input_streams) tag_typespec, _ = input_streams[0].types() return tag_typespec, self.output_packet_types() @@ -92,21 +92,40 @@ def validate_inputs(self, *streams: dp.Stream) -> None: f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" ) - def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + @staticmethod + def _join_streams(*streams: dp.Stream) -> dp.Stream: + if not streams: + raise ValueError("No streams provided for joining") + # Join the streams using a suitable join strategy + if len(streams) == 1: + return streams[0] + + joined_stream = streams[0] + for next_stream in streams[1:]: + joined_stream = Join()(joined_stream, next_stream) + return joined_stream + + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Prepare the incoming streams for execution in the pod. 
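Editorial aside: with the pre-forward work consolidated into a single hook, a subclass only needs to override one method. The sketch below is hypothetical (the mixin name and logging behavior are not part of the patch) and assumes it is mixed into a `TrackedKernelBase` subclass so that `super().pre_process_input_streams` resolves any fixed input streams first.

import logging

from orcapod.protocols import data_protocols as dp

logger = logging.getLogger(__name__)


class LoggingKernelMixin:
    """Hypothetical mixin illustrating the single consolidated pre-forward hook."""

    def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]:
        # Let the base class resolve fixed input streams first, then add custom logic.
        resolved = super().pre_process_input_streams(*streams)  # type: ignore[misc]
        logger.debug("%s received %d input stream(s)", type(self).__name__, len(resolved))
        return resolved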
This default implementation - joins all the input streams together. + Prepare the incoming streams for execution in the pod. If fixed_input_streams are present, + they will be used as the input streams and the newly provided streams would be used to + restrict (semijoin) the fixed streams. + Otherwise, the join of the provided streams will be returned. """ # if multiple streams are provided, join them # otherwise, return as is - combined_streams = list(streams) - if len(streams) > 1: - stream = streams[0] - for next_stream in streams[1:]: - stream = Join()(stream, next_stream) - combined_streams = [stream] - - return tuple(combined_streams) + if self.fixed_input_streams is not None and len(streams) > 0: + output_stream = self._join_streams(*self.fixed_input_streams) + if len(streams) > 0: + restrict_stream = self._join_streams(*streams) + # output_stream = SemiJoin()(output_stream, restrict_stream) + else: + if len(streams) == 0: + raise ValueError( + f"{self.__class__.__name__} expects at least one input stream" + ) + output_stream = self._join_streams(*streams) + return (output_stream,) def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None From a7531bf6015e5bb653bfa527a00796e6e4c1026c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:30:28 +0000 Subject: [PATCH 074/224] feat: add non-zero input operator --- src/orcapod/data/operators.py | 155 +++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 2 deletions(-) diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 3db4949..95667d2 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -14,7 +14,95 @@ class InputValidationError(Exception): """ -class BinaryOperator(TrackedKernelBase): +class Operator(TrackedKernelBase): + """ + Base class for all operators. + Operators are a special type of kernel that can be used to perform operations on streams. + They are defined as a callable that takes a (possibly empty) collection of streams as the input + and returns a new stream as output (note that output stream is always singular). + """ + + +class NonZeroInputOperator(Operator): + """ + Operators that work with at least one input stream. + This is useful for operators that can take a variable number of (but at least one ) input streams, + such as joins, unions, etc. + """ + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.verify_non_zero_input(*streams) + return self.op_validate_inputs(*streams) + + @abstractmethod + def op_validate_inputs(self, *streams: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... + + def verify_non_zero_input( + self, + *streams: dp.Stream, + ) -> None: + """ + Check that the inputs to the variable inputs operator are valid. + This method is called before the forward method to ensure that the inputs are valid. + """ + if len(streams) == 0: + raise ValueError( + f"Operator {self.__class__.__name__} requires at least one input stream." + ) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for variable inputs operators. + It expects at least one stream as input. 
+ """ + return self.op_forward(*streams) + + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + self.validate_inputs(*streams) + return self.op_output_types(*streams) + + def identity_structure(self, *streams: dp.Stream) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. + """ + if len(streams) > 0: + self.verify_non_zero_input(*streams) + return self.op_identity_structure(*streams) + + @abstractmethod + def op_forward(self, *streams: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the non-zero input operator. + It takes variable number of streams as input and returns a new stream as output. + """ + ... + + @abstractmethod + def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes at least one stream as input and returns a tuple of typespecs. + """ + ... + + @abstractmethod + def op_identity_structure(self, *streams: dp.Stream) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes zero or more streams as input and returns a tuple containing the operator name and a set of streams. + If zero, it should return identity of the operator itself. + If one or more, it should return a identity structure approrpiate for the operator invoked on the given streams. + """ + ... + + +class BinaryOperator(Operator): """ Base class for all operators. """ @@ -93,7 +181,7 @@ def op_identity_structure(self, *streams: dp.Stream) -> Any: ... -class Join(BinaryOperator): +class BinaryJoin(BinaryOperator): def op_identity_structure(self, *streams: dp.Stream) -> Any: # Join does not depend on the order of the streams -- convert it onto a set id_struct = (self.__class__.__name__,) @@ -154,3 +242,66 @@ def op_validate_inputs( def __repr__(self) -> str: return "Join()" + + +class Join(NonZeroInputOperator): + def op_identity_structure(self, *streams: dp.Stream) -> Any: + # Join does not depend on the order of the streams -- convert it onto a set + id_struct = (self.__class__.__name__,) + if len(streams) > 0: + id_struct += (set(streams),) + return id_struct + + def op_forward(self, *streams: dp.Stream) -> ImmutableTableStream: + """ + Joins two streams together based on their tags. + The resulting stream will contain all the tags from both streams. 
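Editorial aside: a concrete `NonZeroInputOperator` only needs to fill in the `op_*` hooks introduced above. The `PassThrough` operator below is a hypothetical sketch (not part of the patch) showing the hook contract; `InputValidationError` and `NonZeroInputOperator` are the names defined in this module.

from typing import Any

from orcapod.protocols import data_protocols as dp
from orcapod.types import TypeSpec


class PassThrough(NonZeroInputOperator):
    """Hypothetical operator: forwards its single input stream unchanged."""

    def op_validate_inputs(self, *streams: dp.Stream) -> None:
        if len(streams) != 1:
            raise InputValidationError("PassThrough expects exactly one input stream")

    def op_forward(self, *streams: dp.Stream) -> dp.Stream:
        return streams[0]

    def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]:
        return streams[0].types()

    def op_identity_structure(self, *streams: dp.Stream) -> Any:
        # Mirror the convention used above: class name, plus the streams when invoked.
        return (self.__class__.__name__,) + ((set(streams),) if streams else ())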
+ """ + + all_tag_typespecs = [] + all_packet_typespecs = [] + + for stream in streams: + tag_typespec, packet_typespec = stream.types() + all_tag_typespecs.append(tag_typespec) + all_packet_typespecs.append(packet_typespec) + + common_tag_keys = tuple(intersection_typespecs(*all_tag_typespecs).keys()) + joined_tag_keys = tuple(union_typespecs(*all_tag_typespecs).keys()) + + # performing a check to ensure that packets are compatible + union_typespecs(*all_packet_typespecs) + + joined_table = left_stream.as_table().join( + right_stream.as_table(), + keys=common_tag_keys, + join_type="inner", + ) + + return ImmutableTableStream( + joined_table, + tag_columns=tuple(joined_tag_keys), + source=self, + upstreams=streams, + ) + + def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + left_stream, right_stream = streams + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) + joined_packet_typespec = union_typespecs( + left_packet_typespec, right_packet_typespec + ) + return joined_tag_typespec, joined_packet_typespec + + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + try: + self.op_output_types(left_stream, right_stream) + except Exception as e: + raise InputValidationError(f"Input streams are not compatible: {e}") + + def __repr__(self) -> str: + return "Join()" From 730f72bef7f4f150a104e0a8827d66a002646a49 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 18 Jul 2025 07:26:07 +0000 Subject: [PATCH 075/224] wip: major refactoring of package structure --- src/orcapod/core/pod.py | 2 +- src/orcapod/data/datagram_store.py | 890 +++++++++++ src/orcapod/data/datagrams.py | 1314 +++++++---------- src/orcapod/data/operators.py | 10 + src/orcapod/data/pods.py | 80 +- src/orcapod/data/streams.py | 77 +- src/orcapod/hashing/arrow_hashers.py | 13 +- src/orcapod/hashing/semantic_type_hashers.py | 22 +- src/orcapod/hashing/versioned_hashers.py | 31 +- src/orcapod/pipeline/nodes.py | 5 +- src/orcapod/protocols/hashing_protocols.py | 5 + src/orcapod/protocols/store_protocols.py | 34 + src/orcapod/stores/__init__.py | 8 +- src/orcapod/stores/delta_lake_stores.py | 861 +++++++++++ .../delta_table_arrow_data_store.py | 0 .../stores/{ => legacy}/dict_data_stores.py | 2 +- .../{ => legacy}/dict_transfer_data_store.py | 2 +- .../legacy_arrow_data_stores.py} | 0 .../{ => legacy}/safe_dir_data_store.py | 2 +- src/orcapod/stores/{ => legacy}/types.py | 0 src/orcapod/types/__init__.py | 20 +- src/orcapod/types/arrow_utils.py | 10 + src/orcapod/types/core.py | 68 +- src/orcapod/types/defaults.py | 51 + src/orcapod/types/{ => legacy}/packets.py | 2 +- .../{ => legacy}/semantic_type_handlers.py | 0 .../{ => legacy}/semantic_type_registry.py | 2 +- src/orcapod/types/schemas.py | 493 +++---- src/orcapod/types/semantic_converter.py | 86 ++ src/orcapod/types/semantic_types.py | 569 +++++++ src/orcapod/types/typespec_utils.py | 68 +- src/orcapod/utils/arrow_utils.py | 126 ++ src/orcapod/utils/object_spec.py | 41 +- tests/test_store/test_dir_data_store.py | 2 +- tests/test_store/test_integration.py | 2 +- tests/test_store/test_noop_data_store.py | 2 +- tests/test_store/test_transfer_data_store.py | 4 +- 37 files changed, 3647 insertions(+), 1257 deletions(-) create mode 100644 src/orcapod/data/datagram_store.py create mode 100644 src/orcapod/stores/delta_lake_stores.py rename src/orcapod/stores/{ 
=> legacy}/delta_table_arrow_data_store.py (100%) rename src/orcapod/stores/{ => legacy}/dict_data_stores.py (99%) rename src/orcapod/stores/{ => legacy}/dict_transfer_data_store.py (97%) rename src/orcapod/stores/{arrow_data_stores.py => legacy/legacy_arrow_data_stores.py} (100%) rename src/orcapod/stores/{ => legacy}/safe_dir_data_store.py (99%) rename src/orcapod/stores/{ => legacy}/types.py (100%) create mode 100644 src/orcapod/types/arrow_utils.py create mode 100644 src/orcapod/types/defaults.py rename src/orcapod/types/{ => legacy}/packets.py (99%) rename src/orcapod/types/{ => legacy}/semantic_type_handlers.py (100%) rename src/orcapod/types/{ => legacy}/semantic_type_registry.py (99%) create mode 100644 src/orcapod/types/semantic_converter.py create mode 100644 src/orcapod/types/semantic_types.py create mode 100644 src/orcapod/utils/arrow_utils.py diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index 92d8568..3ca7d6b 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -12,7 +12,7 @@ extract_function_typespecs, check_typespec_compatibility, ) -from orcapod.types.packets import PacketConverter +from orcapod.types.legacy.packets import PacketConverter from orcapod.hashing import ( FunctionInfoExtractor, diff --git a/src/orcapod/data/datagram_store.py b/src/orcapod/data/datagram_store.py new file mode 100644 index 0000000..72d082c --- /dev/null +++ b/src/orcapod/data/datagram_store.py @@ -0,0 +1,890 @@ +# class DatagramStore(Protocol): +# def record_datagram( +# self, +# record_path: tuple[str, ...], +# datagram: dp.Datagram, +# ignore_duplicates: bool = False, +# ) -> str | None: ... + +# def record_stream( +# self, +# record_path: tuple[str, ...], +# stream: dp.Stream, +# ignore_duplicates: bool = False, +# ) -> None: ... + +# def get_recorded_datagram( +# self, +# record_path: tuple[str, ...], +# record_id: str, +# ) -> dp.Datagram | None: ... + +# def get_all_records(self, record_path: tuple[str, ...]) -> dp.Stream | None: +# """Retrieve all records for a given path as a stream.""" +# ... + +# def get_all_records_as_polars( +# self, record_path: tuple[str, ...] +# ) -> pl.DataFrame | None: +# """Retrieve all records for a given path as a Polars stream.""" +# ... + +# def get_records_by_ids( +# self, +# record_path: tuple[str, ...], +# entry_ids: Collection[str], +# add_entry_id_column: bool | str = False, +# preseve_input_order: bool = False, +# ) -> dp.Stream: ... + + +import pyarrow as pa +import pyarrow.compute as pc +import pyarrow.dataset as ds +import polars as pl +from pathlib import Path +from typing import Any +import logging +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError +from collections import defaultdict +from orcapod.data.datagrams import ArrowDatagram, SemanticTypeRegistry +from orcapod.data.streams import ImmutableTableStream +from orcapod.hashing import get_default_arrow_hasher +from orcapod.hashing.types import ArrowHasher +from orcapod.protocols import data_protocols as dp +from orcapod.types import default_registry + + +# Module-level logger +logger = logging.getLogger(__name__) + + +class DeltaTableArrowStore: + """ + Delta Table-based Arrow data store with flexible hierarchical path support and schema preservation. 
+
+    Uses tuple-based source paths for robust parameter handling:
+    - ("source_name", "source_id") -> source_name/source_id/
+    - ("org", "project", "dataset") -> org/project/dataset/
+    - ("year", "month", "day", "experiment") -> year/month/day/experiment/
+    """
+
+    def __init__(
+        self,
+        base_path: str | Path,
+        duplicate_entry_behavior: str = "error",
+        create_base_path: bool = True,
+        max_hierarchy_depth: int = 10,
+        batch_size: int = 100,
+    ):
+        """
+        Initialize the DeltaTableArrowStore.
+
+        Args:
+            base_path: Base directory path where Delta tables will be stored
+            duplicate_entry_behavior: How to handle duplicate entry_ids:
+                - 'error': Raise ValueError when entry_id already exists
+                - 'overwrite': Replace existing entry with new data
+            create_base_path: Whether to create the base path if it doesn't exist
+            max_hierarchy_depth: Maximum allowed depth for source paths (safety limit)
+            batch_size: Number of records to batch before writing to Delta table
+        """
+        # Validate duplicate behavior
+        if duplicate_entry_behavior not in ["error", "overwrite"]:
+            raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'")
+
+        self.duplicate_entry_behavior = duplicate_entry_behavior
+        self.base_path = Path(base_path)
+        self.max_hierarchy_depth = max_hierarchy_depth
+        self.batch_size = batch_size
+
+        if create_base_path:
+            self.base_path.mkdir(parents=True, exist_ok=True)
+        elif not self.base_path.exists():
+            raise ValueError(
+                f"Base path {self.base_path} does not exist and create_base_path=False"
+            )
+
+        # Cache for Delta tables to avoid repeated initialization
+        self._delta_table_cache: dict[str, DeltaTable] = {}
+
+        # Batch management
+        self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict)
+
+        logger.info(
+            f"Initialized DeltaTableArrowStore at {self.base_path} "
+            f"with duplicate_entry_behavior='{duplicate_entry_behavior}', "
+            f"batch_size={batch_size}"
+        )
+
+    def flush(self) -> None:
+        """
+        Flush all pending batches immediately.
+
+        This method is called to ensure all pending data is written to the Delta tables.
+        """
+        try:
+            self.flush_all_batches()
+        except Exception as e:
+            logger.error(f"Error during flush: {e}")
+
+    def flush_batch(self, source_path: tuple[str, ...]) -> None:
+        """
+        Flush pending batch for a specific source path.
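Editorial aside: a hypothetical usage sketch of the store defined in this file (the base path, source path tuple, and record values are illustrative only). Records are buffered per source path until the batch size is reached or a flush is requested.

import pyarrow as pa

from orcapod.data.datagram_store import DeltaTableArrowStore

store = DeltaTableArrowStore("/tmp/orcapod_store", batch_size=10)

record = pa.table({"path": ["/data/run1.bin"], "score": [0.93]})
store.record_data(("my_org", "project_x", "results"), "run1", record)

# Records sit in the in-memory batch until flushed (or until batch_size is hit).
store.flush_batch(("my_org", "project_x", "results"))
all_results = store.get_all_records(("my_org", "project_x", "results"))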
+ + Args: + source_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + source_key = self._get_source_key(source_path) + + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = {} + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() + + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_existing_delta_table(source_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + entry_ids = combined_table.column("__entry_id").to_pylist() + unique_entry_ids = list(set(entry_ids)) + + # Delete existing records with these IDs + if unique_entry_ids: + entry_ids_str = "', '".join(unique_entry_ids) + delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # otherwise, only insert if same entry_id does not exist yet + delta_table.merge( + source=combined_table, + predicate="target.__entry_id = source.__entry_id", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[source_key] = pending_tables + raise + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + source_keys = list(self._pending_batches.keys()) + + # TODO: capture and re-raise exceptions at the end + for source_key in source_keys: + source_path = tuple(source_key.split("/")) + try: + self.flush_batch(source_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + + def __del__(self): + """Cleanup when object is destroyed.""" + self.flush() + + def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly + """ + Validate source path components. 
+ + Args: + source_path: Tuple of path components + + Raises: + ValueError: If path is invalid + """ + if not source_path: + raise ValueError("Source path cannot be empty") + + if len(source_path) > self.max_hierarchy_depth: + raise ValueError( + f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" + ) + + # Validate path components + for i, component in enumerate(source_path): + if not component or not isinstance(component, str): + raise ValueError( + f"Source path component {i} is invalid: {repr(component)}" + ) + + # Check for filesystem-unsafe characters + unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] + if any(char in component for char in unsafe_chars): + raise ValueError( + f"Source path component contains invalid characters: {repr(component)}" + ) + + def _get_source_key(self, source_path: tuple[str, ...]) -> str: + """Generate cache key for source storage.""" + return "/".join(source_path) + + def _get_table_path(self, source_path: tuple[str, ...]) -> Path: + """Get the filesystem path for a given source path.""" + path = self.base_path + for subpath in source_path: + path = path / subpath + return path + + def _get_existing_delta_table( + self, source_path: tuple[str, ...] + ) -> DeltaTable | None: + """ + Get or create a Delta table, handling schema initialization properly. + + Args: + source_path: Tuple of path components + + Returns: + DeltaTable instance or None if table doesn't exist + """ + source_key = self._get_source_key(source_path) + table_path = self._get_table_path(source_path) + + # Check cache first + if dt := self._delta_table_cache.get(source_key): + return dt + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + self._delta_table_cache[source_key] = delta_table + logger.debug(f"Loaded existing Delta table for {source_key}") + return delta_table + except TableNotFoundError: + # Table doesn't exist + return None + except Exception as e: + logger.error(f"Error loading Delta table for {source_key}: {e}") + # Try to clear any corrupted cache and retry once + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + return None + + def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: + """Ensure the table has an __entry_id column.""" + if "__entry_id" not in arrow_data.column_names: + # Add entry_id column at the beginning + key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, "__entry_id", key_array) + return arrow_data + + def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: + """Remove the __entry_id column if it exists.""" + if "__entry_id" in arrow_data.column_names: + column_names = arrow_data.column_names + indices_to_keep = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] + arrow_data = arrow_data.select(indices_to_keep) + return arrow_data + + def _handle_entry_id_column( + self, arrow_data: pa.Table, add_entry_id_column: bool | str = False + ) -> pa.Table: + """ + Handle entry_id column based on add_entry_id_column parameter. 
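Editorial aside: the three `add_entry_id_column` modes correspond to plain Arrow column operations. The standalone sketch below (independent of the store class, with made-up values) shows the equivalent transformations.

import pyarrow as pa

table = pa.table({"__entry_id": ["run1"], "score": [0.93]})

dropped = table.drop_columns(["__entry_id"])   # add_entry_id_column=False
kept = table                                   # add_entry_id_column=True
renamed = table.rename_columns(                # add_entry_id_column="record_id"
    ["record_id" if name == "__entry_id" else name for name in table.column_names]
)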
+ + Args: + arrow_data: Arrow table with __entry_id column + add_entry_id_column: Control entry ID column inclusion: + - False: Remove __entry_id column + - True: Keep __entry_id column as is + - str: Rename __entry_id column to custom name + """ + if add_entry_id_column is False: + # Remove the __entry_id column + return self._remove_entry_id_column(arrow_data) + elif isinstance(add_entry_id_column, str): + # Rename __entry_id to custom name + if "__entry_id" in arrow_data.column_names: + schema = arrow_data.schema + new_names = [ + add_entry_id_column if name == "__entry_id" else name + for name in schema.names + ] + return arrow_data.rename_columns(new_names) + # If add_entry_id_column is True, keep __entry_id as is + return arrow_data + + def _create_entry_id_filter(self, entry_id: str) -> list: + """ + Create a proper filter expression for Delta Lake. + + Args: + entry_id: The entry ID to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "=", entry_id)] + + def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: + """ + Create a proper filter expression for multiple entry IDs. + + Args: + entry_ids: List of entry IDs to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "in", entry_ids)] + + def _read_table_with_filter( + self, + delta_table: DeltaTable, + filters: list | None = None, + ) -> pa.Table: + """ + Read table using to_pyarrow_dataset with original schema preservation. + + Args: + delta_table: The Delta table to read from + filters: Optional filters to apply + + Returns: + Arrow table with preserved schema + """ + # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading + dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + if filters: + # Apply filters at dataset level for better performance + import pyarrow.compute as pc + + filter_expr = None + for filt in filters: + if len(filt) == 3: + col, op, val = filt + if op == "=": + expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore + elif op == "in": + expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore + else: + logger.warning( + f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." + ) + # Fallback to table-level filtering + return dataset.to_table()(filters=filters) + + if filter_expr is None: + filter_expr = expr + else: + filter_expr = pc.and_(filter_expr, expr) # type: ignore + + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) + + return dataset.to_table() + + def record_data( + self, + record_path: tuple[str, ...], + entry_id: str, + data: pa.Table, + force_flush: bool = False, + error_on_duplicate: bool | None = None, + ) -> pa.Table: + self._validate_source_path(record_path) + source_key = self._get_source_key(record_path) + + # Check for existing entry + if error_on_duplicate is None: + error_on_duplicate = self.duplicate_entry_behavior == "error" + if error_on_duplicate: + pending_table = self._pending_batches[source_key].get(entry_id, None) + if pending_table is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in pending batch for {source_key}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + existing_record = self.get_recorded_data(record_path, entry_id, flush=False) + if existing_record is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. 
" + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + + # Add entry_id column to the data + data_with_entry_id = self._ensure_entry_id_column(data, entry_id) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table - save original schema first + write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + else: + if self.duplicate_entry_behavior == "overwrite": + try: + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + logger.debug( + f"Deleted existing record {entry_id} from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing record to delete for {entry_id}: {e}" + ) + + write_deltalake( + table_path, + data_with_entry_id, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + else: + # Add to the batch for later flushing + self._pending_batches[source_key][entry_id] = data_with_entry_id + batch_size = len(self._pending_batches[source_key]) + + # Check if we need to flush + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added record {entry_id} to {source_key}") + return data + + def get_recorded_data( + self, + record_path: tuple[str, ...], + entry_id: str, + flush: bool = False, + ) -> pa.Table | None: + """ + Get a specific record by entry_id with schema preservation. + + Args: + source_path: Tuple of path components + entry_id: Unique identifier for the record + + Returns: + Arrow table for the record or None if not found + """ + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + # check if entry_id is found in pending batches + source_key = self._get_source_key(record_path) + if entry_id in self._pending_batches[source_key]: + # Return the pending record directly + return self._pending_batches[source_key][entry_id] + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read + filter_expr = self._create_entry_id_filter(entry_id) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Remove the __entry_id column before returning + return self._remove_entry_id_column(result) + + except Exception as e: + logger.error( + f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" + ) + raise e + + def get_all_records( + self, + record_path: tuple[str, ...], + add_entry_id_column: bool | str = False, + retrieve_pending: bool = True, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve all records for a given source path as a single table with schema preservation. + + Args: + source_path: Tuple of path components + add_entry_id_column: Control entry ID column inclusion: + - False: Don't include entry ID column (default) + - True: Include entry ID column as "__entry_id" + - str: Include entry ID column with custom name + + Returns: + Arrow table containing all records with original schema, or None if no records found + """ + # TODO: this currently reads everything into memory and then return. 
Consider implementation that performs everything lazily + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + collected_tables = [] + if retrieve_pending: + # Check if there are pending records in the batch + for entry_id, arrow_table in self._pending_batches[ + self._get_source_key(record_path) + ].items(): + collected_tables.append( + self._ensure_entry_id_column(arrow_table, entry_id) + ) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is not None: + try: + # Use filter-based read + result = self._read_table_with_filter(delta_table) + + if len(result) != 0: + collected_tables.append(result) + + except Exception as e: + logger.error( + f"Error getting all records from {'/'.join(record_path)}: {e}" + ) + if collected_tables: + total_table = pa.concat_tables(collected_tables) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(total_table, add_entry_id_column) + + return None + + # def get_all_records_as_polars( + # self, source_path: tuple[str, ...], flush: bool = True + # ) -> pl.LazyFrame | None: + # """ + # Retrieve all records for a given source path as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + + # Returns: + # Polars LazyFrame containing all records, or None if no records found + # """ + # all_records = self.get_all_records(source_path, flush=flush) + # if all_records is None: + # return None + # # TODO: take care of converting semantics to Python objects + # return pl.LazyFrame(all_records.as_table()) + + def get_records_by_ids( + self, + source_path: tuple[str, ...], + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve records by entry IDs as a single table with schema preservation. 
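Editorial aside: continuing the hypothetical store sketch from above, a targeted lookup by entry IDs might look like the following (the IDs are illustrative, and `store` is the instance created in the earlier sketch).

subset = store.get_records_by_ids(
    ("my_org", "project_x", "results"),
    entry_ids=["run1", "run7"],
    add_entry_id_column="record_id",
    flush=True,  # persist any pending batch before reading
)
if subset is not None:
    print(subset.column("record_id").to_pylist())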
+ + Args: + source_path: Tuple of path components + entry_ids: Entry IDs to retrieve + add_entry_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Arrow table containing all found records with original schema, or None if no records found + """ + + if flush: + self.flush_batch(source_path) + + self._validate_source_path(source_path) + + # Convert input to list of strings for consistency + if isinstance(entry_ids, list): + if not entry_ids: + return None + entry_ids_list = entry_ids + elif isinstance(entry_ids, pl.Series): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_list() + elif isinstance(entry_ids, pa.Array): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_pylist() + else: + raise TypeError( + f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + ) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read with filters + filter_expr = self._create_entry_ids_filter(entry_ids_list) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + if preserve_input_order: + raise NotImplementedError("Preserve input order is not yet implemented") + # Need to reorder results and add nulls for missing entries + import pandas as pd + + df = result.to_pandas() + df = df.set_index("__entry_id") + + # Create a DataFrame with the desired order, filling missing with NaN + ordered_df = df.reindex(entry_ids_list) + + # Convert back to Arrow + result = pa.Table.from_pandas(ordered_df.reset_index()) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(result, add_entry_id_column) + + except Exception as e: + logger.error( + f"Error getting records by IDs from {'/'.join(source_path)}: {e}" + ) + return None + + # def get_records_by_ids_as_polars( + # self, + # source_path: tuple[str, ...], + # entry_ids: list[str] | pl.Series | pa.Array, + # add_entry_id_column: bool | str = False, + # preserve_input_order: bool = False, + # flush: bool = False, + # ) -> pl.LazyFrame | None: + # """ + # Retrieve records by entry IDs as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + # entry_ids: Entry IDs to retrieve + # add_entry_id_column: Control entry ID column inclusion + # preserve_input_order: If True, return results in input order with nulls for missing + + # Returns: + # Polars LazyFrame containing all found records, or None if no records found + # """ + # arrow_result = self.get_records_by_ids( + # source_path, + # entry_ids, + # add_entry_id_column, + # preserve_input_order, + # flush=flush, + # ) + + # if arrow_result is None: + # return None + + # # Convert to Polars LazyFrame + # return pl.LazyFrame(arrow_result) + + # Additional utility methods + def list_sources(self) -> list[tuple[str, ...]]: + """ + List all available source paths. 
+ + Returns: + List of source path tuples + """ + sources = [] + + def _scan_directory(current_path: Path, path_components: tuple[str, ...]): + """Recursively scan for Delta tables.""" + for item in current_path.iterdir(): + if not item.is_dir(): + continue + + new_path_components = path_components + (item.name,) + + # Check if this directory contains a Delta table + try: + DeltaTable(str(item)) + sources.append(new_path_components) + except TableNotFoundError: + # Not a Delta table, continue scanning subdirectories + if len(new_path_components) < self.max_hierarchy_depth: + _scan_directory(item, new_path_components) + + _scan_directory(self.base_path, ()) + return sources + + def delete_source(self, source_path: tuple[str, ...]) -> bool: + """ + Delete an entire source (all records for a source path). + + Args: + source_path: Tuple of path components + + Returns: + True if source was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + table_path = self._get_table_path(source_path) + source_key = self._get_source_key(source_path) + + if not table_path.exists(): + return False + + try: + # Remove from caches + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + + # Remove directory + import shutil + + shutil.rmtree(table_path) + + logger.info(f"Deleted source {source_key}") + return True + + except Exception as e: + logger.error(f"Error deleting source {source_key}: {e}") + return False + + def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: + """ + Delete a specific record. + + Args: + source_path: Tuple of path components + entry_id: ID of the record to delete + + Returns: + True if record was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return False + + try: + # Check if record exists using proper filter + filter_expr = self._create_entry_id_filter(entry_id) + existing = self._read_table_with_filter(delta_table, filters=filter_expr) + if len(existing) == 0: + return False + + # Delete the record using SQL-style predicate (this is correct for delete operations) + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + + # Update cache + source_key = self._get_source_key(source_path) + self._delta_table_cache[source_key] = delta_table + + logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") + return True + + except Exception as e: + logger.error( + f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" + ) + return False + + def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: + """ + Get metadata information about a Delta table. 
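Editorial aside: again as a hypothetical sketch reusing the `store` instance from the earlier example, the discovery and maintenance helpers defined here compose naturally.

for source_path in store.list_sources():
    info = store.get_table_info(source_path)
    if info is not None:
        print(source_path, "version:", info["version"], "files:", info["num_files"])

# Remove a single record, or an entire source, when it is no longer needed.
store.delete_record(("my_org", "project_x", "results"), "run7")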
+ + Args: + source_path: Tuple of path components + + Returns: + Dictionary with table metadata, or None if table doesn't exist + """ + self._validate_source_path(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Get basic info + schema = delta_table.schema() + history = delta_table.history() + source_key = self._get_source_key(source_path) + + # Add pending batch info + pending_info = self.get_pending_batch_info() + pending_count = pending_info.get(source_key, 0) + + return { + "path": str(self._get_table_path(source_path)), + "source_path": source_path, + "schema": schema, + "version": delta_table.version(), + "num_files": len(delta_table.files()), + "history_length": len(history), + "latest_commit": history[0] if history else None, + "pending_records": pending_count, + } + + except Exception as e: + logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") + return None diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index f32ee89..5bab7ba 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -16,69 +16,24 @@ and type conversions between semantic stores, Python stores, and Arrow tables. """ -from orcapod.types.core import DataValue, StoreValue -from typing import TypeAlias, cast, Self -from collections.abc import Callable, Mapping, Collection -from orcapod.types import TypeSpec, default_registry +from orcapod.types.core import DataValue +from typing import TypeAlias, Self +from collections.abc import Mapping, Collection +from orcapod.types import TypeSpec +from orcapod.types.semantic_converter import SemanticConverter from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types.semantic_type_registry import SemanticTypeRegistry -from orcapod.types.core import TypeHandler +from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import schemas -from orcapod.types.typespec_utils import get_typespec_from_dict +from orcapod.types import typespec_utils as tsutils import pyarrow as pa import logging +from orcapod.utils import arrow_utils -from orcapod.hashing.defaults import get_default_arrow_hasher # Constants used for source info keys SOURCE_INFO_PREFIX = "_source_info_" -# TODO: move this to a separate module -def hstack_tables(*tables: pa.Table) -> pa.Table: - """ - Horizontally stack multiple PyArrow tables by concatenating their columns. - - All input tables must have the same number of rows and unique column names. - - Args: - *tables: Variable number of PyArrow tables to stack horizontally - - Returns: - Combined PyArrow table with all columns from input tables - - Raises: - ValueError: If no tables provided, tables have different row counts, - or duplicate column names are found - """ - if len(tables) == 0: - raise ValueError("At least one table is required for horizontal stacking.") - if len(tables) == 1: - return tables[0] - - N = len(tables[0]) - for table in tables[1:]: - if len(table) != N: - raise ValueError( - "All tables must have the same number of rows for horizontal stacking." 
- ) - - # create combined column names - all_column_names = [] - all_columns = [] - all_names = set() - for i, table in enumerate(tables): - if overlap := set(table.column_names).intersection(all_names): - raise ValueError( - f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" - ) - all_names.update(table.column_names) - all_column_names += table.column_names - all_columns += table.columns - - return pa.Table.from_arrays(all_columns, names=all_column_names) - - logger = logging.getLogger(__name__) # A conveniece packet-like type that defines a value that can be # converted to a packet. It's broader than Packet and a simple mapping @@ -90,248 +45,202 @@ def hstack_tables(*tables: pa.Table) -> pa.Table: # enforce the typespec or source_info, which are important for packet integrity. PacketLike: TypeAlias = Mapping[str, DataValue] -SemanticStore: TypeAlias = Mapping[str, StoreValue] PythonStore: TypeAlias = Mapping[str, DataValue] -def check_arrow_schema_compatibility( - incoming_schema: pa.Schema, current_schema: pa.Schema -) -> tuple[bool, list[str]]: - """ - Check if incoming schema is compatible with current schema. - - Args: - incoming_schema: Schema to validate - current_schema: Expected schema to match against - - Returns: - Tuple of (is_compatible, list_of_errors) - """ - errors = [] - - # Create lookup dictionaries for efficient access - incoming_fields = {field.name: field for field in incoming_schema} - current_fields = {field.name: field for field in current_schema} - - # Check each field in current_schema - for field_name, current_field in current_fields.items(): - if field_name not in incoming_fields: - errors.append(f"Missing field '{field_name}' in incoming schema") - continue - - incoming_field = incoming_fields[field_name] - - # Check data type compatibility - if not current_field.type.equals(incoming_field.type): - errors.append( - f"Type mismatch for field '{field_name}': " - f"expected {current_field.type}, got {incoming_field.type}" - ) - - # Check semantic_type metadata if present in current schema - current_metadata = current_field.metadata or {} - incoming_metadata = incoming_field.metadata or {} - - if b"semantic_type" in current_metadata: - expected_semantic_type = current_metadata[b"semantic_type"] - - if b"semantic_type" not in incoming_metadata: - errors.append( - f"Missing 'semantic_type' metadata for field '{field_name}'" - ) - elif incoming_metadata[b"semantic_type"] != expected_semantic_type: - errors.append( - f"Semantic type mismatch for field '{field_name}': " - f"expected {expected_semantic_type.decode()}, " - f"got {incoming_metadata[b'semantic_type'].decode()}" - ) - elif b"semantic_type" in incoming_metadata: - errors.append( - f"Unexpected 'semantic_type' metadata for field '{field_name}': " - f"{incoming_metadata[b'semantic_type'].decode()}" - ) - - return len(errors) == 0, errors - - -class SemanticConverter: - """ - Converts data between different representations (Python, semantic stores, Arrow tables). - - This class handles the conversion between Python data structures, semantic stores - (which use storage-optimized types), and Arrow tables while maintaining type - information and semantic type metadata. - """ - - @staticmethod - def prepare_handler( - semantic_schema: schemas.SemanticSchema, - semantic_type_registry: SemanticTypeRegistry, - ) -> dict[str, TypeHandler]: - """ - Prepare type handlers for semantic type conversion. 
- - Args: - semantic_schema: Schema containing semantic type information - semantic_type_registry: Registry for looking up type handlers - - Returns: - Dictionary mapping field names to their type handlers - """ - handler_lut = {} - for key, (_, semantic_type) in semantic_schema.items(): - if semantic_type is None: - continue # Skip keys without semantic type - handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( - semantic_type - ) - return handler_lut - - @classmethod - def from_typespec( - cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry - ) -> "SemanticConverter": - """ - Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). - - Args: - typespec: Type specification dictionary - semantic_type_registry: Registry for semantic type lookup - - Returns: - New SemanticConverter instance - """ - semantic_schema = schemas.from_typespec_to_semantic_schema( - typespec, semantic_type_registry - ) - python_schema = schemas.PythonSchema(typespec) - handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) - return cls(python_schema, semantic_schema, handler_lut) - - @classmethod - def from_arrow_schema( - cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry - ) -> "SemanticConverter": - """ - Create a SemanticConverter from an Arrow schema. - - Args: - arrow_schema: PyArrow schema with semantic type metadata - semantic_type_registry: Registry for semantic type lookup - - Returns: - New SemanticConverter instance - """ - semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) - python_schema = schemas.from_semantic_schema_to_python_schema( - semantic_schema, semantic_type_registry=semantic_type_registry - ) - handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) - return cls(python_schema, semantic_schema, handler_lut) - - def __init__( - self, - python_schema: schemas.PythonSchema, - semantic_schema: schemas.SemanticSchema, - handler_lut: dict[str, TypeHandler] | None = None, - ): - """ - Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. - Use class methods like `from_arrow_schema` or `from_typespec` instead. - - Args: - python_schema: Schema for Python data types - semantic_schema: Schema for semantic types - handler_lut: Optional dictionary of type handlers for conversion - """ - self.python_schema = python_schema - self.semantic_schema = semantic_schema - self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( - semantic_schema, include_source_info=False - ) - if handler_lut is None: - handler_lut = {} - self.handler_lut = handler_lut - - def from_semantic_store_to_python_store( - self, semantic_store: SemanticStore - ) -> PythonStore: - """ - Convert a semantic store to a Python store. - - Args: - semantic_store: Store (dict) with data stored in semantic (storage-optimized) types - - Returns: - Store with Python native types - """ - python_store = dict(semantic_store) - for key, handler in self.handler_lut.items(): - python_store[key] = handler.storage_to_python(semantic_store[key]) - return python_store - - def from_python_store_to_semantic_store( - self, python_store: PythonStore - ) -> SemanticStore: - """ - Convert a Python store to a semantic store. 
- - Args: - python_store: Store with Python native types - - Returns: - Store with semantic (storage-optimized) types - """ - semantic_store = dict(python_store) - for key, handler in self.handler_lut.items(): - semantic_store[key] = handler.python_to_storage(python_store[key]) - return semantic_store # type: ignore[return-value] - - def from_semantic_store_to_arrow_table( - self, semantic_store: SemanticStore - ) -> pa.Table: - """Convert a semantic store to an Arrow table.""" - return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) - - def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: - """Convert a Python store to an Arrow table.""" - semantic_store = self.from_python_store_to_semantic_store(python_store) - return self.from_semantic_store_to_arrow_table(semantic_store) - - def from_arrow_table_to_semantic_stores( - self, arrow_table: pa.Table - ) -> list[SemanticStore]: - """Convert an Arrow table to a list of semantic stores.""" - self.verify_compatible_arrow_schema(arrow_table.schema) - return arrow_table.to_pylist() # Ensure the table is materialized - - def from_arrow_table_to_python_stores( - self, arrow_table: pa.Table - ) -> list[PythonStore]: - """Convert an Arrow table to a list of Python stores.""" - return [ - self.from_semantic_store_to_python_store(semantic_store) - for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) - ] - - def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): - """ - Verify that an Arrow schema is compatible with the expected schema. - - Args: - arrow_schema: Schema to verify - - Raises: - ValueError: If schemas are incompatible - """ - compatible, errors = check_arrow_schema_compatibility( - arrow_schema, self.arrow_schema - ) - if not compatible: - raise ValueError( - "Arrow table schema is not compatible with the expected schema: " - + ", ".join(errors) - ) +# class SemanticConverter: +# """ +# Converts data between different representations (Python, semantic stores, Arrow tables). + +# SemanticConverter only tracks the semantic columns to be converted and does not +# enforce any type checking on other columns. Consequently, two completely different +# schemas could share a semantic converter if the have same named fields with identical +# semantic types. Furthermore, semantic types are defined by the association of semantic +# type name with a specific TypeHandler. + +# """ + +# @staticmethod +# def prepare_handler( +# semantic_schema: schemas.SemanticSchema, +# semantic_type_registry: SemanticTypeRegistry, +# ) -> dict[str, TypeHandler]: +# """ +# Prepare type handlers for semantic type conversion. + +# Args: +# semantic_schema: Schema containing semantic type information +# semantic_type_registry: Registry for looking up type handlers + +# Returns: +# Dictionary mapping field names to their type handlers +# """ +# handler_lut = {} +# for key, (_, semantic_type) in semantic_schema.items(): +# if semantic_type is None: +# continue # Skip keys without semantic type +# handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( +# semantic_type +# ) +# return handler_lut + +# @classmethod +# def from_typespec( +# cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry +# ) -> "SemanticConverter": +# """ +# Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). 
+ +# Args: +# typespec: Type specification dictionary +# semantic_type_registry: Registry for semantic type lookup + +# Returns: +# New SemanticConverter instance +# """ +# semantic_schema = schemas.from_typespec_to_semantic_schema( +# typespec, semantic_type_registry +# ) +# python_schema = schemas.PythonSchema(typespec) +# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) +# return cls(python_schema, semantic_schema, handler_lut) + +# @classmethod +# def from_arrow_schema( +# cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry +# ) -> "SemanticConverter": +# """ +# Create a SemanticConverter from an Arrow schema. + +# Args: +# arrow_schema: PyArrow schema with semantic type metadata +# semantic_type_registry: Registry for semantic type lookup + +# Returns: +# New SemanticConverter instance +# """ +# semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) +# python_schema = schemas.from_semantic_schema_to_python_schema( +# semantic_schema, semantic_type_registry=semantic_type_registry +# ) +# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) +# return cls(python_schema, semantic_schema, handler_lut) + +# def __init__( +# self, +# handler_lut: dict[str, tuple[str, TypeHandler]] | None = None, +# ): +# """ +# Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. +# Use class methods like `from_arrow_schema` or `from_typespec` instead. + +# Args: +# python_schema: Schema for Python data types +# semantic_schema: Schema for semantic types +# handler_lut: Optional dictionary of type handlers for conversion +# """ +# if handler_lut is None: +# handler_lut = {} +# self.handler_lut = handler_lut + +# def convert_from_semantic_to_python( +# self, semantic_value: Any, semantic_type: SemanticType +# ) -> Any: +# """ +# Convert a semantic value to a Python value. + +# Args: +# semantic_value: Value in semantic (storage-optimized) format +# semantic_type: Corresponding semantic type + +# Returns: +# Value in Python native format +# """ +# handler = self.handler_lut.get(semantic_type) +# if handler: +# return handler.to_canonical(semantic_value) +# return semantic_value + +# def from_semantic_store_to_python_store( +# self, semantic_store: SemanticStore +# ) -> dict[str, DataValue]: +# """ +# Convert a semantic store to a Python store. + +# Args: +# semantic_store: Store (dict) with data stored in semantic (storage-optimized) types + +# Returns: +# Store with Python native types +# """ +# python_store = dict(semantic_store) +# for key, handler in self.handler_lut.items(): +# python_store[key] = handler.storage_to_python(semantic_store[key]) +# # TODO: come up with a more robust handling/conversion +# return cast(dict[str, DataValue], python_store) + +# def from_python_store_to_semantic_store( +# self, python_store: PythonStore +# ) -> SemanticStore: +# """ +# Convert a Python store to a semantic store. 
+ +# Args: +# python_store: Store with Python native types + +# Returns: +# Store with semantic (storage-optimized) types +# """ +# semantic_store = dict(python_store) +# for key, handler in self.handler_lut.items(): +# semantic_store[key] = handler.python_to_storage(python_store[key]) +# return semantic_store # type: ignore[return-value] + +# def from_semantic_store_to_arrow_table( +# self, semantic_store: SemanticStore +# ) -> pa.Table: +# """Convert a semantic store to an Arrow table.""" +# return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) + +# def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: +# """Convert a Python store to an Arrow table.""" +# semantic_store = self.from_python_store_to_semantic_store(python_store) +# return self.from_semantic_store_to_arrow_table(semantic_store) + +# def from_arrow_table_to_semantic_stores( +# self, arrow_table: pa.Table +# ) -> list[SemanticStore]: +# """Convert an Arrow table to a list of semantic stores.""" +# self.verify_compatible_arrow_schema(arrow_table.schema) +# return arrow_table.to_pylist() # Ensure the table is materialized + +# def from_arrow_table_to_python_stores( +# self, arrow_table: pa.Table +# ) -> list[dict[str, DataValue]]: +# """Convert an Arrow table to a list of Python stores.""" +# return [ +# self.from_semantic_store_to_python_store(semantic_store) +# for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) +# ] + +# def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): +# """ +# Verify that an Arrow schema is compatible with the expected schema. + +# Args: +# arrow_schema: Schema to verify + +# Raises: +# ValueError: If schemas are incompatible +# """ +# compatible, errors = check_arrow_schema_compatibility( +# arrow_schema, self.arrow_schema +# ) +# if not compatible: +# raise ValueError( +# "Arrow table schema is not compatible with the expected schema: " +# + ", ".join(errors) +# ) class ImmutableDict(Mapping[str, DataValue]): @@ -401,58 +310,37 @@ def __init__( if typespec is not None: verified_typespec = dict(typespec) # TODO: enhance get_typespec_from_dict to also use info from supplied typespec dict - inferred_typespec = get_typespec_from_dict(self) + inferred_typespec = tsutils.get_typespec_from_dict(self) for key in self: if key not in verified_typespec: verified_typespec[key] = inferred_typespec[key] self._python_schema = schemas.PythonSchema(verified_typespec) # create semantic converter - if semantic_converter is not None: - if semantic_converter.python_schema != self._python_schema: - raise ValueError( - "Incompatible Python schema between packet and semantic converter: " - + str(self._python_schema) - + " vs " - + str(semantic_converter.python_schema) - ) - else: - semantic_converter = SemanticConverter.from_typespec( - self._python_schema, - semantic_type_registry or default_registry, + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + self._python_schema.to_semantic_schema( + semantic_type_registry=semantic_type_registry + ), ) self.semantic_converter = semantic_converter - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = arrow_hasher + self._arrow_hasher = arrow_hasher self._cached_table: pa.Table | None = None self._cached_content_hash: str | None = None def as_table( self, - keep_columns: Collection[str] | None = None, - drop_columns: Collection[str] | None = None, ) -> pa.Table: """Convert the packet to an Arrow table.""" - if 
keep_columns is not None and drop_columns is not None: - logger.warning( - "It is not recommended to provide both keep_columns and drop_columns. The resulting behavior may not be as expected." - ) + if self._cached_table is None: - self._cached_table = ( - self.semantic_converter.from_python_store_to_arrow_table(self.as_dict()) + self._cached_table = self.semantic_converter.from_python_to_arrow( + self, self.types() ) assert self._cached_table is not None, "Cached table should not be None" - processed_table = self._cached_table - if keep_columns is not None: - processed_table = processed_table.select(list(keep_columns)) - - if drop_columns is not None: - processed_table = processed_table.drop(list(drop_columns)) - - return processed_table + return self._cached_table def as_dict(self) -> dict[str, DataValue]: """Return dictionary representation of the datagram.""" @@ -468,7 +356,11 @@ def content_hash( Hash string of the datagram content """ if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( + if self._arrow_hasher is None: + raise ValueError( + "Arrow hasher must be provided to calculate content hash." + ) + self._cached_content_hash = self._arrow_hasher.hash_table( self.as_table(), prefix_hasher_id=True, ) @@ -480,17 +372,38 @@ def types(self) -> schemas.PythonSchema: """Return copy of the Python schema.""" return self._python_schema.copy() + @classmethod + def _from_copy( + cls, + data: Mapping[str, DataValue], + python_schema: schemas.PythonSchema, + semantic_converter: SemanticConverter, + arrow_hasher: hp.ArrowHasher | None, + ) -> Self: + """Create a new instance from copy without full initialization.""" + instance = cls.__new__(cls) + ImmutableDict.__init__(instance, data) + + # Set attributes directly + instance._python_schema = python_schema + instance.semantic_converter = semantic_converter + instance._arrow_hasher = arrow_hasher + instance._cached_table = None + instance._cached_content_hash = None + + return instance + def copy(self) -> Self: """Return a copy of the datagram.""" - return self.__class__( + return self._from_copy( self, - typespec=self.types(), - semantic_converter=self.semantic_converter, - arrow_hasher=self.arrow_hasher, + self._python_schema.copy(), + self.semantic_converter, + self._arrow_hasher, ) -class PythonDictTag(dict[str, DataValue]): +class DictTag(DictDatagram): """ A simple tag implementation using Python dictionary. @@ -498,95 +411,8 @@ class PythonDictTag(dict[str, DataValue]): to different representations like Arrow tables. """ - def as_dict(self) -> dict[str, DataValue]: - """Return dictionary representation.""" - return dict(self) - - def as_table(self) -> pa.Table: - """Convert to Arrow table representation.""" - return pa.Table.from_pylist([self]) - - def types(self) -> schemas.PythonSchema: - """ - Return Python schema (basic implementation). - - Note: This is a simplified implementation that assumes all values are strings. - """ - # TODO: provide correct implementation - return schemas.PythonSchema({k: str for k in self.keys()}) - - -class ArrowTag: - """ - A tag implementation using Arrow table backend. - - Represents a single-row Arrow table that can be converted to Python - dictionary representation while caching computed values for efficiency. - - Initialize with an Arrow table. 
- - Args: - table: Single-row Arrow table representing the tag - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__(self, table: pa.Table) -> None: - self.table = table - if len(table) != 1: - raise ValueError( - "ArrowTag should only contain a single row, " - "as it represents a single tag." - ) - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_python_dict: dict[str, DataValue] | None = None - - def keys(self) -> tuple[str, ...]: - """Return column names as a tuple.""" - return tuple(self.table.column_names) - - def types(self) -> schemas.PythonSchema: - """ - Return Python schema derived from Arrow schema. - - Returns: - TypeSpec information returned as PythonSchema. - """ - if self._cached_python_schema is None: - self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema( - self.table.schema - ).storage_schema - return self._cached_python_schema.copy() - - def as_dict(self) -> dict[str, DataValue]: - """ - Convert to Python dictionary representation. - - Returns: - Dictionary with tag data - """ - if self._cached_python_dict is None: - self._cached_python_dict = cast( - dict[str, DataValue], self.table.to_pylist()[0] - ) - return self._cached_python_dict - - def as_table(self) -> pa.Table: - """Return the underlying Arrow table.""" - return self.table - - def clear_cache(self) -> None: - """Clear cached Python representations.""" - self._cached_python_schema = None - self._cached_python_dict = None - - def __repr__(self) -> str: - """Return string representation.""" - return f"{self.as_dict()}" - - -class PythonDictPacket2(DictDatagram): +class DictPacket(DictDatagram): """ Enhanced packet implementation with source information support. @@ -637,12 +463,10 @@ def __init__( def as_table( self, - keep_columns: Collection[str] | None = None, - drop_columns: Collection[str] | None = None, include_source: bool = False, ) -> pa.Table: """Convert the packet to an Arrow table.""" - table = super().as_table(keep_columns=keep_columns, drop_columns=drop_columns) + table = super().as_table() if include_source: if self._cached_source_info_table is None: source_info_data = { @@ -661,7 +485,7 @@ def as_table( source_info_table = self._cached_source_info_table.select( [f"{SOURCE_INFO_PREFIX}{k}" for k in table.column_names] ) - table = hstack_tables(table, source_info_table) + table = arrow_utils.hstack_tables(table, source_info_table) return table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: @@ -680,219 +504,50 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value return dict_copy - def content_hash(self) -> str: - """ - Calculate content hash excluding source information. - - Returns: - Hash string of the packet content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( - self.as_table(include_source=False), prefix_hasher_id=True - ) - return self._cached_content_hash - - # use keys() implementation from dict - - def types(self) -> schemas.PythonSchema: - """ - Returns: - Packet type information as PythonSchema (dict mapping field names to types). - """ - return self._python_schema.copy() - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. 
- - Returns: - Dictionary mapping field names to their source info + def as_datagram(self, include_source: bool = False) -> DictDatagram: """ - return {key: self._source_info.get(key, None) for key in self.keys()} - - def copy(self) -> "PythonDictPacket2": - """Return a shallow copy of the packet.""" - new_packet = PythonDictPacket2(self, self.source_info()) - new_packet._cached_table = self._cached_table - new_packet._cached_content_hash = self._cached_content_hash - new_packet._python_schema = self._python_schema.copy() - new_packet.semantic_converter = self.semantic_converter - new_packet.arrow_hasher = self.arrow_hasher - return new_packet - - -class PythonDictPacket(dict[str, DataValue]): - """ - Dictionary-based Packet with source tracking and hashing. - - A dictionary-based packet that maintains source information, supports - type specifications, and provides content hashing with optional callbacks. - Includes comprehensive conversion capabilities to Arrow tables. - - Initialize packet with comprehensive configuration options. - - Args: - data: Primary packet data - source_info: Optional source information mapping - typespec: Optional type specification - finger_print: Optional fingerprint for tracking - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - - """ - - @classmethod - def create_from( - cls, - object: dp.Packet, - finger_print: str | None = None, - semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, - ) -> "PythonDictPacket": - """ - Create a PythonDictPacket from another packet object. 
- - Args: - object: Source packet to copy from - finger_print: Optional fingerprint identifier - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - - Returns: - New PythonDictPacket instance - """ - if isinstance(object, PythonDictPacket): - return object.copy() - - new_packet = PythonDictPacket( - object.as_dict(include_source=False), - object.source_info(), - dict(object.types()), - finger_print=finger_print, - semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, - post_hash_callback=post_hash_callback, - ) - return new_packet - - def __init__( - self, - data: dict[str, DataValue], - source_info: dict[str, str | None] | None = None, - typespec: TypeSpec | None = None, - finger_print: str | None = None, - semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, - ) -> None: - # normalize the data content and remove any source info keys - data = {k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX)} - contained_source_info = { - k.removeprefix(SOURCE_INFO_PREFIX): v - for k, v in data.items() - if k.startswith(SOURCE_INFO_PREFIX) - } - super().__init__(data) - - self._source_info = {**contained_source_info, **(source_info or {})} - - verified_typespec = {} - if typespec is not None: - verified_typespec = dict(typespec) - inferred_typespec = get_typespec_from_dict(self) - for key in self: - if key not in verified_typespec: - verified_typespec[key] = inferred_typespec[key] - self._typespec = verified_typespec - - self._python_schema = schemas.PythonSchema(self._typespec) - - if semantic_converter is not None: - if semantic_converter.python_schema != self._python_schema.with_source_info: - raise ValueError( - "Incompatible Python schema between packet and semantic converter: " - + str(self._python_schema.with_source_info) - + " vs " - + str(semantic_converter.python_schema) - ) - else: - semantic_converter = SemanticConverter.from_typespec( - self._python_schema.with_source_info, - semantic_type_registry or default_registry, - ) - self.semantic_converter = semantic_converter - - self._finger_print = finger_print - self._post_hash_callback = post_hash_callback - self._cached_table: pa.Table | None = None - self._cached_content_hash: str | None = None - - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = arrow_hasher - - def as_table(self, include_source: bool = False) -> pa.Table: - """Convert the packet to an Arrow table.""" - if self._cached_table is None: - self._cached_table = ( - self.semantic_converter.from_python_store_to_arrow_table( - self.as_dict(include_source=True) - ) - ) - assert self._cached_table is not None, "Cached table should not be None" - if include_source: - return self._cached_table - else: - # drop source info columns if not needed - return self._cached_table.select(list(self.keys())) - - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: - """ - Return dictionary representation. + Convert the packet to a DictDatagram. 
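        # Illustrative sketch (not part of the original patch): with include_source=True the
        # returned DictDatagram also carries the source columns, typed as str, i.e. one
        # f"{SOURCE_INFO_PREFIX}{key}" entry per data key in addition to the data keys themselves.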
Args: include_source: Whether to include source info fields Returns: - Dictionary representation of the packet + DictDatagram representation of the packet """ - dict_copy = self.copy() + data = self.as_dict(include_source=include_source) + typespec = self.types() + # append source info to typespec if requested if include_source: - for key, value in self.source_info().items(): - dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value - return dict_copy - - def content_hash(self) -> str: - """ - Calculate and return content hash. + for key in self.keys(): + typespec[f"{SOURCE_INFO_PREFIX}{key}"] = str + return DictDatagram( + data, + typespec=typespec, + semantic_converter=self.semantic_converter, + arrow_hasher=self._arrow_hasher, + ) - Computes hash of packet data content (thus excluding source info) and - optionally triggers post-hash callback if configured. + # def content_hash2(self) -> str: + # """ + # Calculate content hash excluding source information. - Returns: - Hash string of the packet content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( - self.as_table(include_source=False), prefix_hasher_id=True - ) - if self._post_hash_callback is not None and self._finger_print is not None: - self._post_hash_callback(self._finger_print, self._cached_content_hash) - return self._cached_content_hash + # Returns: + # Hash string of the packet content + # """ + # # TODO: check if this is identical to DictDatagram.content_hash + # if self._cached_content_hash is None: + # self._cached_content_hash = self._arrow_hasher.hash_table( + # self.as_table(include_source=False), prefix_hasher_id=True + # ) + # return self._cached_content_hash # use keys() implementation from dict def types(self) -> schemas.PythonSchema: - """Return packet data type information as PythonSchema (dict mapping field names to types).""" + """ + Returns: + Packet type information as PythonSchema (dict mapping field names to types). + """ return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: @@ -904,22 +559,17 @@ def source_info(self) -> dict[str, str | None]: """ return {key: self._source_info.get(key, None) for key in self.keys()} - def copy(self) -> "PythonDictPacket": + def copy(self) -> Self: """Return a shallow copy of the packet.""" - new_packet = PythonDictPacket(self, self.source_info()) - new_packet._finger_print = self._finger_print - new_packet._cached_table = self._cached_table - new_packet._cached_content_hash = self._cached_content_hash - new_packet._python_schema = self._python_schema.copy() - new_packet.semantic_converter = self.semantic_converter - new_packet.arrow_hasher = self.arrow_hasher - new_packet._post_hash_callback = self._post_hash_callback - return new_packet + instance = super().copy() + instance._source_info = self._source_info.copy() + instance._cached_source_info_table = self._cached_source_info_table + return instance -def process_table_with_source_info( +def prepare_data_and_source_tables( table: pa.Table, source_info: dict[str, str | None] | None = None -) -> tuple[tuple[str, ...], pa.Table]: +) -> tuple[pa.Table, pa.Table]: """ Process a table to ensure proper source_info columns. @@ -929,14 +579,14 @@ def process_table_with_source_info( it will take precedence over existing source_info columns in the table. 
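    # Illustrative sketch of the split performed below (SOURCE_INFO_PREFIX is defined elsewhere
    # in this module; column names are hypothetical):
    #   input:  table with columns ["x", "y"] and source_info={"y": "manual"}
    #   output: a data table with columns ["x", "y"], and a source-info table with columns
    #           [f"{SOURCE_INFO_PREFIX}x", f"{SOURCE_INFO_PREFIX}y"], where the "x" entry is
    #           null (no source info supplied) and the "y" entry is "manual".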
Returns: - Processed table with source_info columns + tuple of table without any source info and another table only containing source info columns (with prefix) """ if source_info is None: source_info = {} # Step 1: Separate source_info columns from regular columns - regular_columns = [] - regular_names = [] + data_columns = [] + data_column_names = [] existing_source_info = {} for i, name in enumerate(table.column_names): @@ -945,21 +595,19 @@ def process_table_with_source_info( base_name = name.removeprefix(SOURCE_INFO_PREFIX) existing_source_info[base_name] = table.column(i) else: - regular_columns.append(table.column(i)) - regular_names.append(name) + data_columns.append(table.column(i)) + data_column_names.append(name) # Step 2: Create source_info columns for each regular column - final_columns = [] - final_names = [] + source_info_columns = [] + source_info_column_names = [] # Add all regular columns first - final_columns.extend(regular_columns) - final_names.extend(regular_names) # Create source_info columns for each regular column num_rows = table.num_rows - for col_name in regular_names: + for col_name in data_column_names: source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" # if col_name is in source_info, use that value @@ -981,15 +629,192 @@ def process_table_with_source_info( # Use null values source_values = pa.array([None] * num_rows, type=pa.large_string()) - final_columns.append(source_values) - final_names.append(source_info_col_name) + source_info_columns.append(source_values) + source_info_column_names.append(source_info_col_name) # Step 3: Create the final table - result: pa.Table = pa.Table.from_arrays(final_columns, names=final_names) - return tuple(regular_names), result + data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) + source_info_table: pa.Table = pa.Table.from_arrays( + source_info_columns, names=source_info_column_names + ) + return data_table, source_info_table + + +class ArrowDatagram: + """ + An immutable datagram implementation using a PyArrow Table backend. + TODO: handle RecordBatch in addition to table + + This basic datagram provides functionality for type handling, + semantic conversion, and dict-based content representation while maintaining + immutability of the underlying data. + + Initialize ArrowDatagram with a PyArrow table. -class ArrowPacket: + Args: + data: Source data mapping + typespec: Optional type specification for fields + semantic_converter: Optional converter for semantic types + semantic_type_registry: Registry for semantic type lookup + arrow_hasher: Optional hasher for Arrow table content + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + # normalize the table to ensure it contains proper source columns + if len(table) != 1: + raise ValueError( + "Table must contain exactly one row to be a valid datagram." + ) + + # TODO: add check for compatible types, especially of str being pa.large_string + self._table = table + + # create semantic converter + # TODO: consider some validation of passed semantic_converter + if semantic_converter is None: + if semantic_type_registry is None: + raise ValueError( + "Semantic type registry must be provided if semantic converter is not specified." 
+ ) + semantic_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + self._table.schema, + semantic_type_registry, + ) + ) + self._semantic_converter = semantic_converter + self._arrow_hasher = arrow_hasher + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_dict: dict[str, DataValue] | None = None + self._cached_content_hash: str | None = None + + def as_table( + self, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + return self._table + + def as_dict(self) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" + if self._cached_python_dict is None: + self._cached_python_dict = self._semantic_converter.from_arrow_to_python( + self._table + )[0] + assert self._cached_python_dict is not None, "Cached dict should not be None" + return dict(self._cached_python_dict) + + def content_hash( + self, + ) -> str: + """ + Calculate and return content hash of the datagram. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + if self._arrow_hasher is None: + raise ValueError( + "Arrow hasher must be provided to calculate content hash." + ) + self._cached_content_hash = self._arrow_hasher.hash_table( + self.as_table(), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + def keys(self) -> tuple[str, ...]: + return tuple(self._table.column_names) + + def types(self) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + if self._cached_python_schema is None: + self._cached_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema(self._table.schema) + ) + return self._cached_python_schema.copy() + + @classmethod + def _from_copy( + cls, + table: pa.Table, + python_schema: schemas.PythonSchema, + semantic_converter: SemanticConverter, + hash_keys: tuple[str, ...], + arrow_hasher: hp.ArrowHasher, + ) -> Self: + """Create a new instance from copy without full initialization.""" + instance = cls.__new__(cls) + instance._table = table + instance._semantic_converter = semantic_converter + instance._arrow_hasher = arrow_hasher + + # Set attributes directly + instance._cached_content_hash = None + + return instance + + def copy(self) -> Self: + """Return a copy of the datagram.""" + new_datagram = self.__class__( + self._table, + semantic_converter=self._semantic_converter, + arrow_hasher=self._arrow_hasher, + ) + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_python_dict = self._cached_python_dict + return new_datagram + + def __repr__(self) -> str: + """Return string representation.""" + return f"{self.as_dict()}" + + +class ArrowTag(ArrowDatagram): + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. + + Args: + table: Single-row Arrow table representing the tag + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowTag should only contain a single row, " + "as it represents a single tag." 
+ ) + super().__init__( + table=table, + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + arrow_hasher=arrow_hasher, + ) + + +class ArrowPacket(ArrowDatagram): """ Arrow table-based packet implementation with comprehensive features. @@ -1014,56 +839,16 @@ class ArrowPacket: ValueError: If table doesn't contain exactly one row """ - @classmethod - def create_from( - cls, - object: dp.Packet, - semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - finger_print: str | None = None, - arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, - ) -> "ArrowPacket": - """ - Create an ArrowPacket from another packet object. - - Args: - object: Source packet to copy from - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - finger_print: Optional fingerprint identifier - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - - Returns: - New ArrowPacket instance - """ - if isinstance(object, ArrowPacket): - return object.copy() - - new_packet = ArrowPacket( - object.as_table(include_source=True), - semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - finger_print=finger_print, - arrow_hasher=arrow_hasher, - post_hash_callback=post_hash_callback, - skip_source_info_extraction=True, - ) - return new_packet - def __init__( self, - table: pa.Table, + data: pa.Table, source_info: dict[str, str | None] | None = None, semantic_converter: SemanticConverter | None = None, semantic_type_registry: SemanticTypeRegistry | None = None, - finger_print: str | None = None, arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, skip_source_info_extraction: bool = False, ) -> None: - if len(table) != 1: + if len(data) != 1: raise ValueError( "ArrowPacket should only contain a single row, " "as it represents a single packet." @@ -1073,49 +858,43 @@ def __init__( if not skip_source_info_extraction: # normalize the table to ensure it has the expected source_info columns - self._keys, self._arrow_table = process_table_with_source_info( - table, source_info - ) - else: - self._keys: tuple[str, ...] = tuple( - [c for c in table.column_names if not c.startswith(SOURCE_INFO_PREFIX)] - ) - for k in self._keys: - if f"{SOURCE_INFO_PREFIX}{k}" not in table.column_names: - raise ValueError( - f"Source info column '{SOURCE_INFO_PREFIX}{k}' is missing in the table." - ) - self._arrow_table = table - - self._finger_print = finger_print - self._post_hash_callback = post_hash_callback - - if semantic_converter is not None: - check_arrow_schema_compatibility( - semantic_converter.arrow_schema, self._arrow_table.schema + data_table, self._source_info_table = prepare_data_and_source_tables( + data, source_info ) else: - semantic_converter = SemanticConverter.from_arrow_schema( - self._arrow_table.schema, semantic_type_registry or default_registry + data_columns: tuple[str, ...] 
= tuple( + [c for c in data.column_names if not c.startswith(SOURCE_INFO_PREFIX)] ) - self.semantic_converter = semantic_converter + source_columns = [f"{SOURCE_INFO_PREFIX}{c}" for c in data_columns] + # Add conversion to large_string type + data_table = data.select(data_columns) + self._source_info_table = data.select(source_columns) - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = arrow_hasher + super().__init__( + data_table, + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + arrow_hasher=arrow_hasher, + ) - self._cached_python_packet: PythonStore | None = None - self._cached_content_hash: str | None = None - self._cached_python_schema: schemas.PythonSchema | None = None self._cached_source_info: dict[str, str | None] | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None - def as_table(self, include_source: bool = False) -> pa.Table: - """Return the Arrow table representation of the packet.""" - base_table = self._arrow_table - if not include_source: - # Select only the keys that are not source info - base_table = base_table.select(self._keys) - return base_table + def as_table( + self, + include_source: bool = False, + ) -> pa.Table: + table = super().as_table() + if include_source: + # add source_info only for existing data columns + table = arrow_utils.hstack_tables( + table, + self._source_info_table.select( + [f"{SOURCE_INFO_PREFIX}{c}" for c in table.column_names] + ), + ) + return table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: """ @@ -1127,42 +906,20 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: Returns: Dictionary representation of the packet """ - if self._cached_python_packet is None: - self._cached_python_packet = ( - self.semantic_converter.from_arrow_table_to_python_stores( - self._arrow_table - )[0] - ) + return_dict = super().as_dict() if include_source: - return dict(self._cached_python_packet) - - return {k: self._cached_python_packet[k] for k in self._keys} - - def content_hash(self) -> str: - """ - Calculate and return content hash. - - Computes hash of the Arrow table content and optionally - triggers post-hash callback if configured. 
- - Returns: - Hash string of the packet content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( - self._arrow_table, prefix_hasher_id=True + return_dict.update( + {f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items()} ) - if self._post_hash_callback is not None and self._finger_print is not None: - self._post_hash_callback(self._finger_print, self._cached_content_hash) - return self._cached_content_hash - - def types(self) -> schemas.PythonSchema: - """Return packet data type information as PythonSchema (dict mapping field names to types).""" - return self.semantic_converter.python_schema.copy() - - def keys(self) -> tuple[str, ...]: - """Return the keys of the packet.""" - return tuple(self._keys) + return return_dict + + def as_datagram(self, include_source: bool = False) -> ArrowDatagram: + table = self.as_table(include_source=include_source) + return ArrowDatagram( + table, + semantic_converter=self._semantic_converter, + arrow_hasher=self._arrow_hasher, + ) def source_info(self) -> dict[str, str | None]: """ @@ -1173,37 +930,26 @@ def source_info(self) -> dict[str, str | None]: """ if self._cached_source_info is None: self._cached_source_info = { - k: self._arrow_table[f"{SOURCE_INFO_PREFIX}{k}"][0].as_py() - for k in self._keys + k.removeprefix(SOURCE_INFO_PREFIX): v + for k, v in self._source_info_table.to_pylist()[0].items() } return self._cached_source_info.copy() - def copy(self) -> "ArrowPacket": - """Return a shallow copy of the packet.""" - new_packet = ArrowPacket( - self._arrow_table, - semantic_converter=self.semantic_converter, - finger_print=self._finger_print, - arrow_hasher=self.arrow_hasher, - post_hash_callback=self._post_hash_callback, + def copy(self) -> Self: + # TODO: restructure copy to allow for better inheritance and expansion + new_packet = self.__class__( + self.as_table(), + self.source_info(), + semantic_converter=self._semantic_converter, + arrow_hasher=self._arrow_hasher, skip_source_info_extraction=True, ) + new_packet._cached_source_info = self._cached_source_info + new_packet._cached_python_dict = self._cached_python_dict + new_packet._cached_python_schema = self._cached_python_schema new_packet._cached_content_hash = self._cached_content_hash - new_packet._cached_source_info = ( - self._cached_source_info.copy() - if self._cached_source_info is not None - else None - ) - new_packet._cached_python_packet = ( - dict(self._cached_python_packet) - if self._cached_python_packet is not None - else None - ) - return new_packet - def __repr__(self) -> str: - """Return string representation.""" - return f"{self.as_dict(include_source=False)}" + return new_packet # a batch is a tuple of a tag and a list of packets diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 95667d2..a276e23 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -261,6 +261,16 @@ def op_forward(self, *streams: dp.Stream) -> ImmutableTableStream: all_tag_typespecs = [] all_packet_typespecs = [] + joined_stream = streams[0] + for stream in streams[1:]: + joined_tag_typespec, joined_packet_typespec = joined_stream.types() + stream_tag_typespec, stream_packet_typespec = stream.types() + joined_table = joined_stream.as_table().join( + stream.as_table(), + keys=intersection_typespecs(joined_tag_typespec, stream_tag_typespec), + join_type="inner", + ) + for stream in streams: tag_typespec, packet_typespec = stream.types() all_tag_typespecs.append(tag_typespec) diff 
--git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index a6b0d0f..e6c2c96 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -4,14 +4,19 @@ from collections.abc import Callable, Collection, Iterable, Sequence from typing import Any, Literal, cast -from orcapod.data.datagrams import PythonDictPacket +from orcapod.data.datagrams import DictPacket, DictTag from orcapod.data.kernels import TrackedKernelBase from orcapod.data.operators import Join from orcapod.data.streams import PodStream +from orcapod.hashing import get_default_arrow_hasher from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp -from orcapod.types import SemanticTypeRegistry, TypeSpec, default_registry +from orcapod.protocols.store_protocols import ArrowDataStore +from orcapod.types import TypeSpec, default_registry +from orcapod.types.schemas import PythonSchema +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import typespec_utils as tsutils logger = logging.getLogger(__name__) @@ -209,6 +214,7 @@ def __init__( output_typespec: TypeSpec | Sequence[type] | None = None, label: str | None = None, semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, function_info_extractor: hp.FunctionInfoExtractor | None = None, **kwargs, ) -> None: @@ -228,36 +234,37 @@ def __init__( self.function_name = function_name super().__init__(label=label or self.function_name, **kwargs) - if semantic_type_registry is None: - # TODO: reconsider the use of default registry here - semantic_type_registry = default_registry - - self.semantic_type_registry = semantic_type_registry - self.function_info_extractor = function_info_extractor - # extract input and output types from the function signature - self._input_packet_types, self._output_packet_types = ( - tsutils.extract_function_typespecs( - self.function, - self.output_keys, - input_typespec=input_typespec, - output_typespec=output_typespec, - ) + input_packet_types, output_packet_types = tsutils.extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_typespec, + output_typespec=output_typespec, ) + self._input_packet_schema = PythonSchema(input_packet_types) + self._output_packet_schema = PythonSchema(output_packet_types) - def input_packet_types(self) -> TypeSpec: + semantic_type_registry = semantic_type_registry or default_registry + self._output_semantic_converter = SemanticConverter.from_semantic_schema( + self._output_packet_schema.to_semantic_schema(semantic_type_registry) + ) + + self.arrow_hasher = arrow_hasher or get_default_arrow_hasher() + self.function_info_extractor = function_info_extractor + + def input_packet_types(self) -> PythonSchema: """ Return the input typespec for the function pod. This is used to validate the input streams. """ - return self._input_packet_types + return self._input_packet_schema.copy() - def output_packet_types(self) -> TypeSpec: + def output_packet_types(self) -> PythonSchema: """ Return the output typespec for the function pod. This is used to validate the output streams. 
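        # Rough sketch, assuming extract_function_typespecs maps function parameters to the
        # input schema and the return annotation to output_keys: for `def f(x: int) -> str`
        # with output_keys=("y",),
        #   pod.input_packet_types()  -> PythonSchema({"x": int})
        #   pod.output_packet_types() -> PythonSchema({"y": str})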
""" - return self._output_packet_types + return self._output_packet_schema.copy() def __repr__(self) -> str: return f"FunctionPod:{self.function_name}" @@ -271,9 +278,7 @@ def __str__(self) -> str: ) return f"FunctionPod:{func_sig}" - def call( - self, tag: dp.Tag, packet: dp.Packet - ) -> tuple[dp.Tag, PythonDictPacket | None]: + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | None]: if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -302,8 +307,11 @@ def call( ) # TODO: add source info based on this function call - output_packet = PythonDictPacket( - {k: v for k, v in zip(self.output_keys, output_values)} + output_packet = DictPacket( + {k: v for k, v in zip(self.output_keys, output_values)}, + typespec=self.output_packet_types(), + semantic_converter=self._output_semantic_converter, + arrow_hasher=self.arrow_hasher, ) return tag, output_packet @@ -325,7 +333,6 @@ def identity_structure(self, *streams: dp.Stream) -> Any: "input_packet_types": self.input_packet_types(), "output_packet_types": self.output_packet_types(), } - function_info["output_keys"] = tuple(self.output_keys) id_struct = ( self.__class__.__name__, @@ -334,7 +341,8 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure if len(streams) > 0: - processed_streams = self.pre_processing_step(*streams) + # TODO: extract the common handling of input streams + processed_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*processed_streams) id_struct += (processed_streams[0],) @@ -393,7 +401,19 @@ class CachedPod(WrappedPod): This is useful for pods that are expensive to compute and can benefit from caching. """ - def __init__(self, pod: dp.Pod, cache_key: str, **kwargs): + def __init__( + self, + pod: dp.Pod, + result_store: ArrowDataStore, + lineage_store: ArrowDataStore | None, + record_path_prefix: tuple[str, ...] = (), + **kwargs, + ): super().__init__(pod, **kwargs) - self.cache_key = cache_key - self.cache: dict[str, dp.Packet] = {} + self.record_path_prefix = record_path_prefix + self.result_store = result_store + self.lineage_store = lineage_store + + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[DictTag, DictPacket | None]: ... diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 223011b..bce6585 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,6 +1,8 @@ +from orcapod.hashing.types import ArrowHasher from orcapod.protocols import data_protocols as dp -from orcapod.types import SemanticTypeRegistry, default_registry, schemas, TypeSpec -from orcapod.data.datagrams import ArrowPacket, ArrowTag, SemanticConverter +from orcapod.types import schemas, TypeSpec +from orcapod.types.semantic_types import SemanticTypeRegistry +from orcapod.data.datagrams import ArrowPacket, ArrowTag, DictTag, SemanticConverter from orcapod.data.base import LabeledContentIdentifiableBase import pyarrow as pa from collections.abc import Iterator, Collection @@ -9,6 +11,7 @@ from typing import Any, Literal import logging import warnings +from itertools import repeat # TODO: consider using this instead of making copy of dicts # from types import MappingProxyType @@ -118,7 +121,7 @@ def iter_packets( ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... @abstractmethod - def as_table(self) -> pa.Table: ... 
+ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: ... def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ @@ -252,12 +255,12 @@ def last_modified(self) -> datetime | None: return None return self._cached_stream.last_modified - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) - return self._cached_stream.as_table() + return self._cached_stream.as_table(include_content_hash=include_content_hash) def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() @@ -289,6 +292,7 @@ def __init__( source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: ArrowHasher | None = None, **kwargs, ) -> None: super().__init__(source=source, upstreams=upstreams, **kwargs) @@ -299,21 +303,31 @@ def __init__( self._packet_columns = tuple( c for c in table.column_names if c not in tag_columns ) + if len(self._packet_columns) == 0: + raise ValueError( + "No packet columns found in the table. At least one packet column is required." + ) - semantic_type_registry = semantic_type_registry or default_registry tag_schema = pa.schema( f for f in self._table.schema if f.name in self._tag_columns ) packet_schema = pa.schema( f for f in self._table.schema if f.name in self._packet_columns ) - self._tag_converter = SemanticConverter.from_arrow_schema( - tag_schema, semantic_type_registry + + self._tag_schema = tag_schema + self._packet_schema = packet_schema + self._tag_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema(tag_schema, semantic_type_registry) ) - self._packet_converter = SemanticConverter.from_arrow_schema( - packet_schema, semantic_type_registry + self._packet_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + packet_schema, semantic_type_registry + ) ) + self._arrow_hasher = arrow_hasher + self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None self._set_modified_time() # set modified time to now @@ -331,8 +345,12 @@ def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: """ # TODO: consider using MappingProxyType to avoid copying the dicts return ( - self._tag_converter.python_schema.copy(), - self._packet_converter.python_schema.copy(), + schemas.PythonSchema.from_arrow_schema( + self._tag_schema, converters=self._tag_converter.as_dict() + ), + schemas.PythonSchema.from_arrow_schema( + self._packet_schema, converters=self._packet_converter.as_dict() + ), ) def as_table(self, include_content_hash: bool | str = False) -> pa.Table: @@ -346,10 +364,10 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: "_content_hash" if include_content_hash is True else include_content_hash ) content_hashes = [packet.content_hash() for _, packet in self.iter_packets()] - self._table = self._table.append_column( + table_with_hash = self._table.append_column( hash_column_name, pa.array(content_hashes, type=pa.large_string()) ) - return self._table + return table_with_hash def clear_cache(self) -> None: """ @@ -366,14 +384,35 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: # TODO: make it work with table batch stream if self._cached_elements is None: self._cached_elements = [] - tags = self._table.select(self._tag_columns) + tag_present = len(self._tag_columns) > 0 + if 
tag_present: + tags = self._table.select(self._tag_columns) + tag_batches = tags.to_batches() + else: + tag_batches = repeat(DictTag({})) + + # TODO: come back and clean up this logic + packets = self._table.select(self._packet_columns) - for tag_batch, packet_batch in zip(tags.to_batches(), packets.to_batches()): - for i in range(len(tag_batch)): + for tag_batch, packet_batch in zip(tag_batches, packets.to_batches()): + for i in range(len(packet_batch)): + if tag_present: + tag = ArrowTag( + tag_batch.slice(i, 1), # type: ignore + semantic_converter=self._tag_converter, + arrow_hasher=self._arrow_hasher, + ) + + else: + tag = tag_batch self._cached_elements.append( ( - ArrowTag(tag_batch.slice(i, 1)), - ArrowPacket(packet_batch.slice(i, 1)), + tag, + ArrowPacket( + packet_batch.slice(i, 1), + semantic_converter=self._packet_converter, + arrow_hasher=self._arrow_hasher, + ), ) ) yield from self._cached_elements diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 2b66b52..a89ab4e 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -56,6 +56,7 @@ def __init__( handle_missing: str = "error", semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, serialization_method: str = "logical", + # TODO: consider passing options for serialization method ): """ Initialize SemanticArrowHasher. @@ -111,7 +112,11 @@ def _get_semantic_type(self, field: pa.Field) -> str | None: return None def _create_hash_column( - self, original_column: pa.Array, hash_bytes: bytes, original_field: pa.Field + self, + original_column: pa.Array, + hash_algorithm: str, + hash_bytes: bytes, + original_field: pa.Field, ) -> tuple[pa.Array, pa.Field]: """Create a new column containing the hash bytes.""" # Create array of hash bytes (one hash value repeated for each row) @@ -124,11 +129,11 @@ def _create_hash_column( "semantic_type", "unknown" ) new_metadata["semantic_type"] = "hash" - new_metadata["hash_algorithm"] = "sha256" + new_metadata["hash_algorithm"] = hash_algorithm_id new_field = pa.field( original_field.name, - pa.string(), # Hash stored as string + pa.large_string(), # Hash stored as large string nullable=original_field.nullable, metadata=new_metadata, ) @@ -152,7 +157,7 @@ def _process_table_columns(self, table: pa.Table) -> pa.Table: # Replace column with hash hash_column, hash_field = self._create_hash_column( - column, hash_bytes, field + column, hasher.hasher_id, hash_bytes, field ) new_columns.append(hash_column) new_fields.append(hash_field) diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py index 4508f95..bcd489f 100644 --- a/src/orcapod/hashing/semantic_type_hashers.py +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -30,31 +30,31 @@ def __init__( self.cacher = string_cacher self.cache_key_prefix = cache_key_prefix - def _hash_file_content(self, file_path: str) -> str: - """Hash the content of a single file and return hex string.""" + def _hash_file_content(self, file_path: str) -> bytes: + """Hash the content of a single file""" import os # if cacher exists, check if the hash is cached if self.cacher: cache_key = f"{self.cache_key_prefix}:{file_path}" - cached_hash = self.cacher.get_cached(cache_key) - if cached_hash is not None: - return cached_hash + cached_hash_hex = self.cacher.get_cached(cache_key) + if cached_hash_hex is not None: + return bytes.fromhex(cached_hash_hex) try: if not os.path.exists(file_path): if self.handle_missing == "error": 
raise FileNotFoundError(f"File not found: {file_path}") elif self.handle_missing == "skip": - return hashlib.sha256(b"").hexdigest() + return hashlib.sha256(b"").digest() elif self.handle_missing == "null_hash": - return hashlib.sha256(b"").hexdigest() + return hashlib.sha256(b"").digest() - hashed_value = self.file_hasher.hash_file(file_path).hex() + hashed_value = self.file_hasher.hash_file(file_path) if self.cacher: - # Cache the computed hash + # Cache the computed hash hex self.cacher.set_cached( - f"{self.cache_key_prefix}:{file_path}", hashed_value + f"{self.cache_key_prefix}:{file_path}", hashed_value.hex() ) return hashed_value @@ -63,7 +63,7 @@ def _hash_file_content(self, file_path: str) -> str: raise IOError(f"Cannot read file {file_path}: {e}") else: # skip or null_hash error_msg = f"" - return hashlib.sha256(error_msg.encode("utf-8")).hexdigest() + return hashlib.sha256(error_msg.encode("utf-8")).digest() def hash_column(self, column: pa.Array) -> pa.Array: """ diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index c5c1919..0cd0722 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -1,7 +1,7 @@ # A collection of versioned hashers that provide a "default" implementation of hashers. from .arrow_hashers import SemanticArrowHasher +from orcapod.utils.object_spec import parse_objectspec from orcapod.protocols.hashing_protocols import ObjectHasher -import importlib from typing import Any CURRENT_VERSION = "v0.1" @@ -9,7 +9,7 @@ versioned_semantic_arrow_hashers = { "v0.1": { "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", - "config": { + "_config": { "hasher_id": "arrow_v0.1", "hash_algorithm": "sha256", "chunk_size": 8192, @@ -17,10 +17,10 @@ "semantic_type_hashers": { "path": { "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", - "config": { + "_config": { "file_hasher": { "_class": "orcapod.hashing.file_hashers.BasicFileHasher", - "config": { + "_config": { "algorithm": "sha256", }, } @@ -34,36 +34,17 @@ versioned_object_hashers = { "v0.1": { "_class": "orcapod.hashing.object_hashers.BasicObjectHasher", - "config": { + "_config": { "hasher_id": "object_v0.1", "function_info_extractor": { "_class": "orcapod.hashing.function_info_extractors.FunctionSignatureExtractor", - "config": {"include_module": True, "include_defaults": True}, + "_config": {"include_module": True, "include_defaults": True}, }, }, } } -def parse_objectspec(obj_spec: dict) -> Any: - if "_class" in obj_spec: - # if _class is specified, treat the dict as an object specification - module_name, class_name = obj_spec["_class"].rsplit(".", 1) - module = importlib.import_module(module_name) - cls = getattr(module, class_name) - configs = parse_objectspec(obj_spec.get("config", {})) - return cls(**configs) - else: - # otherwise, parse through the dictionary recursively - parsed_object = obj_spec - for k, v in obj_spec.items(): - if isinstance(v, dict): - parsed_object[k] = parse_objectspec(v) - else: - parsed_object[k] = v - return parsed_object - - def get_versioned_semantic_arrow_hasher( version: str | None = None, ) -> SemanticArrowHasher: diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 405714f..9470c1e 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -4,9 +4,10 @@ from orcapod.core.streams import EmptyStream from orcapod.stores import ArrowDataStore from orcapod.types import Tag, Packet, PacketLike, TypeSpec, default_registry 
+from orcapod.types.legacy import packets from orcapod.types.typespec_utils import union_typespecs -from orcapod.types.semantic_type_registry import SemanticTypeRegistry -from orcapod.types import packets, schemas +from orcapod.types.legacy.semantic_type_registry import SemanticTypeRegistry +from orcapod.types import schemas from orcapod.hashing import ObjectHasher, ArrowHasher from orcapod.hashing.defaults import get_default_object_hasher, get_default_arrow_hasher from typing import Any, Literal diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 7c14e2e..1767509 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -127,6 +127,11 @@ def extract_function_info( class SemanticTypeHasher(Protocol): """Abstract base class for semantic type-specific hashers.""" + @property + def hasher_id(self) -> str: + """Unique identifier for this semantic type hasher.""" + ... + def hash_column( self, column: pa.Array, diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index e69de29..d5ca902 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -0,0 +1,34 @@ +from typing import Collection, Protocol, TYPE_CHECKING +from orcapod.protocols import data_protocols as dp +import pyarrow as pa + +if TYPE_CHECKING: + import polars as pl + + +class ArrowDataStore(Protocol): + def record_data( + self, + record_path: tuple[str, ...], + record_id: str, + data: pa.Table, + ignore_duplicates: bool = False, + ) -> str | None: ... + + def get_recorded_data( + self, + record_path: tuple[str, ...], + record_id: str, + ) -> pa.Table | None: ... + + def get_all_records(self, record_path: tuple[str, ...]) -> pa.Table | None: + """Retrieve all records for a given path as a stream.""" + ... + + def get_records_by_ids( + self, + record_path: tuple[str, ...], + record_ids: Collection[str], + add_entry_id_column: bool | str = False, + preseve_input_order: bool = False, + ) -> pa.Table: ... diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/stores/__init__.py index 1114c11..573a316 100644 --- a/src/orcapod/stores/__init__.py +++ b/src/orcapod/stores/__init__.py @@ -1,7 +1,7 @@ -from .types import DataStore, ArrowDataStore -from .arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore -from .dict_data_stores import DirDataStore, NoOpDataStore -from .safe_dir_data_store import SafeDirDataStore +from .legacy.types import DataStore, ArrowDataStore +from .legacy.legacy_arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore +from .legacy.dict_data_stores import DirDataStore, NoOpDataStore +from .legacy.safe_dir_data_store import SafeDirDataStore __all__ = [ "DataStore", diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py new file mode 100644 index 0000000..f8f0451 --- /dev/null +++ b/src/orcapod/stores/delta_lake_stores.py @@ -0,0 +1,861 @@ +import pyarrow as pa +import pyarrow.dataset as ds +import polars as pl +from pathlib import Path +from typing import Any +import logging +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError +from collections import defaultdict + + +# Module-level logger +logger = logging.getLogger(__name__) + + +class BasicDeltaTableArrowStore: + """ + A basic Delta Table-based Arrow data store with flexible hierarchical path support. 
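    # Illustrative usage sketch (paths, entry IDs, and tables are hypothetical):
    #   store = BasicDeltaTableArrowStore("/data/orcapod", duplicate_entry_behavior="overwrite")
    #   store.record_data(("org", "project", "dataset"), entry_id="abc123", data=some_arrow_table)
    #   store.flush()  # force any pending batches to be written to the Delta tables
    #   table = store.get_recorded_data(("org", "project", "dataset"), "abc123")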
+ This store does NOT implement lazy loading or streaming capabilities, therefore + being "basic" in that sense. It is designed for simple use cases where data is written + in batches and read back as complete tables. It is worth noting that the Delta table + structure created by this store IS compatible with more advanced Delta Table-based + data stores (to be implemented) that will support lazy loading and streaming. + + Uses tuple-based source paths for robust parameter handling: + - ("source_name", "source_id") -> source_name/source_id/ + - ("org", "project", "dataset") -> org/project/dataset/ + - ("year", "month", "day", "experiment") -> year/month/day/experiment/ + """ + + def __init__( + self, + base_path: str | Path, + duplicate_entry_behavior: str = "error", + create_base_path: bool = True, + max_hierarchy_depth: int = 10, + batch_size: int = 100, + ): + """ + Initialize the BasicDeltaTableArrowStore. + + Args: + base_path: Base directory path where Delta tables will be stored + duplicate_entry_behavior: How to handle duplicate entry_ids: + - 'error': Raise ValueError when entry_id already exists + - 'overwrite': Replace existing entry with new data + create_base_path: Whether to create the base path if it doesn't exist + max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) + batch_size: Number of records to batch before writing to Delta table + """ + # Validate duplicate behavior + if duplicate_entry_behavior not in ["error", "overwrite"]: + raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") + + self.duplicate_entry_behavior = duplicate_entry_behavior + self.base_path = Path(base_path) + self.max_hierarchy_depth = max_hierarchy_depth + self.batch_size = batch_size + + if create_base_path: + self.base_path.mkdir(parents=True, exist_ok=True) + elif not self.base_path.exists(): + raise ValueError( + f"Base path {self.base_path} does not exist and create_base_path=False" + ) + + # Cache for Delta tables to avoid repeated initialization + self._delta_table_cache: dict[str, DeltaTable] = {} + + # Batch management + self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) + + logger.info( + f"Initialized DeltaTableArrowDataStore at {self.base_path} " + f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " + f"batch_size={batch_size}, as" + ) + + def flush(self) -> None: + """ + Flush all pending batches immediately. + + This method is called to ensure all pending data is written to the Delta tables. + """ + try: + self.flush_all_batches() + except Exception as e: + logger.error(f"Error during flush: {e}") + + def flush_batch(self, source_path: tuple[str, ...]) -> None: + """ + Flush pending batch for a specific source path. 
+ + Args: + source_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + source_key = self._get_source_key(source_path) + + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = {} + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() + + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_existing_delta_table(source_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + entry_ids = combined_table.column("__entry_id").to_pylist() + unique_entry_ids = list(set(entry_ids)) + + # Delete existing records with these IDs + if unique_entry_ids: + entry_ids_str = "', '".join(unique_entry_ids) + delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # otherwise, only insert if same entry_id does not exist yet + delta_table.merge( + source=combined_table, + predicate="target.__entry_id = source.__entry_id", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[source_key] = pending_tables + raise + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + source_keys = list(self._pending_batches.keys()) + + # TODO: capture and re-raise exceptions at the end + for source_key in source_keys: + source_path = tuple(source_key.split("/")) + try: + self.flush_batch(source_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + + def __del__(self): + """Cleanup when object is destroyed.""" + self.flush() + + def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly + """ + Validate source path components. 
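        # Illustrative behaviour of the checks below (assuming the default max_hierarchy_depth):
        #   ("org", "project", "dataset")  -> passes validation
        #   ("bad/name",)                  -> raises ValueError (filesystem-unsafe "/" in component)
        #   ()                             -> raises ValueError (empty source path)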
+ + Args: + source_path: Tuple of path components + + Raises: + ValueError: If path is invalid + """ + if not source_path: + raise ValueError("Source path cannot be empty") + + if len(source_path) > self.max_hierarchy_depth: + raise ValueError( + f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" + ) + + # Validate path components + for i, component in enumerate(source_path): + if not component or not isinstance(component, str): + raise ValueError( + f"Source path component {i} is invalid: {repr(component)}" + ) + + # Check for filesystem-unsafe characters + unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] + if any(char in component for char in unsafe_chars): + raise ValueError( + f"Source path component contains invalid characters: {repr(component)}" + ) + + def _get_source_key(self, source_path: tuple[str, ...]) -> str: + """Generate cache key for source storage.""" + return "/".join(source_path) + + def _get_table_path(self, source_path: tuple[str, ...]) -> Path: + """Get the filesystem path for a given source path.""" + path = self.base_path + for subpath in source_path: + path = path / subpath + return path + + def _get_existing_delta_table( + self, source_path: tuple[str, ...] + ) -> DeltaTable | None: + """ + Get or create a Delta table, handling schema initialization properly. + + Args: + source_path: Tuple of path components + + Returns: + DeltaTable instance or None if table doesn't exist + """ + source_key = self._get_source_key(source_path) + table_path = self._get_table_path(source_path) + + # Check cache first + if dt := self._delta_table_cache.get(source_key): + return dt + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + self._delta_table_cache[source_key] = delta_table + logger.debug(f"Loaded existing Delta table for {source_key}") + return delta_table + except TableNotFoundError: + # Table doesn't exist + return None + except Exception as e: + logger.error(f"Error loading Delta table for {source_key}: {e}") + # Try to clear any corrupted cache and retry once + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + return None + + def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: + """Ensure the table has an __entry_id column.""" + if "__entry_id" not in arrow_data.column_names: + # Add entry_id column at the beginning + key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, "__entry_id", key_array) + return arrow_data + + def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: + """Remove the __entry_id column if it exists.""" + if "__entry_id" in arrow_data.column_names: + column_names = arrow_data.column_names + indices_to_keep = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] + arrow_data = arrow_data.select(indices_to_keep) + return arrow_data + + def _handle_entry_id_column( + self, arrow_data: pa.Table, add_entry_id_column: bool | str = False + ) -> pa.Table: + """ + Handle entry_id column based on add_entry_id_column parameter. 
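+
+        Example (illustrative; tbl stands for a table that already carries
+        an __entry_id column):
+
+            self._handle_entry_id_column(tbl, False)         # drop __entry_id
+            self._handle_entry_id_column(tbl, True)          # keep __entry_id as is
+            self._handle_entry_id_column(tbl, "record_id")   # rename __entry_id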
+ + Args: + arrow_data: Arrow table with __entry_id column + add_entry_id_column: Control entry ID column inclusion: + - False: Remove __entry_id column + - True: Keep __entry_id column as is + - str: Rename __entry_id column to custom name + """ + if add_entry_id_column is False: + # Remove the __entry_id column + return self._remove_entry_id_column(arrow_data) + elif isinstance(add_entry_id_column, str): + # Rename __entry_id to custom name + if "__entry_id" in arrow_data.column_names: + schema = arrow_data.schema + new_names = [ + add_entry_id_column if name == "__entry_id" else name + for name in schema.names + ] + return arrow_data.rename_columns(new_names) + # If add_entry_id_column is True, keep __entry_id as is + return arrow_data + + def _create_entry_id_filter(self, entry_id: str) -> list: + """ + Create a proper filter expression for Delta Lake. + + Args: + entry_id: The entry ID to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "=", entry_id)] + + def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: + """ + Create a proper filter expression for multiple entry IDs. + + Args: + entry_ids: List of entry IDs to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "in", entry_ids)] + + def _read_table_with_filter( + self, + delta_table: DeltaTable, + filters: list | None = None, + ) -> pa.Table: + """ + Read table using to_pyarrow_dataset with original schema preservation. + + Args: + delta_table: The Delta table to read from + filters: Optional filters to apply + + Returns: + Arrow table with preserved schema + """ + # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading + dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + if filters: + # Apply filters at dataset level for better performance + import pyarrow.compute as pc + + filter_expr = None + for filt in filters: + if len(filt) == 3: + col, op, val = filt + if op == "=": + expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore + elif op == "in": + expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore + else: + logger.warning( + f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." + ) + # Fallback to table-level filtering + return dataset.to_table()(filters=filters) + + if filter_expr is None: + filter_expr = expr + else: + filter_expr = pc.and_(filter_expr, expr) # type: ignore + + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) + + return dataset.to_table() + + def record_data( + self, + record_path: tuple[str, ...], + entry_id: str, + data: pa.Table, + force_flush: bool = False, + error_on_duplicate: bool | None = None, + ) -> pa.Table: + self._validate_source_path(record_path) + source_key = self._get_source_key(record_path) + + # Check for existing entry + if error_on_duplicate is None: + error_on_duplicate = self.duplicate_entry_behavior == "error" + if error_on_duplicate: + pending_table = self._pending_batches[source_key].get(entry_id, None) + if pending_table is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in pending batch for {source_key}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + existing_record = self.get_recorded_data(record_path, entry_id, flush=False) + if existing_record is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. 
" + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + + # Add entry_id column to the data + data_with_entry_id = self._ensure_entry_id_column(data, entry_id) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table - save original schema first + write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + else: + if self.duplicate_entry_behavior == "overwrite": + try: + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + logger.debug( + f"Deleted existing record {entry_id} from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing record to delete for {entry_id}: {e}" + ) + + write_deltalake( + table_path, + data_with_entry_id, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + else: + # Add to the batch for later flushing + self._pending_batches[source_key][entry_id] = data_with_entry_id + batch_size = len(self._pending_batches[source_key]) + + # Check if we need to flush + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added record {entry_id} to {source_key}") + return data + + def get_recorded_data( + self, + record_path: tuple[str, ...], + entry_id: str, + flush: bool = False, + ) -> pa.Table | None: + """ + Get a specific record by entry_id with schema preservation. + + Args: + source_path: Tuple of path components + entry_id: Unique identifier for the record + + Returns: + Arrow table for the record or None if not found + """ + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + # check if entry_id is found in pending batches + source_key = self._get_source_key(record_path) + if entry_id in self._pending_batches[source_key]: + # Return the pending record directly + return self._pending_batches[source_key][entry_id] + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read + filter_expr = self._create_entry_id_filter(entry_id) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Remove the __entry_id column before returning + return self._remove_entry_id_column(result) + + except Exception as e: + logger.error( + f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" + ) + raise e + + def get_all_records( + self, + record_path: tuple[str, ...], + add_entry_id_column: bool | str = False, + retrieve_pending: bool = True, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve all records for a given source path as a single table with schema preservation. + + Args: + source_path: Tuple of path components + add_entry_id_column: Control entry ID column inclusion: + - False: Don't include entry ID column (default) + - True: Include entry ID column as "__entry_id" + - str: Include entry ID column with custom name + + Returns: + Arrow table containing all records with original schema, or None if no records found + """ + # TODO: this currently reads everything into memory and then return. 
Consider implementation that performs everything lazily + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + collected_tables = [] + if retrieve_pending: + # Check if there are pending records in the batch + for entry_id, arrow_table in self._pending_batches[ + self._get_source_key(record_path) + ].items(): + collected_tables.append( + self._ensure_entry_id_column(arrow_table, entry_id) + ) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is not None: + try: + # Use filter-based read + result = self._read_table_with_filter(delta_table) + + if len(result) != 0: + collected_tables.append(result) + + except Exception as e: + logger.error( + f"Error getting all records from {'/'.join(record_path)}: {e}" + ) + if collected_tables: + total_table = pa.concat_tables(collected_tables) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(total_table, add_entry_id_column) + + return None + + # def get_all_records_as_polars( + # self, source_path: tuple[str, ...], flush: bool = True + # ) -> pl.LazyFrame | None: + # """ + # Retrieve all records for a given source path as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + + # Returns: + # Polars LazyFrame containing all records, or None if no records found + # """ + # all_records = self.get_all_records(source_path, flush=flush) + # if all_records is None: + # return None + # # TODO: take care of converting semantics to Python objects + # return pl.LazyFrame(all_records.as_table()) + + def get_records_by_ids( + self, + source_path: tuple[str, ...], + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve records by entry IDs as a single table with schema preservation. 
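+
+        Example (illustrative sketch; the source path and entry IDs are hypothetical):
+
+            table = store.get_records_by_ids(
+                ("experiments", "run_001"),
+                ["entry_a", "entry_b"],
+                add_entry_id_column="record_id",
+            )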
+ + Args: + source_path: Tuple of path components + entry_ids: Entry IDs to retrieve + add_entry_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Arrow table containing all found records with original schema, or None if no records found + """ + + if flush: + self.flush_batch(source_path) + + self._validate_source_path(source_path) + + # Convert input to list of strings for consistency + if isinstance(entry_ids, list): + if not entry_ids: + return None + entry_ids_list = entry_ids + elif isinstance(entry_ids, pl.Series): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_list() + elif isinstance(entry_ids, pa.Array): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_pylist() + else: + raise TypeError( + f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + ) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read with filters + filter_expr = self._create_entry_ids_filter(entry_ids_list) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + if preserve_input_order: + raise NotImplementedError("Preserve input order is not yet implemented") + # Need to reorder results and add nulls for missing entries + import pandas as pd + + df = result.to_pandas() + df = df.set_index("__entry_id") + + # Create a DataFrame with the desired order, filling missing with NaN + ordered_df = df.reindex(entry_ids_list) + + # Convert back to Arrow + result = pa.Table.from_pandas(ordered_df.reset_index()) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(result, add_entry_id_column) + + except Exception as e: + logger.error( + f"Error getting records by IDs from {'/'.join(source_path)}: {e}" + ) + return None + + # def get_records_by_ids_as_polars( + # self, + # source_path: tuple[str, ...], + # entry_ids: list[str] | pl.Series | pa.Array, + # add_entry_id_column: bool | str = False, + # preserve_input_order: bool = False, + # flush: bool = False, + # ) -> pl.LazyFrame | None: + # """ + # Retrieve records by entry IDs as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + # entry_ids: Entry IDs to retrieve + # add_entry_id_column: Control entry ID column inclusion + # preserve_input_order: If True, return results in input order with nulls for missing + + # Returns: + # Polars LazyFrame containing all found records, or None if no records found + # """ + # arrow_result = self.get_records_by_ids( + # source_path, + # entry_ids, + # add_entry_id_column, + # preserve_input_order, + # flush=flush, + # ) + + # if arrow_result is None: + # return None + + # # Convert to Polars LazyFrame + # return pl.LazyFrame(arrow_result) + + # Additional utility methods + + def get_pending_batch_info(self) -> dict[str, int]: + """ + Get information about pending batches. + + Returns: + Dictionary mapping source keys to number of pending records + """ + return { + source_key: len(tables) + for source_key, tables in self._pending_batches.items() + if tables + } + + def list_sources(self) -> list[tuple[str, ...]]: + """ + List all available source paths. 
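+
+        Example (illustrative; the paths shown are hypothetical):
+
+            store.list_sources()
+            # -> [("experiments", "run_001"), ("experiments", "run_002")]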
+ + Returns: + List of source path tuples + """ + sources = [] + + def _scan_directory(current_path: Path, path_components: tuple[str, ...]): + """Recursively scan for Delta tables.""" + for item in current_path.iterdir(): + if not item.is_dir(): + continue + + new_path_components = path_components + (item.name,) + + # Check if this directory contains a Delta table + try: + DeltaTable(str(item)) + sources.append(new_path_components) + except TableNotFoundError: + # Not a Delta table, continue scanning subdirectories + if len(new_path_components) < self.max_hierarchy_depth: + _scan_directory(item, new_path_components) + + _scan_directory(self.base_path, ()) + return sources + + def delete_source(self, source_path: tuple[str, ...]) -> bool: + """ + Delete an entire source (all records for a source path). + + Args: + source_path: Tuple of path components + + Returns: + True if source was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + table_path = self._get_table_path(source_path) + source_key = self._get_source_key(source_path) + + if not table_path.exists(): + return False + + try: + # Remove from caches + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + + # Remove directory + import shutil + + shutil.rmtree(table_path) + + logger.info(f"Deleted source {source_key}") + return True + + except Exception as e: + logger.error(f"Error deleting source {source_key}: {e}") + return False + + def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: + """ + Delete a specific record. + + Args: + source_path: Tuple of path components + entry_id: ID of the record to delete + + Returns: + True if record was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return False + + try: + # Check if record exists using proper filter + filter_expr = self._create_entry_id_filter(entry_id) + existing = self._read_table_with_filter(delta_table, filters=filter_expr) + if len(existing) == 0: + return False + + # Delete the record using SQL-style predicate (this is correct for delete operations) + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + + # Update cache + source_key = self._get_source_key(source_path) + self._delta_table_cache[source_key] = delta_table + + logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") + return True + + except Exception as e: + logger.error( + f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" + ) + return False + + def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: + """ + Get metadata information about a Delta table. 
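+
+        Example (illustrative; keys mirror the dictionary assembled below):
+
+            info = store.get_table_info(("experiments", "run_001"))
+            if info is not None:
+                print(info["version"], info["num_files"], info["pending_records"])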
+ + Args: + source_path: Tuple of path components + + Returns: + Dictionary with table metadata, or None if table doesn't exist + """ + self._validate_source_path(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Get basic info + schema = delta_table.schema() + history = delta_table.history() + source_key = self._get_source_key(source_path) + + # Add pending batch info + pending_info = self.get_pending_batch_info() + pending_count = pending_info.get(source_key, 0) + + return { + "path": str(self._get_table_path(source_path)), + "source_path": source_path, + "schema": schema, + "version": delta_table.version(), + "num_files": len(delta_table.files()), + "history_length": len(history), + "latest_commit": history[0] if history else None, + "pending_records": pending_count, + } + + except Exception as e: + logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") + return None diff --git a/src/orcapod/stores/delta_table_arrow_data_store.py b/src/orcapod/stores/legacy/delta_table_arrow_data_store.py similarity index 100% rename from src/orcapod/stores/delta_table_arrow_data_store.py rename to src/orcapod/stores/legacy/delta_table_arrow_data_store.py diff --git a/src/orcapod/stores/dict_data_stores.py b/src/orcapod/stores/legacy/dict_data_stores.py similarity index 99% rename from src/orcapod/stores/dict_data_stores.py rename to src/orcapod/stores/legacy/dict_data_stores.py index c4eff60..718fef0 100644 --- a/src/orcapod/stores/dict_data_stores.py +++ b/src/orcapod/stores/legacy/dict_data_stores.py @@ -7,7 +7,7 @@ from orcapod.hashing.legacy_core import hash_packet from orcapod.hashing.types import LegacyPacketHasher from orcapod.hashing.defaults import get_default_composite_file_hasher -from orcapod.stores.types import DataStore +from orcapod.stores.legacy.types import DataStore from orcapod.types import Packet, PacketLike logger = logging.getLogger(__name__) diff --git a/src/orcapod/stores/dict_transfer_data_store.py b/src/orcapod/stores/legacy/dict_transfer_data_store.py similarity index 97% rename from src/orcapod/stores/dict_transfer_data_store.py rename to src/orcapod/stores/legacy/dict_transfer_data_store.py index 7e8762f..fe7a52a 100644 --- a/src/orcapod/stores/dict_transfer_data_store.py +++ b/src/orcapod/stores/legacy/dict_transfer_data_store.py @@ -1,6 +1,6 @@ # Implements transfer data store that lets you transfer memoized packets between data stores. 
-from orcapod.stores.types import DataStore +from orcapod.stores.legacy.types import DataStore from orcapod.types import PacketLike diff --git a/src/orcapod/stores/arrow_data_stores.py b/src/orcapod/stores/legacy/legacy_arrow_data_stores.py similarity index 100% rename from src/orcapod/stores/arrow_data_stores.py rename to src/orcapod/stores/legacy/legacy_arrow_data_stores.py diff --git a/src/orcapod/stores/safe_dir_data_store.py b/src/orcapod/stores/legacy/safe_dir_data_store.py similarity index 99% rename from src/orcapod/stores/safe_dir_data_store.py rename to src/orcapod/stores/legacy/safe_dir_data_store.py index e02e9cc..72f8ef0 100644 --- a/src/orcapod/stores/safe_dir_data_store.py +++ b/src/orcapod/stores/legacy/safe_dir_data_store.py @@ -10,7 +10,7 @@ from pathlib import Path from typing import Optional, Union -from .file_utils import atomic_copy, atomic_write +from ..file_utils import atomic_copy, atomic_write logger = logging.getLogger(__name__) diff --git a/src/orcapod/stores/types.py b/src/orcapod/stores/legacy/types.py similarity index 100% rename from src/orcapod/stores/types.py rename to src/orcapod/stores/legacy/types.py diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index 179a253..ca29627 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,30 +1,16 @@ -from .core import Tag, PathLike, PathSet, TypeSpec, DataValue, StoreValue -from .semantic_type_registry import SemanticTypeRegistry -from .semantic_type_handlers import PathHandler, UUIDHandler, DateTimeHandler -from . import semantic_type_handlers +from .core import PathLike, PathSet, TypeSpec, DataValue from . import typespec_utils +from .defaults import DEFAULT_REGISTRY as default_registry Packet = dict[str, str] PacketLike = Packet -# Create default registry and register handlers -default_registry = SemanticTypeRegistry() - -# Register with semantic names - registry extracts supported types automatically -default_registry.register("path", PathHandler()) -default_registry.register("uuid", UUIDHandler()) -default_registry.register( - "datetime", DateTimeHandler() -) # Registers for datetime, date, time __all__ = [ - "default_registry", - "Tag", "TypeSpec", "PathLike", "PathSet", - "semantic_type_handlers", "typespec_utils", "DataValue", - "StoreValue", + "default_registry", ] diff --git a/src/orcapod/types/arrow_utils.py b/src/orcapod/types/arrow_utils.py new file mode 100644 index 0000000..c446901 --- /dev/null +++ b/src/orcapod/types/arrow_utils.py @@ -0,0 +1,10 @@ +import pyarrow as pa + + +def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: + """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, + no field names should collide.""" + merged_fields = [] + for schema in schemas: + merged_fields.extend(schema) + return pa.schema(merged_fields) diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 98b49b8..b43d21a 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -4,6 +4,7 @@ import logging +logger = logging.getLogger(__name__) DataType: TypeAlias = type @@ -11,18 +12,6 @@ str, DataType ] # Mapping of parameter names to their types -logger = logging.getLogger(__name__) - - -# class TypeSpec(dict[str, DataType]): -# def __init__(self, *args, **kwargs): -# """ -# TypeSpec is a mapping of parameter names to their types. -# It can be used to define the expected types of parameters in a function or a pod. 
-# """ -# super().__init__(*args, **kwargs) - - # Convenience alias for anything pathlike PathLike = str | os.PathLike @@ -30,10 +19,6 @@ # Note that TagValue can be nested, allowing for an arbitrary depth of nested lists TagValue: TypeAlias = int | str | None | Collection["TagValue"] -# the top level tag is a mapping from string keys to values that can be a string or -# an arbitrary depth of nested list of strings or None -Tag: TypeAlias = Mapping[str, TagValue] - # a pathset is a path or an arbitrary depth of nested list of paths PathSet: TypeAlias = PathLike | Collection[PathLike | None] @@ -46,55 +31,4 @@ # Either the original PathSet or one of our supported simple data types DataValue: TypeAlias = ExtendedSupportedPythonData | Collection["DataValue"] | None -StoreValue: TypeAlias = SupportedNativePythonData | Collection["StoreValue"] | None - PacketLike: TypeAlias = Mapping[str, DataValue] - - -# class PodFunction(Protocol): -# """ -# A function suitable to be used in a FunctionPod. -# It takes one or more named arguments, each corresponding to either: -# - A path to a file or directory (PathSet) - for backward compatibility -# - A simple data value (str, int, float, bool, bytes, Path) -# and returns either None, a single value, or a list of values -# """ - -# def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... - - -class TypeHandler(Protocol): - """Protocol for handling conversion between Python type and Arrow - data types used for storage. - - The handler itself IS the definition of a semantic type. The semantic type - name/identifier is provided by the registerer when registering the handler. - - TypeHandlers should clearly communicate what Python types they can handle, - and focus purely on conversion logic. - """ - - def python_type(self) -> type: - """Return the Python type(s) this handler can process. - - Returns: - Python type the handler supports - - Examples: - - PathHandler: return Path - - NumericHandler: return (int, float) - - CollectionHandler: return (list, tuple, set) - """ - ... - - def storage_type(self) -> type: - """Return the Arrow DataType instance for schema definition.""" - ... - - def python_to_storage(self, value: Any) -> Any: - """Convert Python value to Arrow-compatible storage representation.""" - ... - - def storage_to_python(self, value: Any) -> Any: - """Convert storage representation back to Python object.""" - ... diff --git a/src/orcapod/types/defaults.py b/src/orcapod/types/defaults.py new file mode 100644 index 0000000..f7b5773 --- /dev/null +++ b/src/orcapod/types/defaults.py @@ -0,0 +1,51 @@ +# A collection of versioned hashers that provide a "default" implementation of hashers. 
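+#
+# Usage sketch (illustrative): downstream code is expected to import the shared
+# registry rather than building its own, e.g.
+#
+#     from orcapod.types.defaults import DEFAULT_REGISTRY
+#     path_type = DEFAULT_REGISTRY.get_semantic_type("path")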
+from orcapod.utils.object_spec import parse_objectspec + + +from orcapod.types.semantic_types import ( + SemanticTypeRegistry, + SemanticType, + CanonicalPath, + PathlibPathConverter, + ArrowStringPathConverter, +) + +CURRENT_VERSION = "v0.1" + + +semantic_path_objectspec = { + "v0.1": { + "_class": "orcapod.types.semantic_types.SemanticType", + "_config": { + "name": "path", + "description": "File system path representation", + "python_converters": [ + { + "_class": "orcapod.types.semantic_types.PathlibPathConverter", + } + ], + "arrow_converters": [ + { + "_class": "orcapod.types.semantic_types.ArrowStringPathConverter", + } + ], + }, + } +} + +semantic_registry_objectspec = { + "v0.1": { + "_class": "orcapod.types.semantic_types.SemanticTypeRegistry", + "_config": {"semantic_types": [semantic_path_objectspec["v0.1"]]}, + } +} + + +SEMANTIC_PATH = SemanticType[CanonicalPath]( + "path", + "File system path representation", + python_converters=[PathlibPathConverter()], + arrow_converters=[ArrowStringPathConverter()], +) + +DEFAULT_REGISTRY = SemanticTypeRegistry([SEMANTIC_PATH]) diff --git a/src/orcapod/types/packets.py b/src/orcapod/types/legacy/packets.py similarity index 99% rename from src/orcapod/types/packets.py rename to src/orcapod/types/legacy/packets.py index a5836b1..7950d5b 100644 --- a/src/orcapod/types/packets.py +++ b/src/orcapod/types/legacy/packets.py @@ -2,7 +2,7 @@ from typing import TypeAlias, Any from collections.abc import Mapping, Collection from orcapod.types.core import TypeSpec, Tag, TypeHandler -from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types.legacy.semantic_type_registry import SemanticTypeRegistry from orcapod.types import schemas from orcapod.types.typespec_utils import get_typespec_from_dict import pyarrow as pa diff --git a/src/orcapod/types/semantic_type_handlers.py b/src/orcapod/types/legacy/semantic_type_handlers.py similarity index 100% rename from src/orcapod/types/semantic_type_handlers.py rename to src/orcapod/types/legacy/semantic_type_handlers.py diff --git a/src/orcapod/types/semantic_type_registry.py b/src/orcapod/types/legacy/semantic_type_registry.py similarity index 99% rename from src/orcapod/types/semantic_type_registry.py rename to src/orcapod/types/legacy/semantic_type_registry.py index 2091904..6934bae 100644 --- a/src/orcapod/types/semantic_type_registry.py +++ b/src/orcapod/types/legacy/semantic_type_registry.py @@ -1,6 +1,6 @@ import logging import pyarrow as pa -from .core import TypeHandler +from ..core import TypeHandler from dataclasses import dataclass # This mapping is expected to be stable diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py index 31e56d5..57f0551 100644 --- a/src/orcapod/types/schemas.py +++ b/src/orcapod/types/schemas.py @@ -1,5 +1,10 @@ +from typing import Self from orcapod.types.core import DataType, TypeSpec -from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types.semantic_types import ( + SemanticType, + SemanticTypeRegistry, + PythonArrowConverter, +) import pyarrow as pa import datetime @@ -58,23 +63,140 @@ class PythonSchema(dict[str, DataType]): {'name': , 'age': } """ - @property - def with_source_info(self) -> dict[str, type]: + def copy(self) -> "PythonSchema": + return PythonSchema(self) + + def to_semantic_schema( + self, semantic_type_registry: SemanticTypeRegistry + ) -> "SemanticSchema": """ - Get the schema with source info fields included. 
+ Convert the Python schema to a semantic schema using the provided semantic type registry. + + Parameters + ---------- + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. Returns ------- - dict[str, type|None] - A new schema including source info fields. + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + + Examples + -------- + >>> python_schema = PythonSchema(name=str, age=int) + >>> semantic_schema = python_schema.to_semantic_schema(registry) + >>> print(semantic_schema) + {'name': (str, None), 'age': (int, None)} """ - return {**self, **{f"_source_info_{k}": str for k in self.keys()}} + return SemanticSchema.from_typespec(self, semantic_type_registry) - def copy(self) -> "PythonSchema": - return PythonSchema(self) + def to_arrow_schema( + self, + semantic_type_registry: SemanticTypeRegistry | None = None, + converters: dict[str, PythonArrowConverter] | None = None, + ) -> pa.Schema: + """ + Convert the Python schema to an Arrow schema. + If converters are provided, they are used to convert the schema. Note that + no validation is performed on the converters, so they must be compatible with the schema. + """ + if converters is not None: + # If converters are provided, use them to convert the schema + fields = [] + for field_name, python_type in self.items(): + if field_name in converters: + converter = converters[field_name] + arrow_type = converter.arrow_type + metadata = None + if converter.semantic_type_name is not None: + metadata = { + b"semantic_type": converter.semantic_type_name.encode( + "utf-8" + ) + } + else: + arrow_type = python_to_arrow_type(python_type) + metadata = None + fields.append(pa.field(field_name, arrow_type, metadata=metadata)) + return pa.schema(fields) + + if semantic_type_registry is None: + raise ValueError( + "semantic_type_registry must be provided if converters are not" + ) + # Otherwise, convert using the semantic type registry + return self.to_semantic_schema(semantic_type_registry).to_arrow_schema() + + @classmethod + def from_semantic_schema(cls, semantic_schema: "SemanticSchema") -> Self: + """ + Create a PythonSchema from a SemanticSchema. + + Parameters + ---------- + semantic_schema : SemanticSchema + The semantic schema to convert. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + return cls(semantic_schema.get_python_types()) + + @classmethod + def from_arrow_schema( + cls, + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry | None = None, + converters: dict[str, PythonArrowConverter] | None = None, + ) -> Self: + """ + Create a PythonSchema from an Arrow schema. + + Parameters + ---------- + arrow_schema : pa.Schema + The Arrow schema to convert. + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. + skip_system_columns : bool, optional + Whether to skip system columns (default is True). + converters : dict[str, PythonArrowConverter], optional + A dictionary of converters to use for converting the schema. If provided, the schema will be + converted using the converters. If not provided, the schema will be converted using the semantic type + registry. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. 
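+
+        Examples
+        --------
+        Illustrative sketch; ``registry`` stands in for a configured
+        SemanticTypeRegistry and is not defined here:
+
+        >>> arrow_schema = pa.schema([("name", pa.large_string()), ("age", pa.int64())])
+        >>> python_schema = PythonSchema.from_arrow_schema(arrow_schema, registry)
+        >>> print(python_schema)  # e.g. {'name': <class 'str'>, 'age': <class 'int'>}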
+ """ + if converters is not None: + # If converters are provided, use them to convert the schema + python_types = {} + for field in arrow_schema: + # TODO: consider performing validation of semantic type + if field.name in converters: + converter = converters[field.name] + python_types[field.name] = converter.python_type + else: + python_types[field.name] = arrow_to_python_type(field.type) + return cls(python_types) + + if semantic_type_registry is None: + raise ValueError( + "semantic_type_registry must be provided if converters are not" + ) + semantic_schema = SemanticSchema.from_arrow_schema( + arrow_schema, + semantic_type_registry, + ) + return cls(semantic_schema.get_python_types()) -class SemanticSchema(dict[str, tuple[type, str | None]]): +class SemanticSchema(dict[str, type | SemanticType]): """ A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. @@ -84,275 +206,152 @@ class SemanticSchema(dict[str, tuple[type, str | None]]): ---------- keys : str The keys of the schema. - values : tuple[type, str|None] - The types and optional semantic type corresponding to each key. + values : type | SemanticType + Either type for simple fields or SemanticType for semantic fields. Examples -------- - >>> schema = SemanticSchema(image=(str, 'path'), age=(int, None)) + >>> schema = SemanticSchema(image=SemanticType('path'), age=int) >>> print(schema) - {'image': (, 'path'), 'age': (, None)} + {"image": SemanticType(name='path'), "age": })} """ - def get_store_type(self, key: str) -> type | None: + def get_semantic_fields(self) -> dict[str, SemanticType]: """ - Get the storage type for a given key in the schema. - - Parameters - ---------- - key : str - The key for which to retrieve the storage type. + Get a dictionary of semantic fields in the schema. Returns ------- - type | None - The storage type associated with the key, or None if not found. + dict[str, SemanticType] + A dictionary mapping keys to their corresponding SemanticType. """ - return self.get(key, (None, None))[0] + return {k: v for k, v in self.items() if isinstance(v, SemanticType)} - def get_semantic_type(self, key: str) -> str | None: + def get_python_types(self) -> dict[str, type]: """ - Get the semantic type for a given key in the schema. - - Parameters - ---------- - key : str - The key for which to retrieve the semantic type. + Get the Python types for all keys in the schema. Returns ------- - str | None - The semantic type associated with the key, or None if not found. + dict[str, type] + A dictionary mapping keys to their corresponding Python types. """ - return self.get(key, (None, None))[1] + return { + k: v.get_default_python_type() if isinstance(v, SemanticType) else v + for k, v in self.items() + } - @property - def storage_schema(self) -> PythonSchema: + def get_arrow_types(self) -> dict[str, tuple[pa.DataType, str | None]]: """ - Get the storage schema, which is a PythonSchema representation of the semantic schema. + Get the Arrow types for all keys in the schema. Returns ------- - PythonSchema - A new schema mapping keys to Python types. + dict[str, tuple[pa.DataType, str|None]] + A dictionary mapping keys to tuples of Arrow types. If the field has a semantic type, + the second element of the tuple is the semantic type name; otherwise, it is None. 
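+
+        Examples
+        --------
+        Illustrative sketch; ``path_type`` stands in for a registered "path"
+        SemanticType whose default Arrow type is a large string:
+
+        >>> schema = SemanticSchema(image=path_type, age=int)
+        >>> arrow_types = schema.get_arrow_types()
+        >>> # e.g. {'image': (large_string, 'path'), 'age': (int64, None)}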
""" - return PythonSchema({k: v[0] for k, v in self.items()}) - - @property - def storage_schema_with_source_info(self) -> dict[str, type]: + return { + k: (v.get_default_arrow_type(), v.name) + if isinstance(v, SemanticType) + else (python_to_arrow_type(v), None) + for k, v in self.items() + } + + def to_arrow_schema(self) -> pa.Schema: """ - Get the storage schema with source info fields included. + Get the Arrow schema, which is a PythonSchema representation of the semantic schema. Returns ------- - dict[str, type] - A new schema including source info fields. - - Examples - -------- - >>> semantic_schema = SemanticSchema(name=(str, 'name'), age=(int, None)) - >>> storage_schema = semantic_schema.storage_schema_with_source_info - >>> print(storage_schema) - {'name': , 'age': , '_source_info_name': , '_source_info_age': } + PythonSchema + A new schema mapping keys to Python types. """ - return self.storage_schema.with_source_info - - -def from_typespec_to_semantic_schema( - typespec: TypeSpec, - semantic_type_registry: SemanticTypeRegistry, -) -> SemanticSchema: - """ - Convert a Python schema to a semantic schema using the provided semantic type registry. - - Parameters - ---------- - typespec : TypeSpec - The typespec to convert, mapping keys to Python types. - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - - Examples - -------- - >>> typespec: TypeSpec = dict(name=str, age=int) - >>> semantic_schema = from_typespec_to_semanticn_schema(typespec, registry) - >>> print(semantic_schema) - {'name': (, None), 'age': (, None)} - """ - semantic_schema = {} - for key, python_type in typespec.items(): - if python_type in semantic_type_registry: - type_info = semantic_type_registry.get_type_info(python_type) - assert type_info is not None, ( - f"Type {python_type} should be found in the registry as `in` returned True" - ) - semantic_schema[key] = (type_info.storage_type, type_info.semantic_type) - else: - semantic_schema[key] = (python_type, None) - return SemanticSchema(semantic_schema) - - -def from_semantic_schema_to_python_schema( - semantic_schema: SemanticSchema, - semantic_type_registry: SemanticTypeRegistry, -) -> PythonSchema: - """ - Convert a semantic schema to a Python schema using the provided semantic type registry. - - Parameters - ---------- - semantic_schema : SemanticSchema - The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. 
- - Examples - -------- - >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) - >>> python_schema = from_semantic_schema_to_python_schema(semantic_schema, registry) - >>> print(python_schema) - {'name': , 'age': } - """ - python_schema_content = {} - for key, (python_type, semantic_type) in semantic_schema.items(): - if semantic_type is not None: - # If the semantic type is registered, use the corresponding Python type - python_type = semantic_type_registry.get_python_type(semantic_type) - python_schema_content[key] = python_type - return PythonSchema(python_schema_content) - - -def from_semantic_schema_to_arrow_schema( - semantic_schema: SemanticSchema, - include_source_info: bool = True, -) -> pa.Schema: - """ - Convert a semantic schema to an Arrow schema. - - Parameters - ---------- - semantic_schema : SemanticSchema - The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. - - Returns - ------- - dict[str, type] - A new schema mapping keys to Arrow-compatible types. - - Examples - -------- - >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) - >>> arrow_schema = from_semantic_schema_to_arrow_schema(semantic_schema) - >>> print(arrow_schema) - {'name': str, 'age': int} - """ - fields = [] - for field_name, (python_type, semantic_type) in semantic_schema.items(): - arrow_type = DEFAULT_ARROW_TYPE_LUT[python_type] - field_metadata = ( - {b"semantic_type": semantic_type.encode("utf-8")} if semantic_type else {} - ) - fields.append(pa.field(field_name, arrow_type, metadata=field_metadata)) - - if include_source_info: - for field in semantic_schema: - field_metadata = {b"field_type": b"source_info"} - fields.append( - pa.field( - f"_source_info_{field}", pa.large_string(), metadata=field_metadata + fields = [] + for k, (arrow_type, semantic_type_name) in self.get_arrow_types().items(): + if semantic_type_name is not None: + field = pa.field( + k, + arrow_type, + metadata={b"semantic_type": semantic_type_name.encode("utf-8")}, ) - ) - - return pa.schema(fields) + else: + field = pa.field(k, arrow_type) + fields.append(field) + return pa.schema(fields) -def from_arrow_schema_to_semantic_schema( - arrow_schema: pa.Schema, -) -> SemanticSchema: - """ - Convert an Arrow schema to a semantic schema. - - Parameters - ---------- - arrow_schema : pa.Schema - The schema to convert, containing fields with metadata. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + def to_python_schema(self) -> PythonSchema: + """ + Get the Python schema, which is a PythonSchema representation of the semantic schema. - Examples - -------- - >>> arrow_schema = pa.schema([pa.field('name', pa.string(), metadata={'semantic_type': 'name'}), - ... 
pa.field('age', pa.int64(), metadata={'semantic_type': 'age'})]) - >>> semantic_schema = from_arrow_schema_to_semantic_schema(arrow_schema) - >>> print(semantic_schema) - {'name': (str, 'name'), 'age': (int, 'age')} - """ - semantic_schema = {} - for field in arrow_schema: - if field.name.startswith("_source_info_") or ( - field.metadata and field.metadata.get(b"field_type", b"") == b"source_info" - ): - # Skip source info fields - continue - - semantic_type = None - if field.metadata is not None: - semantic_type = field.metadata.get(b"semantic_type", None) - semantic_type = semantic_type.decode() if semantic_type else None - python_type = arrow_to_python_type(field.type) - semantic_schema[field.name] = (python_type, semantic_type) - return SemanticSchema(semantic_schema) - - -def from_typespec_to_arrow_schema( - typespec: TypeSpec, - semantic_type_registry: SemanticTypeRegistry, - include_source_info: bool = True, -) -> pa.Schema: - semantic_schema = from_typespec_to_semantic_schema(typespec, semantic_type_registry) - return from_semantic_schema_to_arrow_schema( - semantic_schema, include_source_info=include_source_info - ) - - -def from_arrow_schema_to_python_schema( - arrow_schema: pa.Schema, - semantic_type_registry: SemanticTypeRegistry, -) -> PythonSchema: - """ - Convert an Arrow schema to a Python schema. + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + return PythonSchema.from_semantic_schema(self) + + @classmethod + def from_arrow_schema( + cls, + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry, + ) -> Self: + """ + Create a SemanticSchema from an Arrow schema. - Parameters - ---------- - arrow_schema : pa.Schema - The schema to convert, containing fields with metadata. + Parameters + ---------- + arrow_schema : pa.Schema + The Arrow schema to convert. - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. + Returns + ------- + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. 
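+
+        Examples
+        --------
+        Illustrative sketch; ``registry`` stands in for a registry that knows the
+        "path" semantic type and supports a large_string Arrow representation for it:
+
+        >>> arrow_schema = pa.schema(
+        ...     [pa.field("image", pa.large_string(), metadata={b"semantic_type": b"path"}),
+        ...      pa.field("age", pa.int64())]
+        ... )
+        >>> schema = SemanticSchema.from_arrow_schema(arrow_schema, registry)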
+ """ - Examples - -------- - >>> arrow_schema = pa.schema([pa.field('name', pa.string()), pa.field('age', pa.int64())]) - >>> python_schema = from_arrow_schema_to_python_schema(arrow_schema) - >>> print(python_schema) - {'name': , 'age': } - """ - semantic_schema = from_arrow_schema_to_semantic_schema(arrow_schema) - return from_semantic_schema_to_python_schema( - semantic_schema, semantic_type_registry - ) + semantic_schema = {} + for field in arrow_schema: + field_type = None + if field.metadata is not None: + semantic_type_name = field.metadata.get(b"semantic_type", b"").decode() + if semantic_type_name: + semantic_type = semantic_type_registry.get_semantic_type( + semantic_type_name + ) + if semantic_type is None: + raise ValueError( + f"Semantic type '{semantic_type_name}' not found in registry" + ) + if not semantic_type.supports_arrow_type(field.type): + raise ValueError( + f"Semantic type '{semantic_type.name}' does not support Arrow field of type '{field.type}'" + ) + field_type = semantic_type + + if ( + field_type is None + ): # was not set to semantic type, so fallback to simple conversion + field_type = arrow_to_python_type(field.type) + + semantic_schema[field.name] = field_type + return cls(semantic_schema) + + @classmethod + def from_typespec( + cls, + typespec: TypeSpec, + semantic_type_registry: SemanticTypeRegistry, + ) -> Self: + semantic_schema = {} + for key, python_type in typespec.items(): + semantic_type = semantic_type_registry.get_semantic_type_for_python_type( + python_type + ) + if semantic_type is not None: + semantic_schema[key] = semantic_type + else: + semantic_schema[key] = python_type + return cls(semantic_schema) diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py new file mode 100644 index 0000000..8dc0df1 --- /dev/null +++ b/src/orcapod/types/semantic_converter.py @@ -0,0 +1,86 @@ +from orcapod.types.semantic_types import PythonArrowConverter +from orcapod.types.schemas import PythonSchema, SemanticSchema +from orcapod.types import typespec_utils as tsutils + +from typing import Any, Mapping, Self +import pyarrow as pa +import logging + +logger = logging.getLogger(__name__) + + +class SemanticConverter: + @classmethod + def from_semantic_schema(cls, semantic_schema: SemanticSchema) -> Self: + converter_lut = {} + for ( + field, + semantic_type, + ) in semantic_schema.get_semantic_fields().items(): + converter_lut[field] = PythonArrowConverter.from_semantic_type( + semantic_type + ) + return cls(converter_lut) + + def __init__( + self, + converter_lut: dict[str, PythonArrowConverter], + ): + self._converter_lut = converter_lut + + def from_python_to_arrow_schema(self, python_schema: PythonSchema) -> pa.Schema: + """Convert a Python schema to an Arrow schema""" + return python_schema.to_arrow_schema(converters=self._converter_lut) + + def from_arrow_to_python_schema(self, arrow_schema: pa.Schema) -> PythonSchema: + """Convert an Arrow schema to a Python schema""" + return PythonSchema.from_arrow_schema( + arrow_schema, converters=self._converter_lut + ) + + def from_python_to_arrow( + self, python_data: Mapping[str, Any], python_schema: PythonSchema | None = None + ) -> pa.Table: + """Convert a dictionary of Python values to Arrow arrays""" + if python_schema is None: + # infer schema from data + python_schema = PythonSchema(tsutils.get_typespec_from_dict(python_data)) + logger.warning( + f"Inferred schema {python_schema} from Python data {python_data}. Note that this may not behave as expected." 
+ ) + + arrow_schema = self.from_python_to_arrow_schema(python_schema) + + arrow_data = {} + for field, value in python_data.items(): + if field in self._converter_lut: + converter = self._converter_lut[field] + arrow_data[field] = converter.from_python_to_arrow(value) + else: + arrow_data[field] = [value] + return pa.Table.from_pydict(arrow_data, schema=arrow_schema) + + def from_arrow_to_python(self, arrow_data: pa.Table) -> list[dict[str, Any]]: + """Convert a dictionary of Arrow arrays to Python values""" + + values = [] + for column_name in arrow_data.column_names: + column = arrow_data[column_name] + if column_name not in self._converter_lut: + values.append(column.to_pylist()) + else: + converter = self._converter_lut[column_name] + values.append(converter.from_arrow_to_python(column)) + all_entries = [] + + for entry in zip(*values): + assert len(entry) == len(arrow_data.column_names), ( + "Mismatch in number of columns and values" + ) + all_entries.append(dict(zip(arrow_data.column_names, entry))) + + return all_entries + + def as_dict(self) -> dict[str, PythonArrowConverter]: + """Return the converter lookup table as a dictionary.""" + return self._converter_lut.copy() diff --git a/src/orcapod/types/semantic_types.py b/src/orcapod/types/semantic_types.py new file mode 100644 index 0000000..169da69 --- /dev/null +++ b/src/orcapod/types/semantic_types.py @@ -0,0 +1,569 @@ +from typing import Any, Self, cast +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path +import pyarrow as pa +from collections.abc import Collection + + +# Converter interfaces using modern generics with ABC +class PythonConverter[T, R](ABC): + """ + Abstract base class for converters between canonical and Python representation types. + T: canonical type, R: Python representation type + """ + + def __init__(self): + # Automatically infer types from inheritance + self._python_type = self._infer_python_type() + + def _infer_python_type(self) -> type[R]: + """Infer the Python type from __orig_bases__""" + for base in getattr(self.__class__, "__orig_bases__", []): + if hasattr(base, "__origin__") and issubclass( + base.__origin__, PythonConverter + ): + # Get the R type parameter (second argument) + args = getattr(base, "__args__", ()) + if len(args) >= 2: + return args[1] # R is the second type parameter + raise RuntimeError(f"Could not infer Python type for {self.__class__.__name__}") + + @abstractmethod + def to_canonical(self, value: R) -> T: + """Convert from Python representation to canonical form""" + pass + + @abstractmethod + def from_canonical(self, value: T) -> R: + """Convert from canonical to Python representation form""" + pass + + @abstractmethod + def can_handle(self, python_type: type) -> bool: ... + + def get_python_type(self) -> type[R]: + """Get the Python type this converter converts into (auto-inferred)""" + return self._python_type + + +class ArrowConverter[T](ABC): + """ + Abstract base class for converters between canonical and Arrow representation types. + T: canonical type + """ + + @abstractmethod + def to_canonical(self, value: pa.Array) -> list[T]: + """Convert from Arrow representation to canonical form""" + pass + + @abstractmethod + def from_canonical(self, value: T | Collection[T]) -> pa.Array: + """Convert from canonical to Arrow representation""" + pass + + @abstractmethod + def can_handle(self, arrow_type: pa.DataType) -> bool: ... 
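+
+    # Usage sketch (illustrative): given a concrete converter ``c`` and an Arrow
+    # array ``arr``, callers check ``c.can_handle(arr.type)`` before calling
+    # ``c.to_canonical(arr)``; ``c.from_canonical(values)`` goes the other way and
+    # produces an array of the type reported by ``c.get_arrow_type()``.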
+ + @abstractmethod + def get_arrow_type(self) -> pa.DataType: + """Get the Arrow DataType this converter handles""" + pass + + +# Canonical types with explicit definitions +@dataclass(frozen=True) +class CanonicalPath: + """Canonical representation of a file system path""" + + path_str: str + is_absolute: bool = False + + def __str__(self) -> str: + return self.path_str + + def __post_init__(self) -> None: + if not self.path_str: + raise ValueError("Path string cannot be empty") + + +@dataclass(frozen=True) +class CanonicalTimestamp: + """Canonical representation of a timestamp""" + + timestamp: int + timezone: str = "UTC" + + def __post_init__(self) -> None: + if self.timestamp < 0: + raise ValueError("Timestamp cannot be negative") + + +@dataclass(frozen=True) +class CanonicalURL: + """Canonical representation of a URL""" + + url: str + scheme: str + host: str + + def __post_init__(self) -> None: + if not self.url.startswith(f"{self.scheme}://"): + raise ValueError(f"URL must start with {self.scheme}://") + + +# Python converters for Path +class PathlibPathConverter(PythonConverter[CanonicalPath, Path]): + """Converter for pathlib.Path objects""" + + def to_canonical(self, value: Path) -> CanonicalPath: + return CanonicalPath(path_str=str(value), is_absolute=value.is_absolute()) + + def from_canonical(self, value: CanonicalPath) -> Path: + return Path(value.path_str) + + def can_handle(self, python_type: type) -> bool: + return issubclass(python_type, Path) + + +# Arrow converters for Path +class ArrowStringPathConverter(ArrowConverter[CanonicalPath]): + """Converter for Arrow string representation of paths""" + + def to_canonical(self, value: pa.Array) -> list[CanonicalPath]: + return [ + CanonicalPath(v, is_absolute=Path(v).is_absolute()) + for v in value.to_pylist() + ] + + def from_canonical( + self, value: CanonicalPath | Collection[CanonicalPath] + ) -> pa.Array: + if isinstance(value, CanonicalPath): + value = [value] + return pa.array([v.path_str for v in value], type=pa.large_string()) + + def can_handle(self, arrow_type: pa.DataType) -> bool: + return arrow_type == pa.large_string() + + def get_arrow_type(self) -> pa.DataType: + return pa.large_string() + + +# Enhanced SemanticType with explicit Python and Arrow handling +class SemanticType[T]: + """ + Represents a semantic type with explicit Python/Arrow converters. + + A SemanticType is a central concept that: + 1. Defines a canonical representation (T) for a domain concept + 2. Manages separate Python and Arrow converters + 3. Provides explicit methods for Python and Arrow operations + 4. Maintains type safety while allowing runtime discovery + + Type parameter T represents the canonical representation type. 
+ """ + + def __init__( + self, + name: str, + description: str = "", + python_converters: Collection[PythonConverter[T, Any]] | None = None, + arrow_converters: Collection[ArrowConverter[T]] | None = None, + ): + self.name = name + self.description = description + + self._python_type_converters: list[PythonConverter[T, Any]] = [] + self._arrow_type_converters: list[ArrowConverter[T]] = [] + + # Default converters + self._default_python_converter: PythonConverter[T, Any] | None = None + self._default_arrow_converter: ArrowConverter[T] | None = None + + if python_converters is not None: + for converter in python_converters: + self.register_python_converter( + converter, + set_default=self._default_python_converter is None, + force=False, + ) + + if arrow_converters is not None: + for converter in arrow_converters: + self.register_arrow_converter( + converter, + set_default=self._default_arrow_converter is None, + force=False, + ) + + def get_default_python_type(self) -> type[T]: + """Get the default Python type for this semantic type""" + if self._default_python_converter: + return self._default_python_converter.get_python_type() + raise ValueError( + f"No default Python converter registered for semantic type '{self.name}'" + ) + + def get_default_arrow_type(self) -> pa.DataType: + """Get the default Arrow DataType for this semantic type""" + if self._default_arrow_converter: + return self._default_arrow_converter.get_arrow_type() + raise ValueError( + f"No default Arrow converter registered for semantic type '{self.name}'" + ) + + def register_python_converter[R]( + self, + converter: PythonConverter[T, R], + set_default: bool = False, + force: bool = False, + ): + """ + Register a Python converter + """ + if converter not in self._python_type_converters: + self._python_type_converters.append(converter) + + if set_default: + if self._default_python_converter is not None and not force: + raise ValueError( + f"Default Python converter already set for semantic type '{self.name}'" + ) + self._default_python_converter = converter + + def register_arrow_converter( + self, + converter: ArrowConverter[T], + set_default: bool = False, + force: bool = False, + ) -> None: + """Register an Arrow converter""" + if converter not in self._arrow_type_converters: + self._arrow_type_converters.append(converter) + + if set_default: + if self._default_arrow_converter is not None and not force: + raise ValueError( + f"Default Arrow converter already set for semantic type '{self.name}'" + ) + self._default_arrow_converter = converter + + # Python-specific methods + def get_python_converter_for_type( + self, python_type: type + ) -> PythonConverter[T, Any] | None: + """Find a Python converter that can handle the given type""" + for converter in self._python_type_converters: + if converter.can_handle(python_type): + return converter + return None + + def get_arrow_converter_for_type( + self, arrow_type: pa.DataType + ) -> ArrowConverter[T] | None: + """Find an Arrow converter for the given Arrow DataType""" + for converter in self._arrow_type_converters: + if converter.can_handle(arrow_type): + return converter + return None + + def get_python_converter_with_output_type( + self, output_type: type + ) -> PythonConverter[T, Any] | None: + """Get a Python converter that can handle the specified output type""" + for converter in self._python_type_converters: + if issubclass(converter.get_python_type(), output_type): + return converter + return None + + def get_arrow_converter_with_output_type( + self, output_type: 
pa.DataType + ) -> ArrowConverter[T] | None: + for converter in self._arrow_type_converters: + if output_type == converter.get_arrow_type(): + return converter + return None + + def supports_python_type(self, python_type: type) -> bool: + return self.get_python_converter_for_type(python_type) is not None + + def supports_arrow_type(self, arrow_type: pa.DataType) -> bool: + return self.get_arrow_converter_for_type(arrow_type) is not None + + @property + def default_python_converter(self) -> PythonConverter[T, Any] | None: + """Get the default Python converter""" + return self._default_python_converter + + @property + def default_arrow_converter(self) -> ArrowConverter[T] | None: + return self._default_arrow_converter + + def to_canonical_from_python(self, value: Any) -> T: + """Convert Python value to canonical form""" + converter = self.get_python_converter_for_type(type(value)) + if not converter: + raise ValueError( + f"No Python converter found for {type(value)} in semantic type '{self.name}'" + ) + + return converter.to_canonical(value) + + def from_canonical_to_python( + self, value: T, target_type: type | None = None + ) -> Any: + """Convert from canonical to Python representation""" + if target_type is None: + converter = self.default_python_converter + if not converter: + raise ValueError( + f"No default Python converter for semantic type '{self.name}'" + ) + else: + converter = self.get_python_converter_for_type(target_type) + if not converter: + raise ValueError( + f"No converter found for target type '{target_type}' in semantic type '{self.name}'" + ) + + return converter.from_canonical(value) + + def to_canonical_from_arrow(self, value: pa.Array) -> list[T]: + """Convert Arrow value to canonical form using explicit Arrow DataType""" + converter = self.get_arrow_converter_for_type(value.type) + if not converter: + raise ValueError( + f"No Arrow converter found for type '{value.type}' in semantic type '{self.name}'" + ) + + canonical = converter.to_canonical(value) + + return canonical + + def from_canonical_to_arrow( + self, value: T, target_type: pa.DataType | None = None + ) -> Any: + """Convert from canonical to Arrow representation using explicit Arrow DataType""" + + if target_type is None: + converter = self.default_arrow_converter + if not converter: + raise ValueError( + f"No default Arrow converter for semantic type '{self.name}'" + ) + else: + converter = self.get_arrow_converter_for_type(target_type) + if not converter: + raise ValueError( + f"No Arrow converter found for target type '{target_type}' in semantic type '{self.name}'" + ) + + return converter.from_canonical(value) + + def get_python_types(self) -> list[type]: + """Get all supported output Python DataTypes""" + return [ + converter.get_python_type() for converter in self._python_type_converters + ] + + def get_arrow_types(self) -> list[pa.DataType]: + """Get all supported output Arrow DataTypes""" + return [converter.get_arrow_type() for converter in self._arrow_type_converters] + + # Cross-system conversion methods + def convert_python_to_arrow( + self, python_value: Any, arrow_type: pa.DataType | None = None + ) -> Any: + """Convert directly from Python to Arrow representation""" + canonical = self.to_canonical_from_python(python_value) + return self.from_canonical_to_arrow(canonical, arrow_type) + + def convert_arrow_to_python( + self, arrow_value, python_type: type | None = None + ) -> list[Any]: + """Convert directly from Arrow to Python representation""" + canonical_values = 
self.to_canonical_from_arrow(arrow_value) + return [ + self.from_canonical_to_python(value, target_type=python_type) + for value in canonical_values + ] + + def __str__(self) -> str: + return f"SemanticType(name='{self.name}')" + + def __repr__(self) -> str: + python_count = len(self._python_type_converters) + arrow_count = len(self._arrow_type_converters) + return ( + f"SemanticType(name='{self.name}', " + f"python_converters={python_count}, " + f"arrow_converters={arrow_count})" + ) + + +# Registry with explicit Python and Arrow handling +class SemanticTypeRegistry: + """Registry that manages SemanticType objects with explicit Python/Arrow operations""" + + def __init__(self, semantic_types: Collection[SemanticType] | None = None): + self._semantic_type_lut: dict[str, SemanticType] = {} + self._python_to_semantic_lut: dict[type, SemanticType] = {} + if semantic_types is not None: + for semantic_type in semantic_types: + self.register_semantic_type(semantic_type) + + def register_semantic_type[T](self, semantic_type: SemanticType[T]): + """Register a semantic type""" + if semantic_type.name not in self._semantic_type_lut: + self._semantic_type_lut[semantic_type.name] = semantic_type + else: + raise ValueError( + f"Semantic type {self._semantic_type_lut[semantic_type.name]} is already registered for semantic name {semantic_type.name}" + ) + + python_type = semantic_type.get_default_python_type() + if python_type is None: + raise ValueError( + f"Semantic type {semantic_type.name} does not have a default Python type" + ) + if python_type in self._python_to_semantic_lut: + raise ValueError( + f"Python type {python_type} is already registered for semantic type {self._python_to_semantic_lut[python_type]}" + ) + self._python_to_semantic_lut[python_type] = semantic_type + + def get_semantic_type_for_python_type( + self, python_type: type + ) -> SemanticType | None: + """Get a semantic type by Python type""" + return self._python_to_semantic_lut.get(python_type) + + def get_semantic_type(self, name: str) -> SemanticType | None: + """Get a semantic type by name""" + return self._semantic_type_lut.get(name) + + def list_semantic_types(self) -> list[SemanticType]: + """Get all registered semantic types""" + return list(self._semantic_type_lut.values()) + + def supports_python_type(self, python_type: type) -> bool: + """Check if registry supports the given Python type""" + return python_type in self._python_to_semantic_lut + + # Python-specific registry methods + def supports_semantic_and_arrow_type( + self, semantic_type_name: str, arrow_type: pa.DataType + ) -> bool: + """Check if registry supports the given semantic type and Arrow DataType combination""" + semantic_type = self._semantic_type_lut.get(semantic_type_name) + if not semantic_type: + return False + return semantic_type.supports_arrow_type(arrow_type) + + +# Type-safe wrapper for semantic values +class SemanticValue[T]: + """Type-safe wrapper for semantic values""" + + def __init__(self, value: T, semantic_type: SemanticType[T]): + self._value = value + self._semantic_type = semantic_type + + @property + def value(self) -> T: + return self._value + + @property + def semantic_type(self) -> SemanticType[T]: + return self._semantic_type + + def to_python(self) -> Any: + """Convert to Python representation""" + return self._semantic_type.from_canonical_to_python(self._value) + + def to_python_type(self, python_type: type) -> Any: + """Convert to Arrow representation using specific Arrow DataType""" + return 
self._semantic_type.from_canonical_to_python(self._value, target_type=python_type)
+
+    def to_arrow(self) -> Any:
+        """Convert to Arrow representation using default dtype"""
+        return self._semantic_type.from_canonical_to_arrow(self._value)
+
+    def to_arrow_with_type(self, arrow_type: pa.DataType) -> Any:
+        """Convert to Arrow representation using specific Arrow DataType"""
+        return self._semantic_type.from_canonical_to_arrow(self._value, arrow_type)
+
+    @classmethod
+    def from_python(cls, python_value: Any, semantic_type: SemanticType[T]) -> Self:
+        """Create from a Python value"""
+        canonical = semantic_type.to_canonical_from_python(python_value)
+        return cls(canonical, semantic_type)
+
+    @classmethod
+    def from_arrow(cls, arrow_value: Any, semantic_type: SemanticType[T]) -> Self:
+        """Create from an Arrow value with explicit Arrow DataType"""
+        canonical = semantic_type.to_canonical_from_arrow(arrow_value)
+        if len(canonical) != 1:
+            raise ValueError(
+                f"Expected single value from Arrow, got {len(canonical)} values"
+            )
+        return cls(canonical[0], semantic_type)
+
+    def __str__(self) -> str:
+        return f"SemanticValue({self._value}, {self._semantic_type.name})"
+
+    def __repr__(self) -> str:
+        return f"SemanticValue(value={self._value!r}, semantic_type={self._semantic_type.name})"
+
+
+class PythonArrowConverter[T, R]:
+    @classmethod
+    def from_semantic_type(cls, semantic_type: SemanticType[T]) -> Self:
+        """Create a PythonArrowConverter from a SemanticType"""
+        python_converter = semantic_type.default_python_converter
+        arrow_converter = semantic_type.default_arrow_converter
+
+        if not python_converter or not arrow_converter:
+            raise ValueError(
+                f"Semantic type '{semantic_type.name}' does not have default converters"
+            )
+
+        return cls(python_converter, arrow_converter, semantic_type.name)
+
+    def __init__(
+        self,
+        python_converter: PythonConverter[T, R],
+        arrow_converter: ArrowConverter[T],
+        semantic_type_name: str | None = None,
+    ):
+        self.python_converter = python_converter
+        self.arrow_converter = arrow_converter
+        self.semantic_type_name = semantic_type_name
+
+    @property
+    def python_type(self) -> type[R]:
+        """Get the Python type this converter handles"""
+        return self.python_converter.get_python_type()
+
+    @property
+    def arrow_type(self) -> pa.DataType:
+        """Get the Arrow DataType this converter handles"""
+        return self.arrow_converter.get_arrow_type()
+
+    def from_python_to_arrow(self, python_value: R | Collection[R]) -> pa.Array:
+        """Convert from Python to Arrow representation"""
+        if isinstance(python_value, self.python_type):
+            python_value = [python_value]
+        assert isinstance(python_value, Collection), (
+            "Expected a collection of values at this point"
+        )
+        python_values = cast(Collection[R], python_value)
+        canonicals = [self.python_converter.to_canonical(val) for val in python_values]
+        return self.arrow_converter.from_canonical(canonicals)
+
+    def from_arrow_to_python(self, arrow_value: pa.Array) -> list[R]:
+        """Convert from Arrow to Python representation"""
+        canonical = self.arrow_converter.to_canonical(arrow_value)
+        return [self.python_converter.from_canonical(value) for value in canonical]
diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py
index 71318aa..940820f 100644
--- a/src/orcapod/types/typespec_utils.py
+++ b/src/orcapod/types/typespec_utils.py
@@ -232,34 +232,62 @@ def get_compatible_type(type1: Any, type2: Any) -> Any:
     raise TypeError(f"Types {type1} and {type2} are not compatible")
 
 
-def union_typespecs(left: TypeSpec, 
right: TypeSpec) -> TypeSpec: +def union_typespecs(*typespecs: TypeSpec) -> TypeSpec: # Merge the two TypeSpecs but raise an error if conflicts in types are found - merged = dict(left) - for key, right_type in right.items(): - merged[key] = ( - get_compatible_type(merged[key], right_type) - if key in merged - else right_type - ) + merged = dict(typespecs[0]) + for typespec in typespecs[1:]: + for key, right_type in typespec.items(): + merged[key] = ( + get_compatible_type(merged[key], right_type) + if key in merged + else right_type + ) return merged -def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: +def intersection_typespecs(*typespecs: TypeSpec) -> TypeSpec: """ - Returns the intersection of two TypeSpecs, only returning keys that are present in both. + Returns the intersection of all TypeSpecs, only returning keys that are present in all typespecs. If a key is present in both TypeSpecs, the type must be the same. """ # Find common keys and ensure types match - common_keys = set(left.keys()).intersection(set(right.keys())) - intersection = {} - for key in common_keys: - try: - intersection[key] = get_compatible_type(left[key], right[key]) - except TypeError: - # If types are not compatible, raise an error - raise TypeError( - f"Type conflict for key '{key}': {left[key]} vs {right[key]}" - ) + common_keys = set(typespecs[0].keys()) + for typespec in typespecs[1:]: + common_keys.intersection_update(typespec.keys()) + + intersection = {k: typespecs[0][k] for k in common_keys} + for typespec in typespecs[1:]: + for key in common_keys: + try: + intersection[key] = get_compatible_type( + intersection[key], typespec[key] + ) + except TypeError: + # If types are not compatible, raise an error + raise TypeError( + f"Type conflict for key '{key}': {intersection[key]} vs {typespec[key]}" + ) return intersection + + +# def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: +# """ +# Returns the intersection of two TypeSpecs, only returning keys that are present in both. +# If a key is present in both TypeSpecs, the type must be the same. +# """ + +# # Find common keys and ensure types match +# common_keys = set(left.keys()).intersection(set(right.keys())) +# intersection = {} +# for key in common_keys: +# try: +# intersection[key] = get_compatible_type(left[key], right[key]) +# except TypeError: +# # If types are not compatible, raise an error +# raise TypeError( +# f"Type conflict for key '{key}': {left[key]} vs {right[key]}" +# ) + +# return intersection diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py new file mode 100644 index 0000000..5a072de --- /dev/null +++ b/src/orcapod/utils/arrow_utils.py @@ -0,0 +1,126 @@ +# TODO: move this to a separate module + +import pyarrow as pa + + +def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: + """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, + no field names should collide.""" + merged_fields = [] + for schema in schemas: + merged_fields.extend(schema) + return pa.schema(merged_fields) + + +def hstack_tables(*tables: pa.Table) -> pa.Table: + """ + Horizontally stack multiple PyArrow tables by concatenating their columns. + + All input tables must have the same number of rows and unique column names. 
+ + Args: + *tables: Variable number of PyArrow tables to stack horizontally + + Returns: + Combined PyArrow table with all columns from input tables + + Raises: + ValueError: If no tables provided, tables have different row counts, + or duplicate column names are found + """ + if len(tables) == 0: + raise ValueError("At least one table is required for horizontal stacking.") + if len(tables) == 1: + return tables[0] + + N = len(tables[0]) + for table in tables[1:]: + if len(table) != N: + raise ValueError( + "All tables must have the same number of rows for horizontal stacking." + ) + + # create combined column names + all_column_names = [] + all_columns = [] + all_names = set() + for i, table in enumerate(tables): + if overlap := set(table.column_names).intersection(all_names): + raise ValueError( + f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" + ) + all_names.update(table.column_names) + all_column_names += table.column_names + all_columns += table.columns + + return pa.Table.from_arrays(all_columns, names=all_column_names) + + +def check_arrow_schema_compatibility( + incoming_schema: pa.Schema, target_schema: pa.Schema, strict: bool = False +) -> tuple[bool, list[str]]: + # TODO: add strict comparison + """ + Check if incoming schema is compatible with current schema. + + Args: + incoming_schema: Schema to validate + target_schema: Expected schema to match against + strict: If True, requires exact match of field names and types. If False (default), + incoming_schema can have additional fields or different types as long as they are compatible. + + Returns: + Tuple of (is_compatible, list_of_errors) + """ + errors = [] + + # Create lookup dictionaries for efficient access + incoming_fields = {field.name: field for field in incoming_schema} + target_fields = {field.name: field for field in target_schema} + + # Check each field in target_schema + for field_name, target_field in target_fields.items(): + if field_name not in incoming_fields: + errors.append(f"Missing field '{field_name}' in incoming schema") + continue + + incoming_field = incoming_fields[field_name] + + # Check data type compatibility + if not target_field.type.equals(incoming_field.type): + # TODO: if not strict, allow type coercion + errors.append( + f"Type mismatch for field '{field_name}': " + f"expected {target_field.type}, got {incoming_field.type}" + ) + + # Check semantic_type metadata if present in current schema + current_metadata = target_field.metadata or {} + incoming_metadata = incoming_field.metadata or {} + + if b"semantic_type" in current_metadata: + expected_semantic_type = current_metadata[b"semantic_type"] + + if b"semantic_type" not in incoming_metadata: + errors.append( + f"Missing 'semantic_type' metadata for field '{field_name}'" + ) + elif incoming_metadata[b"semantic_type"] != expected_semantic_type: + errors.append( + f"Semantic type mismatch for field '{field_name}': " + f"expected {expected_semantic_type.decode()}, " + f"got {incoming_metadata[b'semantic_type'].decode()}" + ) + elif b"semantic_type" in incoming_metadata: + errors.append( + f"Unexpected 'semantic_type' metadata for field '{field_name}': " + f"{incoming_metadata[b'semantic_type'].decode()}" + ) + + # If strict mode, check for additional fields in incoming schema + if strict: + for field_name in incoming_fields: + if field_name not in target_fields: + errors.append(f"Unexpected field '{field_name}' in incoming schema") + + return len(errors) == 0, errors diff --git 
a/src/orcapod/utils/object_spec.py b/src/orcapod/utils/object_spec.py index dd09e1f..8949622 100644 --- a/src/orcapod/utils/object_spec.py +++ b/src/orcapod/utils/object_spec.py @@ -1,20 +1,29 @@ import importlib +from typing import Any -def parse_objectspec(obj_spec: dict) -> Any: - if "_class" in obj_spec: - # if _class is specified, treat the dict as an object specification - module_name, class_name = obj_spec["_class"].rsplit(".", 1) - module = importlib.import_module(module_name) - cls = getattr(module, class_name) - configs = parse_objectspec(obj_spec.get("config", {})) - return cls(**configs) - else: - # otherwise, parse through the dictionary recursively - parsed_object = obj_spec - for k, v in obj_spec.items(): - if isinstance(v, dict): +def parse_objectspec(obj_spec: Any) -> Any: + if isinstance(obj_spec, dict): + if "_class" in obj_spec: + # if _class is specified, treat the dict as an object specification, looking for + # _config key to extract configuration parameters + module_name, class_name = obj_spec["_class"].rsplit(".", 1) + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + configs = parse_objectspec(obj_spec.get("_config", {})) + return cls(**configs) + else: + # otherwise, parse through the dictionary recursively + parsed_object = obj_spec + for k, v in obj_spec.items(): parsed_object[k] = parse_objectspec(v) - else: - parsed_object[k] = v - return parsed_object + return parsed_object + elif isinstance(obj_spec, list): + # if it's a list, parse each item in the list + return [parse_objectspec(item) for item in obj_spec] + elif isinstance(obj_spec, tuple): + # if it's a tuple, parse each item in the tuple + return tuple(parse_objectspec(item) for item in obj_spec) + else: + # if it's neither a dict nor a list, return it as is + return obj_spec diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index 09d84d7..d7f6a3c 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -13,7 +13,7 @@ LegacyPacketHasher, LegacyPathSetHasher, ) -from orcapod.stores.dict_data_stores import DirDataStore +from orcapod.stores.legacy.dict_data_stores import DirDataStore class MockFileHasher(LegacyFileHasher): diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index 2a6e253..0c50292 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -12,7 +12,7 @@ LegacyDefaultCompositeFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.stores.legacy.dict_data_stores import DirDataStore, NoOpDataStore def test_integration_with_cached_file_hasher(temp_dir, sample_files): diff --git a/tests/test_store/test_noop_data_store.py b/tests/test_store/test_noop_data_store.py index 4ff838f..564b449 100644 --- a/tests/test_store/test_noop_data_store.py +++ b/tests/test_store/test_noop_data_store.py @@ -3,7 +3,7 @@ import pytest -from orcapod.stores.dict_data_stores import NoOpDataStore +from orcapod.stores.legacy.dict_data_stores import NoOpDataStore def test_noop_data_store_memoize(): diff --git a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index 4721691..f4076d6 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -6,8 +6,8 @@ import pytest from orcapod.hashing.types import LegacyPacketHasher -from 
orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore -from orcapod.stores.dict_transfer_data_store import TransferDataStore +from orcapod.stores.legacy.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.stores.legacy.dict_transfer_data_store import TransferDataStore class MockPacketHasher(LegacyPacketHasher): From 29b800401a6118f4b7bb8789a5b0cfed6bfa0c2c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 19 Jul 2025 02:41:36 +0000 Subject: [PATCH 076/224] feat: implement data context to capture shared hashing and semantic context information easily --- src/orcapod/data/context.py | 70 +++++ src/orcapod/data/datagrams.py | 387 ++++++------------------ src/orcapod/data/kernels.py | 17 ++ src/orcapod/data/operators.py | 8 + src/orcapod/data/pods.py | 133 ++++++-- src/orcapod/data/streams.py | 143 +++++++-- src/orcapod/protocols/data_protocols.py | 68 ++++- src/orcapod/types/arrow_utils.py | 129 +++++++- src/orcapod/types/semantic_converter.py | 3 +- src/orcapod/types/typespec_utils.py | 9 +- src/orcapod/utils/arrow_utils.py | 124 ++++++++ 11 files changed, 735 insertions(+), 356 deletions(-) create mode 100644 src/orcapod/data/context.py diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py new file mode 100644 index 0000000..cc47cff --- /dev/null +++ b/src/orcapod/data/context.py @@ -0,0 +1,70 @@ +from typing import Self +from orcapod.types.semantic_types import SemanticTypeRegistry +from orcapod.types import default_registry +from orcapod.protocols import hashing_protocols as hp +from orcapod.hashing.defaults import get_default_arrow_hasher, get_default_object_hasher +from dataclasses import dataclass + + +DATA_CONTEXT_COLUMN = "_orcapod_context_key" + + +@dataclass +class DataContext: + context_key: str + semantic_type_registry: SemanticTypeRegistry + arrow_hasher: hp.ArrowHasher + object_hasher: hp.ObjectHasher + + @staticmethod + def get_data_context_column() -> str: + """ + Returns the column name used to store the data context key in Arrow tables. + """ + return DATA_CONTEXT_COLUMN + + @staticmethod + def resolve_data_context(data_context: "str | DataContext | None") -> "DataContext": + """ + Returns the default data context manager. + This is typically used when no specific context is provided. + """ + return orcapod_system_data_context_manager.resolve_context(data_context) + + +default_data_context = DataContext( + "std:v0.1.0:default", + default_registry, + get_default_arrow_hasher(), + get_default_object_hasher(), +) + + +class DataContextManager(dict[str, DataContext]): + def register_context(self, DataContext): + """ + Register a new DataContext instance. + + Args: + DataContext: The DataContext instance to register. + """ + if DataContext.context_key in self: + raise ValueError( + f"DataContext with key {DataContext.context_key} already exists." 
+ ) + self[DataContext.context_key] = DataContext + + def resolve_context(self, context_info: str | DataContext | None) -> DataContext: + if isinstance(context_info, DataContext): + return context_info + if context_info is None: + return default_data_context + if isinstance(context_info, str): + if context_info in self: + return self[context_info] + else: + raise ValueError(f"DataContext with key {context_info} not found.") + + +orcapod_system_data_context_manager = DataContextManager() +orcapod_system_data_context_manager.register_context(default_data_context) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 5bab7ba..0e8df1e 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -22,9 +22,11 @@ from orcapod.types import TypeSpec from orcapod.types.semantic_converter import SemanticConverter from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import schemas from orcapod.types import typespec_utils as tsutils +from orcapod.data.context import ( + DataContext, +) import pyarrow as pa import logging from orcapod.utils import arrow_utils @@ -48,201 +50,6 @@ PythonStore: TypeAlias = Mapping[str, DataValue] -# class SemanticConverter: -# """ -# Converts data between different representations (Python, semantic stores, Arrow tables). - -# SemanticConverter only tracks the semantic columns to be converted and does not -# enforce any type checking on other columns. Consequently, two completely different -# schemas could share a semantic converter if the have same named fields with identical -# semantic types. Furthermore, semantic types are defined by the association of semantic -# type name with a specific TypeHandler. - -# """ - -# @staticmethod -# def prepare_handler( -# semantic_schema: schemas.SemanticSchema, -# semantic_type_registry: SemanticTypeRegistry, -# ) -> dict[str, TypeHandler]: -# """ -# Prepare type handlers for semantic type conversion. - -# Args: -# semantic_schema: Schema containing semantic type information -# semantic_type_registry: Registry for looking up type handlers - -# Returns: -# Dictionary mapping field names to their type handlers -# """ -# handler_lut = {} -# for key, (_, semantic_type) in semantic_schema.items(): -# if semantic_type is None: -# continue # Skip keys without semantic type -# handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( -# semantic_type -# ) -# return handler_lut - -# @classmethod -# def from_typespec( -# cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry -# ) -> "SemanticConverter": -# """ -# Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). - -# Args: -# typespec: Type specification dictionary -# semantic_type_registry: Registry for semantic type lookup - -# Returns: -# New SemanticConverter instance -# """ -# semantic_schema = schemas.from_typespec_to_semantic_schema( -# typespec, semantic_type_registry -# ) -# python_schema = schemas.PythonSchema(typespec) -# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) -# return cls(python_schema, semantic_schema, handler_lut) - -# @classmethod -# def from_arrow_schema( -# cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry -# ) -> "SemanticConverter": -# """ -# Create a SemanticConverter from an Arrow schema. 
- -# Args: -# arrow_schema: PyArrow schema with semantic type metadata -# semantic_type_registry: Registry for semantic type lookup - -# Returns: -# New SemanticConverter instance -# """ -# semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) -# python_schema = schemas.from_semantic_schema_to_python_schema( -# semantic_schema, semantic_type_registry=semantic_type_registry -# ) -# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) -# return cls(python_schema, semantic_schema, handler_lut) - -# def __init__( -# self, -# handler_lut: dict[str, tuple[str, TypeHandler]] | None = None, -# ): -# """ -# Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. -# Use class methods like `from_arrow_schema` or `from_typespec` instead. - -# Args: -# python_schema: Schema for Python data types -# semantic_schema: Schema for semantic types -# handler_lut: Optional dictionary of type handlers for conversion -# """ -# if handler_lut is None: -# handler_lut = {} -# self.handler_lut = handler_lut - -# def convert_from_semantic_to_python( -# self, semantic_value: Any, semantic_type: SemanticType -# ) -> Any: -# """ -# Convert a semantic value to a Python value. - -# Args: -# semantic_value: Value in semantic (storage-optimized) format -# semantic_type: Corresponding semantic type - -# Returns: -# Value in Python native format -# """ -# handler = self.handler_lut.get(semantic_type) -# if handler: -# return handler.to_canonical(semantic_value) -# return semantic_value - -# def from_semantic_store_to_python_store( -# self, semantic_store: SemanticStore -# ) -> dict[str, DataValue]: -# """ -# Convert a semantic store to a Python store. - -# Args: -# semantic_store: Store (dict) with data stored in semantic (storage-optimized) types - -# Returns: -# Store with Python native types -# """ -# python_store = dict(semantic_store) -# for key, handler in self.handler_lut.items(): -# python_store[key] = handler.storage_to_python(semantic_store[key]) -# # TODO: come up with a more robust handling/conversion -# return cast(dict[str, DataValue], python_store) - -# def from_python_store_to_semantic_store( -# self, python_store: PythonStore -# ) -> SemanticStore: -# """ -# Convert a Python store to a semantic store. 
- -# Args: -# python_store: Store with Python native types - -# Returns: -# Store with semantic (storage-optimized) types -# """ -# semantic_store = dict(python_store) -# for key, handler in self.handler_lut.items(): -# semantic_store[key] = handler.python_to_storage(python_store[key]) -# return semantic_store # type: ignore[return-value] - -# def from_semantic_store_to_arrow_table( -# self, semantic_store: SemanticStore -# ) -> pa.Table: -# """Convert a semantic store to an Arrow table.""" -# return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) - -# def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: -# """Convert a Python store to an Arrow table.""" -# semantic_store = self.from_python_store_to_semantic_store(python_store) -# return self.from_semantic_store_to_arrow_table(semantic_store) - -# def from_arrow_table_to_semantic_stores( -# self, arrow_table: pa.Table -# ) -> list[SemanticStore]: -# """Convert an Arrow table to a list of semantic stores.""" -# self.verify_compatible_arrow_schema(arrow_table.schema) -# return arrow_table.to_pylist() # Ensure the table is materialized - -# def from_arrow_table_to_python_stores( -# self, arrow_table: pa.Table -# ) -> list[dict[str, DataValue]]: -# """Convert an Arrow table to a list of Python stores.""" -# return [ -# self.from_semantic_store_to_python_store(semantic_store) -# for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) -# ] - -# def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): -# """ -# Verify that an Arrow schema is compatible with the expected schema. - -# Args: -# arrow_schema: Schema to verify - -# Raises: -# ValueError: If schemas are incompatible -# """ -# compatible, errors = check_arrow_schema_compatibility( -# arrow_schema, self.arrow_schema -# ) -# if not compatible: -# raise ValueError( -# "Arrow table schema is not compatible with the expected schema: " -# + ", ".join(errors) -# ) - - class ImmutableDict(Mapping[str, DataValue]): """ An immutable dictionary-like container for DataValues. 
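The hunks that follow move the datagram classes onto this DataContext machinery. As a rough, illustrative sketch of how resolution and the context column are meant to behave, assuming only the default context from context.py above is registered (the "input_file" key and file path below are made up, not taken from the patch):

    from orcapod.data.context import DataContext
    from orcapod.data.datagrams import DictPacket

    ctx = DataContext.resolve_data_context(None)                    # falls back to the default context
    same = DataContext.resolve_data_context("std:v0.1.0:default")   # lookup by registered key
    assert ctx is same
    ctx.semantic_type_registry   # shared semantic-type registry
    ctx.arrow_hasher             # shared Arrow table hasher
    ctx.object_hasher            # shared object hasher

    packet = DictPacket({"input_file": "/data/sample.txt"})
    packet.data_context_key                              # "std:v0.1.0:default"
    table = packet.as_table(include_data_context=True)   # adds the "_orcapod_context_key" column
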
@@ -299,52 +106,54 @@ def __init__( data: Mapping[str, DataValue], typespec: TypeSpec | None = None, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: - # normalize the data content and remove any source info keys super().__init__(data) + # normalize the data content and remove any source info keys + self._data_context = DataContext.resolve_data_context(data_context) # combine provided typespec info with inferred typespec from content - verified_typespec = {} - if typespec is not None: - verified_typespec = dict(typespec) - # TODO: enhance get_typespec_from_dict to also use info from supplied typespec dict - inferred_typespec = tsutils.get_typespec_from_dict(self) - for key in self: - if key not in verified_typespec: - verified_typespec[key] = inferred_typespec[key] - self._python_schema = schemas.PythonSchema(verified_typespec) + inferred_typespec = tsutils.get_typespec_from_dict(self, typespec) + self._python_schema = schemas.PythonSchema(inferred_typespec) # create semantic converter if semantic_converter is None: semantic_converter = SemanticConverter.from_semantic_schema( self._python_schema.to_semantic_schema( - semantic_type_registry=semantic_type_registry + semantic_type_registry=self._data_context.semantic_type_registry ), ) self.semantic_converter = semantic_converter - self._arrow_hasher = arrow_hasher - self._cached_table: pa.Table | None = None self._cached_content_hash: str | None = None - def as_table( - self, - ) -> pa.Table: + @property + def data_context_key(self) -> str: + """Return the context key of the datagram.""" + return self._data_context.context_key + + def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" if self._cached_table is None: + typespec = self.types() + typespec[DataContext.get_data_context_column()] = str self._cached_table = self.semantic_converter.from_python_to_arrow( - self, self.types() + self.as_dict(include_data_context=True), typespec ) - assert self._cached_table is not None, "Cached table should not be None" - return self._cached_table + assert self._cached_table is not None, "Cached table should not be None" + if include_data_context: + return self._cached_table - def as_dict(self) -> dict[str, DataValue]: + return self._cached_table.drop([DataContext.get_data_context_column()]) + + def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: """Return dictionary representation of the datagram.""" - return dict(self) + data = dict(self) + if include_data_context: + data[DataContext.get_data_context_column()] = self._data_context.context_key + return data def content_hash( self, @@ -356,18 +165,12 @@ def content_hash( Hash string of the datagram content """ if self._cached_content_hash is None: - if self._arrow_hasher is None: - raise ValueError( - "Arrow hasher must be provided to calculate content hash." 
- ) - self._cached_content_hash = self._arrow_hasher.hash_table( - self.as_table(), + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_data_context=False), prefix_hasher_id=True, ) return self._cached_content_hash - # use keys() implementation from dict - def types(self) -> schemas.PythonSchema: """Return copy of the Python schema.""" return self._python_schema.copy() @@ -378,7 +181,7 @@ def _from_copy( data: Mapping[str, DataValue], python_schema: schemas.PythonSchema, semantic_converter: SemanticConverter, - arrow_hasher: hp.ArrowHasher | None, + data_context: DataContext, ) -> Self: """Create a new instance from copy without full initialization.""" instance = cls.__new__(cls) @@ -387,7 +190,7 @@ def _from_copy( # Set attributes directly instance._python_schema = python_schema instance.semantic_converter = semantic_converter - instance._arrow_hasher = arrow_hasher + instance._data_context = data_context instance._cached_table = None instance._cached_content_hash = None @@ -399,7 +202,7 @@ def copy(self) -> Self: self, self._python_schema.copy(), self.semantic_converter, - self._arrow_hasher, + self._data_context, ) @@ -437,8 +240,7 @@ def __init__( source_info: Mapping[str, str | None] | None = None, typespec: TypeSpec | None = None, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: # normalize the data content and remove any source info keys data_only = { @@ -454,8 +256,7 @@ def __init__( data_only, typespec=typespec, semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, + data_context=data_context, ) self._source_info = {**contained_source_info, **(source_info or {})} @@ -463,10 +264,11 @@ def __init__( def as_table( self, + include_data_context: bool = False, include_source: bool = False, ) -> pa.Table: """Convert the packet to an Arrow table.""" - table = super().as_table() + table = super().as_table(include_data_context=include_data_context) if include_source: if self._cached_source_info_table is None: source_info_data = { @@ -488,7 +290,9 @@ def as_table( table = arrow_utils.hstack_tables(table, source_info_table) return table - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: """ Return dictionary representation. @@ -498,7 +302,7 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: Returns: Dictionary representation of the packet """ - dict_copy = dict(self) + dict_copy = super().as_dict(include_data_context=include_data_context) if include_source: for key, value in self.source_info().items(): dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value @@ -524,7 +328,7 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: data, typespec=typespec, semantic_converter=self.semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) # def content_hash2(self) -> str: @@ -543,13 +347,6 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: # use keys() implementation from dict - def types(self) -> schemas.PythonSchema: - """ - Returns: - Packet type information as PythonSchema (dict mapping field names to types). 
- """ - return self._python_schema.copy() - def source_info(self) -> dict[str, str | None]: """ Return source information for all keys. @@ -567,8 +364,9 @@ def copy(self) -> Self: return instance -def prepare_data_and_source_tables( - table: pa.Table, source_info: dict[str, str | None] | None = None +def prepare_system_data_tables( + table: pa.Table, + source_info: dict[str, str | None] | None = None, ) -> tuple[pa.Table, pa.Table]: """ Process a table to ensure proper source_info columns. @@ -602,8 +400,6 @@ def prepare_data_and_source_tables( source_info_columns = [] source_info_column_names = [] - # Add all regular columns first - # Create source_info columns for each regular column num_rows = table.num_rows @@ -664,8 +460,7 @@ def __init__( self, table: pa.Table, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: # normalize the table to ensure it contains proper source columns if len(table) != 1: @@ -674,41 +469,63 @@ def __init__( ) # TODO: add check for compatible types, especially of str being pa.large_string + table, data_context_table = arrow_utils.split_by_column_groups( + table, [DataContext.get_data_context_column()] + ) + self._table = table + if data_context is None and data_context_table is not None: + data_context = data_context_table[ + DataContext.get_data_context_column() + ].to_pylist()[0] + + self._data_context = DataContext.resolve_data_context(data_context) + + schema = pa.schema({DataContext.get_data_context_column(): pa.large_string()}) + self._context_info_table = pa.Table.from_pylist( + [{DataContext.get_data_context_column(): self._data_context.context_key}], + schema=schema, + ) + # create semantic converter # TODO: consider some validation of passed semantic_converter if semantic_converter is None: - if semantic_type_registry is None: - raise ValueError( - "Semantic type registry must be provided if semantic converter is not specified." 
- ) semantic_converter = SemanticConverter.from_semantic_schema( schemas.SemanticSchema.from_arrow_schema( self._table.schema, - semantic_type_registry, + self._data_context.semantic_type_registry, ) ) self._semantic_converter = semantic_converter - self._arrow_hasher = arrow_hasher self._cached_python_schema: schemas.PythonSchema | None = None self._cached_python_dict: dict[str, DataValue] | None = None self._cached_content_hash: str | None = None - def as_table( - self, - ) -> pa.Table: + @property + def data_context_key(self) -> str: + """Return the context key of the datagram.""" + return self._data_context.context_key + + def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" + if include_data_context: + return arrow_utils.hstack_tables(self._table, self._context_info_table) return self._table - def as_dict(self) -> dict[str, DataValue]: + def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: """Return dictionary representation of the datagram.""" if self._cached_python_dict is None: self._cached_python_dict = self._semantic_converter.from_arrow_to_python( - self._table + self.as_table(include_data_context=False) )[0] assert self._cached_python_dict is not None, "Cached dict should not be None" - return dict(self._cached_python_dict) + output = dict(self._cached_python_dict) + if include_data_context: + output[DataContext.get_data_context_column()] = ( + self._data_context.context_key + ) + return output def content_hash( self, @@ -720,12 +537,8 @@ def content_hash( Hash string of the datagram content """ if self._cached_content_hash is None: - if self._arrow_hasher is None: - raise ValueError( - "Arrow hasher must be provided to calculate content hash." - ) - self._cached_content_hash = self._arrow_hasher.hash_table( - self.as_table(), + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_data_context=False), prefix_hasher_id=True, ) return self._cached_content_hash @@ -747,14 +560,13 @@ def _from_copy( table: pa.Table, python_schema: schemas.PythonSchema, semantic_converter: SemanticConverter, - hash_keys: tuple[str, ...], arrow_hasher: hp.ArrowHasher, ) -> Self: """Create a new instance from copy without full initialization.""" instance = cls.__new__(cls) instance._table = table instance._semantic_converter = semantic_converter - instance._arrow_hasher = arrow_hasher + instance._data_context = arrow_hasher # Set attributes directly instance._cached_content_hash = None @@ -766,7 +578,7 @@ def copy(self) -> Self: new_datagram = self.__class__( self._table, semantic_converter=self._semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) new_datagram._cached_python_schema = self._cached_python_schema new_datagram._cached_python_dict = self._cached_python_dict @@ -798,8 +610,7 @@ def __init__( self, table: pa.Table, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: if len(table) != 1: raise ValueError( @@ -809,8 +620,7 @@ def __init__( super().__init__( table=table, semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, + data_context=data_context, ) @@ -843,10 +653,9 @@ def __init__( self, data: pa.Table, source_info: dict[str, str | None] | None = None, - semantic_converter: SemanticConverter | 
None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, skip_source_info_extraction: bool = False, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, ) -> None: if len(data) != 1: raise ValueError( @@ -858,10 +667,10 @@ def __init__( if not skip_source_info_extraction: # normalize the table to ensure it has the expected source_info columns - data_table, self._source_info_table = prepare_data_and_source_tables( + data_table, self._source_info_table = prepare_system_data_tables( data, source_info ) - else: + else: # assume that data already contains source info columns with appropriate prefixes data_columns: tuple[str, ...] = tuple( [c for c in data.column_names if not c.startswith(SOURCE_INFO_PREFIX)] ) @@ -873,8 +682,7 @@ def __init__( super().__init__( data_table, semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, + data_context=data_context, ) self._cached_source_info: dict[str, str | None] | None = None @@ -883,9 +691,10 @@ def __init__( def as_table( self, + include_data_context: bool = False, include_source: bool = False, ) -> pa.Table: - table = super().as_table() + table = super().as_table(include_data_context=include_data_context) if include_source: # add source_info only for existing data columns table = arrow_utils.hstack_tables( @@ -896,7 +705,9 @@ def as_table( ) return table - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: """ Convert to dictionary representation. @@ -906,7 +717,7 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: Returns: Dictionary representation of the packet """ - return_dict = super().as_dict() + return_dict = super().as_dict(include_data_context=include_data_context) if include_source: return_dict.update( {f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items()} @@ -918,7 +729,7 @@ def as_datagram(self, include_source: bool = False) -> ArrowDatagram: return ArrowDatagram( table, semantic_converter=self._semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) def source_info(self) -> dict[str, str | None]: @@ -941,7 +752,7 @@ def copy(self) -> Self: self.as_table(), self.source_info(), semantic_converter=self._semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, skip_source_info_extraction=True, ) new_packet._cached_source_info = self._cached_source_info diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index e876916..f77f7e1 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -4,6 +4,7 @@ import logging from orcapod.data.streams import KernelStream from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.data.context import DataContext from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER from orcapod.types import TypeSpec @@ -29,16 +30,32 @@ def __init__( self, fixed_input_streams: tuple[dp.Stream, ...] 
| None = None, label: str | None = None, + data_context: str | DataContext | None = None, skip_tracking: bool = False, tracker_manager: dp.TrackerManager | None = None, **kwargs, ) -> None: super().__init__(**kwargs) self._label = label + + self._data_context = DataContext.resolve_data_context(data_context) + self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self.fixed_input_streams = fixed_input_streams + @property + def data_context_key(self) -> str: + return self._data_context.context_key + + @property + def data_context(self) -> DataContext: + return self._data_context + + @property + @abstractmethod + def kernel_id(self) -> tuple[str, ...]: ... + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index a276e23..b1b3d1b 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -245,6 +245,14 @@ def __repr__(self) -> str: class Join(NonZeroInputOperator): + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Returns a unique identifier for the kernel. + This is used to identify the kernel in the computational graph. + """ + return (f"{self.__class__.__name__}",) + def op_identity_structure(self, *streams: dp.Stream) -> Any: # Join does not depend on the order of the streams -- convert it onto a set id_struct = (self.__class__.__name__,) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index e6c2c96..a625a3a 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -4,19 +4,21 @@ from collections.abc import Callable, Collection, Iterable, Sequence from typing import Any, Literal, cast -from orcapod.data.datagrams import DictPacket, DictTag +from orcapod.data.datagrams import ( + DictPacket, + ArrowPacket, +) +from orcapod.data.context import DataContext from orcapod.data.kernels import TrackedKernelBase from orcapod.data.operators import Join from orcapod.data.streams import PodStream -from orcapod.hashing import get_default_arrow_hasher from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore -from orcapod.types import TypeSpec, default_registry +from orcapod.types import TypeSpec from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter -from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import typespec_utils as tsutils logger = logging.getLogger(__name__) @@ -213,12 +215,11 @@ def __init__( input_typespec: TypeSpec | None = None, output_typespec: TypeSpec | Sequence[type] | None = None, label: str | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, function_info_extractor: hp.FunctionInfoExtractor | None = None, **kwargs, ) -> None: self.function = function + if output_keys is None: output_keys = [] if isinstance(output_keys, str): @@ -243,14 +244,17 @@ def __init__( ) self._input_packet_schema = PythonSchema(input_packet_types) self._output_packet_schema = PythonSchema(output_packet_types) - - semantic_type_registry = semantic_type_registry or default_registry self._output_semantic_converter = SemanticConverter.from_semantic_schema( - 
self._output_packet_schema.to_semantic_schema(semantic_type_registry) + self._output_packet_schema.to_semantic_schema( + semantic_type_registry=self.data_context.semantic_type_registry + ) ) - self.arrow_hasher = arrow_hasher or get_default_arrow_hasher() - self.function_info_extractor = function_info_extractor + self._function_info_extractor = function_info_extractor + + @property + def kernel_id(self) -> tuple[str, ...]: + return (self.function_name,) def input_packet_types(self) -> PythonSchema: """ @@ -311,7 +315,7 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non {k: v for k, v in zip(self.output_keys, output_values)}, typespec=self.output_packet_types(), semantic_converter=self._output_semantic_converter, - arrow_hasher=self.arrow_hasher, + data_context=self._data_context, ) return tag, output_packet @@ -319,8 +323,8 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # construct identity structure for the function # if function_info_extractor is available, use that but substitute the function_name - if self.function_info_extractor is not None: - function_info = self.function_info_extractor.extract_function_info( + if self._function_info_extractor is not None: + function_info = self._function_info_extractor.extract_function_info( self.function, function_name=self.function_name, input_typespec=self.input_packet_types(), @@ -357,7 +361,7 @@ class WrappedPod(ActivatablePodBase): def __init__( self, - pod: dp.Pod, + pod: FunctionPod, fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, **kwargs, @@ -365,6 +369,30 @@ def __init__( super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self.pod = pod + @property + def data_context_key(self) -> str: + """ + Return the data context for the wrapped pod. + This is used to resolve semantic types and other context-specific information. + """ + return self.pod.data_context_key + + @property + def data_context(self) -> DataContext: + """ + Return the data context for the wrapped pod. + This is used to resolve semantic types and other context-specific information. + """ + return self.pod.data_context + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Return the pod ID, which is the function name of the wrapped pod. + This is used to identify the pod in the system. + """ + return self.pod.kernel_id + def computed_label(self) -> str | None: return self.pod.label @@ -403,10 +431,11 @@ class CachedPod(WrappedPod): def __init__( self, - pod: dp.Pod, + pod: FunctionPod, result_store: ArrowDataStore, lineage_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] = (), + data_context: str | DataContext | None = None, **kwargs, ): super().__init__(pod, **kwargs) @@ -414,6 +443,72 @@ def __init__( self.result_store = result_store self.lineage_store = lineage_store - def call( - self, tag: dp.Tag, packet: dp.Packet - ) -> tuple[DictTag, DictPacket | None]: ... + self.pod_hash = self.data_context.object_hasher.hash_to_hex( + self.pod, prefix_hasher_id=True + ) + + @property + def pod_id(self) -> tuple[str, ...]: + """ + Return the pod ID, which is the function name of the wrapped pod. + This is used to identify the pod in the system. + """ + return self.pod.kernel_id + (self.pod_hash,) + + @property + def record_path(self) -> tuple[str, ...]: + """ + Return the path to the record in the result store. + This is used to store the results of the pod. 
+ """ + return self.record_path_prefix + self.pod_id + + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: + output_packet = self.get_recorded_output_packet(packet) + if output_packet is not None: + return tag, output_packet + output_tag, output_packet = self.pod.call(tag, packet) + if output_packet is not None: + self.record_packet(packet, output_packet) + return output_tag, output_packet + + def record_packet( + self, + input_packet: dp.Packet, + output_packet: dp.Packet, + ignore_duplicates: bool = False, + ) -> dp.Packet: + """ + Record the output packet against the input packet in the result store. + """ + result_flag = self.result_store.record_data( + self.record_path, + input_packet.content_hash(), + output_packet.as_table(include_source=True), + ignore_duplicates=ignore_duplicates, + ) + if result_flag is None: + # TODO: do more specific error handling + raise ValueError( + f"Failed to record packet {input_packet} in result store {self.result_store}" + ) + # TODO: make store return retrieved table + return output_packet + + def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | None: + """ + Retrieve the output packet from the result store based on the input packet. + If the output packet is not found, return None. + """ + result_table = self.result_store.get_recorded_data( + self.record_path, input_packet.content_hash() + ) + if result_table is None: + return None + + return ArrowPacket( + result_table, + semantic_converter=self.pod._output_semantic_converter, + data_context=self.data_context, + skip_source_info_extraction=True, + ) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index bce6585..ebe0249 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,9 +1,15 @@ -from orcapod.hashing.types import ArrowHasher from orcapod.protocols import data_protocols as dp -from orcapod.types import schemas, TypeSpec -from orcapod.types.semantic_types import SemanticTypeRegistry -from orcapod.data.datagrams import ArrowPacket, ArrowTag, DictTag, SemanticConverter +from orcapod.data.context import DataContext +from orcapod.data.datagrams import ( + ArrowPacket, + ArrowTag, + DictTag, + SemanticConverter, + SOURCE_INFO_PREFIX, +) +from orcapod.utils import arrow_utils from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.types import TypeSpec, schemas import pyarrow as pa from collections.abc import Iterator, Collection from abc import ABC, abstractmethod @@ -32,6 +38,7 @@ def __init__( self, source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), + data_context: str | DataContext | None = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -39,6 +46,15 @@ def __init__( self._upstreams = upstreams self._last_modified: datetime | None = None self._set_modified_time() + self._data_context = DataContext.resolve_data_context(data_context) + + @property + def data_context(self) -> DataContext: + """ + Returns the data context for the stream. + This is used to resolve semantic types and other context-specific information. + """ + return self._data_context @property def source(self) -> dp.Kernel | None: @@ -121,7 +137,12 @@ def iter_packets( ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... @abstractmethod - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: ... + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: ... 
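For reference, a sketch of how the widened as_table() surface is intended to be used on a stream (the `stream` variable and the explicit hash column name here are illustrative, not defined by the patch):

    table = stream.as_table(
        include_data_context=True,              # append the data-context key column
        include_source=True,                    # append the prefixed source-info columns
        include_content_hash="_content_hash",   # append a per-packet content-hash column
    )
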
def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ @@ -255,12 +276,21 @@ def last_modified(self) -> datetime | None: return None return self._cached_stream.last_modified - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) - return self._cached_stream.as_table(include_content_hash=include_content_hash) + return self._cached_stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() @@ -288,16 +318,32 @@ class ImmutableTableStream(StreamBase): def __init__( self, table: pa.Table, + source_info: dict[str, str | None] | None = None, tag_columns: Collection[str] = (), source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: ArrowHasher | None = None, **kwargs, ) -> None: super().__init__(source=source, upstreams=upstreams, **kwargs) + table, data_context_table = arrow_utils.split_by_column_groups( + table, [DataContext.get_data_context_column()] + ) + if data_context_table is None: + data_context_table = pa.table( + { + DataContext.get_data_context_column(): pa.nulls( + len(table), pa.large_string() + ) + } + ) + + prefix_info = {SOURCE_INFO_PREFIX: source_info} + + table, prefix_tables = arrow_utils.prepare_prefixed_columns(table, prefix_info) self._table = table + self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] + self._data_context_table = data_context_table self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) self._packet_columns = tuple( @@ -318,16 +364,16 @@ def __init__( self._tag_schema = tag_schema self._packet_schema = packet_schema self._tag_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema(tag_schema, semantic_type_registry) + schemas.SemanticSchema.from_arrow_schema( + tag_schema, self._data_context.semantic_type_registry + ) ) self._packet_converter = SemanticConverter.from_semantic_schema( schemas.SemanticSchema.from_arrow_schema( - packet_schema, semantic_type_registry + packet_schema, self._data_context.semantic_type_registry ) ) - self._arrow_hasher = arrow_hasher - self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None self._set_modified_time() # set modified time to now @@ -353,21 +399,35 @@ def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: ), ) - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: """ Returns the underlying table representation of the stream. This is useful for converting the stream to a table format. 
""" - if not include_content_hash: - return self._table - hash_column_name = ( - "_content_hash" if include_content_hash is True else include_content_hash - ) - content_hashes = [packet.content_hash() for _, packet in self.iter_packets()] - table_with_hash = self._table.append_column( - hash_column_name, pa.array(content_hashes, type=pa.large_string()) - ) - return table_with_hash + output_table = self._table + if include_content_hash: + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + content_hashes = [ + packet.content_hash() for _, packet in self.iter_packets() + ] + output_table = output_table.append_column( + hash_column_name, pa.array(content_hashes, type=pa.large_string()) + ) + table_stack = (output_table,) + if include_data_context: + table_stack += (self._data_context_table,) + if include_source: + table_stack += (self._source_info_table,) + return arrow_utils.hstack_tables(*table_stack) def clear_cache(self) -> None: """ @@ -400,7 +460,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: tag = ArrowTag( tag_batch.slice(i, 1), # type: ignore semantic_converter=self._tag_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) else: @@ -411,7 +471,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: ArrowPacket( packet_batch.slice(i, 1), semantic_converter=self._packet_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ), ) ) @@ -459,6 +519,7 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ + tag_keys, _ = self.input_stream.keys() packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys @@ -493,7 +554,12 @@ def invalidate(self) -> None: self.clear_cache() self._set_modified_time(invalidate=True) - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: # TODO: note that this is likely NOT multi-thread safe self.refresh() if self._cached_output_table is None: @@ -502,7 +568,9 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: for tag, packet in self.iter_packets(): # TODO: evaluate handling efficiency here all_tags.append(tag.as_dict()) - all_packets.append(packet.as_dict(include_source=True)) + all_packets.append( + packet.as_dict(include_data_context=True, include_source=True) + ) all_tags: pa.Table = pa.Table.from_pylist(all_tags) all_packets: pa.Table = pa.Table.from_pylist(all_packets) @@ -518,6 +586,17 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: all_tags.columns + all_packets.columns, names=all_tags.column_names + all_packets.column_names, ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." 
+ ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{SOURCE_INFO_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(DataContext.get_data_context_column()) + + output_table = self._cached_output_table.drop(drop_columns) # lazily prepare content hash column if requested if include_content_hash: @@ -528,18 +607,18 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: self._cached_content_hash_column = pa.array( content_hashes, type=pa.large_string() ) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." ) hash_column_name = ( "_content_hash" if include_content_hash is True else include_content_hash ) - return self._cached_output_table.append_column( + output_table = output_table.append_column( hash_column_name, self._cached_content_hash_column ) - return self._cached_output_table + return output_table def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 767ea0e..2e0e927 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -18,6 +18,27 @@ class Datagram(Protocol): enabling type checking and validation throughout the computational graph. """ + @property + def data_context_key(self) -> str: + """ + Return the data context key for this datagram. + + This key identifies the semantic type registry, arrow hasher, and other + contextual information needed to properly interpret and work with this + datagram across various operations (storage, visualization, processing, etc.). + + Context key formats: + - Standard contexts: "std:v1.2.3:fingerprint" + - Custom contexts: "custom:user_provided_id" + + Concrete implementation can make use of this context key to ensure necessary background + informaton / object is available for correct processing of the datagram. + + Returns: + str: Context key for proper datagram interpretation + """ + ... + def types(self) -> TypeSpec: """ Return the type specification for this datagram. @@ -42,7 +63,7 @@ def keys(self) -> Collection[str]: """ ... - def as_table(self) -> pa.Table: + def as_table(self, include_data_context: bool = False) -> pa.Table: """ Convert to PyArrow Table format. @@ -54,7 +75,7 @@ def as_table(self) -> pa.Table: """ ... - def as_dict(self) -> dict[str, DataValue]: + def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: """ Convert to dictionary format. @@ -123,7 +144,9 @@ class Packet(Datagram, Protocol): data flow: Tags provide context, Packets provide content. """ - def as_table(self, include_source: bool = False) -> pa.Table: + def as_table( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Table: """ Convert the packet to a PyArrow Table. @@ -136,7 +159,9 @@ def as_table(self, include_source: bool = False) -> pa.Table: """ ... - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: """ Convert the packet to a dictionary. @@ -395,7 +420,12 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: """ ... 
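# --- Editor's illustrative sketch (not part of the patch) ---
# The Datagram.data_context_key docstring above documents two key formats,
# "std:v1.2.3:fingerprint" and "custom:user_provided_id". A tiny hypothetical helper
# showing how such keys could be dissected; the field names ("kind", "version",
# "fingerprint") are assumptions for this sketch, not part of the protocol.
def parse_context_key(key: str) -> dict[str, str]:
    parts = key.split(":")
    if parts[0] == "std" and len(parts) == 3:
        return {"kind": "std", "version": parts[1], "fingerprint": parts[2]}
    if parts[0] == "custom" and len(parts) == 2:
        return {"kind": "custom", "id": parts[1]}
    raise ValueError(f"Unrecognized data context key: {key!r}")

print(parse_context_key("std:v1.2.3:fingerprint"))   # {'kind': 'std', ...}
print(parse_context_key("custom:user_provided_id"))  # {'kind': 'custom', ...}
# --- end editor's sketch ---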
- def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: """ Convert the entire stream to a PyArrow Table. @@ -509,6 +539,34 @@ class Kernel(ContentIdentifiable, Labelable, Protocol): full tracking) and testing/debugging (without side effects). """ + @property + def data_context_key(self) -> str: + """ + Return the data context key for this kernel. + + This key identifies the semantic type registry, arrow hasher, and other + contextual information needed to properly interpret and work with this + kernel across various operations (storage, visualization, processing, etc.). + + Returns: + str: Context key for proper kernel interpretation + """ + ... + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Return a unique identifier for this Pod. + + The pod_id is used for caching and tracking purposes. It should + uniquely identify the Pod's computational logic, parameters, and + any relevant metadata that affects its behavior. + + Returns: + tuple[str, ...]: Unique identifier for this Pod + """ + ... + def __call__( self, *streams: Stream, label: str | None = None, **kwargs ) -> LiveStream: diff --git a/src/orcapod/types/arrow_utils.py b/src/orcapod/types/arrow_utils.py index c446901..34a06a3 100644 --- a/src/orcapod/types/arrow_utils.py +++ b/src/orcapod/types/arrow_utils.py @@ -1,10 +1,123 @@ -import pyarrow as pa +# from collections.abc import Mapping, Collection +# import pyarrow as pa +# from typing import Any -def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: - """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, - no field names should collide.""" - merged_fields = [] - for schema in schemas: - merged_fields.extend(schema) - return pa.schema(merged_fields) +# def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: +# """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, +# no field names should collide.""" +# merged_fields = [] +# for schema in schemas: +# merged_fields.extend(schema) +# return pa.schema(merged_fields) + + +# def split_by_column_groups( +# self, *column_groups: Collection[str] +# ) -> tuple[pa.Table | None]: +# """ +# Split the table into multiple tables based on the provided column groups. +# Each group is a collection of column names that should be included in the same table. +# The remaining columns that are not part of any group will be returned as the first table/None. 
+# """ +# if not column_groups: +# return (self,) + +# tables = [] +# remaining_columns = set(self.column_names) + +# for group in column_groups: +# group_columns = [col for col in group if col in remaining_columns] +# if group_columns: +# tables.append(self.select(group_columns)) +# remaining_columns.difference_update(group_columns) +# else: +# tables.append(None) + +# remaining_table = None +# if remaining_columns: +# orderd_remaining_columns = self.column_names +# remaining_columns = [ +# col for col in orderd_remaining_columns if col in remaining_columns +# ] +# remaining_table = self.select(orderd_remaining_columns) +# return (remaining_table, *tables) + + +# def prepare_prefixed_columns( +# table: pa.Table, +# prefix_group: Collection[str] | Mapping[str, Any | None], +# ) -> tuple[pa.Table, pa.Table]: +# """ """ +# if isinstance(prefix_group, Mapping): +# prefix_group = {k: v if v is not None else {} for k, v in prefix_group.items()} +# elif isinstance(prefix_group, Collection): +# prefix_group = {name: {} for name in prefix_group} +# else: +# raise TypeError( +# "prefix_group must be a Collection of strings or a Mapping of string to string or None." +# ) + +# # Visit each prefix group and split them into separate tables +# member_columns = {} + +# for col_name in table.column_names: +# for prefix in prefix_group: +# if col_name.startswith(prefix): +# # Remove the prefix from the column name +# base_name = col_name.removeprefix(prefix) +# if base_name not in member_columns: +# member_columns[base_name] = [] +# member_columns[base_name].append(table.column(col_name)) + +# data_columns = [] +# data_column_names = [] +# existing_source_info = {} + +# for i, name in enumerate(table.column_names): +# if name.startswith(SOURCE_INFO_PREFIX): +# # Extract the base column name +# base_name = name.removeprefix(SOURCE_INFO_PREFIX) +# existing_source_info[base_name] = table.column(i) +# else: +# data_columns.append(table.column(i)) +# data_column_names.append(name) + +# # Step 2: Create source_info columns for each regular column +# source_info_columns = [] +# source_info_column_names = [] + +# # Create source_info columns for each regular column +# num_rows = table.num_rows + +# for col_name in data_column_names: +# source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" + +# # if col_name is in source_info, use that value +# if col_name in source_info: +# # Use value from source_info dictionary +# source_value = source_info[col_name] +# source_values = pa.array([source_value] * num_rows, type=pa.large_string()) +# # if col_name is in existing_source_info, use that column +# elif col_name in existing_source_info: +# # Use existing source_info column, but convert to large_string +# existing_col = existing_source_info[col_name] +# if existing_col.type == pa.large_string(): +# source_values = existing_col +# else: +# # Convert to large_string +# source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + +# else: +# # Use null values +# source_values = pa.array([None] * num_rows, type=pa.large_string()) + +# source_info_columns.append(source_values) +# source_info_column_names.append(source_info_col_name) + +# # Step 3: Create the final table +# data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) +# source_info_table: pa.Table = pa.Table.from_arrays( +# source_info_columns, names=source_info_column_names +# ) +# return data_table, source_info_table diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py 
index 8dc0df1..118b110 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -2,7 +2,8 @@ from orcapod.types.schemas import PythonSchema, SemanticSchema from orcapod.types import typespec_utils as tsutils -from typing import Any, Mapping, Self +from typing import Any, Self +from collections.abc import Mapping import pyarrow as pa import logging diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index 940820f..9f66654 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -214,12 +214,15 @@ def extract_function_typespecs( return param_info, inferred_output_types -def get_typespec_from_dict(dict: Mapping) -> TypeSpec: +def get_typespec_from_dict(data: Mapping, typespec: TypeSpec | None = None) -> TypeSpec: """ Returns a TypeSpec for the given dictionary. - The TypeSpec is a mapping from field name to Python type. + The TypeSpec is a mapping from field name to Python type. If typespec is provided, then + it is used as a base when inferring types for the fields in dict """ - return {key: type(value) for key, value in dict.items()} + if typespec is None: + typespec = {} + return {key: typespec.get(key, type(value)) for key, value in data.items()} def get_compatible_type(type1: Any, type2: Any) -> Any: diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 5a072de..f9a6d7f 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -1,6 +1,10 @@ # TODO: move this to a separate module +from collections import defaultdict +from matplotlib.pylab import f import pyarrow as pa +from collections.abc import Mapping, Collection +from typing import Any def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: @@ -124,3 +128,123 @@ def check_arrow_schema_compatibility( errors.append(f"Unexpected field '{field_name}' in incoming schema") return len(errors) == 0, errors + + +def split_by_column_groups( + table, + *column_groups: Collection[str], +) -> tuple[pa.Table | None, ...]: + """ + Split the table into multiple tables based on the provided column groups. + Each group is a collection of column names that should be included in the same table. + The remaining columns that are not part of any group will be returned as the first table/None. 
+ """ + if not column_groups: + return (table,) + + tables = [] + remaining_columns = set(table.column_names) + + for group in column_groups: + group_columns = [col for col in group if col in remaining_columns] + if group_columns: + tables.append(table.select(group_columns)) + remaining_columns.difference_update(group_columns) + else: + tables.append(None) + + remaining_table = None + if remaining_columns: + ordered_remaining_columns = [ + col for col in table.column_names if col in remaining_columns + ] + remaining_table = table.select(ordered_remaining_columns) + return (remaining_table, *tables) + + +def prepare_prefixed_columns( + table: pa.Table, + prefix_info: Collection[str] + | Mapping[str, Any | None] + | Mapping[str, Mapping[str, Any | None]], +) -> tuple[pa.Table, dict[str, pa.Table]]: + """ """ + all_prefix_info = {} + if isinstance(prefix_info, Mapping): + for prefix, info in prefix_info.items(): + if isinstance(info, Mapping): + all_prefix_info[prefix] = info + else: + all_prefix_info[prefix] = info + elif isinstance(prefix_info, Collection): + for prefix in prefix_info: + all_prefix_info[prefix] = {} + else: + raise TypeError( + "prefix_group must be a Collection of strings or a Mapping of string to string or None." + ) + + # split column into prefix groups + data_column_names = [] + data_columns = [] + existing_prefixed_columns = defaultdict(list) + + for col_name in table.column_names: + prefix_found = False + for prefix in all_prefix_info: + if col_name.startswith(prefix): + # Remove the prefix from the column name + base_name = col_name.removeprefix(prefix) + existing_prefixed_columns[prefix].append(base_name) + prefix_found = True + if not prefix_found: + # if no prefix found, consider this as a data column + data_column_names.append(col_name) + data_columns.append(table[col_name]) + + # Create source_info columns for each regular column + num_rows = table.num_rows + + prefixed_column_names = defaultdict(list) + prefixed_columns = defaultdict(list) + + for prefix, value_lut in all_prefix_info.items(): + target_prefixed_column_names = prefixed_column_names[prefix] + target_prefixed_columns = prefixed_columns[prefix] + + for col_name in data_column_names: + prefixed_col_name = f"{prefix}{col_name}" + existing_columns = existing_prefixed_columns[prefix] + + if isinstance(value_lut, Mapping): + value = value_lut.get(col_name) + else: + value = value_lut + + if value is not None: + # Use value from source_info dictionary + column_values = pa.array([value] * num_rows, type=pa.large_string()) + # if col_name is in existing_source_info, use that column + elif col_name in existing_columns: + # Use existing source_info column, but convert to large_string + existing_col = table[prefixed_col_name] + + if existing_col.type == pa.string(): + # Convert to large_string + column_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + else: + column_values = existing_col + else: + # Use null values + column_values = pa.array([None] * num_rows, type=pa.large_string()) + target_prefixed_column_names.append(prefixed_col_name) + target_prefixed_columns.append(column_values) + + # Step 3: Create the final table + data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) + result_tables = {} + for prefix in all_prefix_info: + result_tables[prefix] = pa.Table.from_arrays( + prefixed_columns[prefix], names=prefixed_column_names[prefix] + ) + return data_table, result_tables From 4c710dbf904f9f64e67b6ac6a04c54b64db40cac Mon Sep 17 00:00:00 2001 From: 
"Edgar Y. Walker" Date: Sat, 19 Jul 2025 10:50:18 +0000 Subject: [PATCH 077/224] refactor: clean up protocol around types --- src/orcapod/data/datagrams.py | 337 ++++++++++++++--------- src/orcapod/data/kernels.py | 4 - src/orcapod/data/pods.py | 45 ++- src/orcapod/data/streams.py | 43 +-- src/orcapod/protocols/data_protocols.py | 59 +++- src/orcapod/protocols/store_protocols.py | 2 +- src/orcapod/stores/__init__.py | 26 +- src/orcapod/stores/delta_lake_stores.py | 14 +- src/orcapod/utils/arrow_utils.py | 19 +- 9 files changed, 340 insertions(+), 209 deletions(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 0e8df1e..b506a56 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -127,6 +127,7 @@ def __init__( self._cached_table: pa.Table | None = None self._cached_content_hash: str | None = None + self._cached_arrow_schema: pa.Schema | None = None @property def data_context_key(self) -> str: @@ -137,10 +138,9 @@ def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" if self._cached_table is None: - typespec = self.types() - typespec[DataContext.get_data_context_column()] = str self._cached_table = self.semantic_converter.from_python_to_arrow( - self.as_dict(include_data_context=True), typespec + self.as_dict(include_data_context=True), + self.types(include_data_context=True), ) assert self._cached_table is not None, "Cached table should not be None" if include_data_context: @@ -171,9 +171,35 @@ def content_hash( ) return self._cached_content_hash - def types(self) -> schemas.PythonSchema: + def types(self, include_data_context: bool = False) -> schemas.PythonSchema: """Return copy of the Python schema.""" - return self._python_schema.copy() + schema = self._python_schema.copy() + if include_data_context: + schema[DataContext.get_data_context_column()] = str + return schema + + def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. 
+ + Args: + include_data_context: Whether to include data context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + if self._cached_arrow_schema is None: + self._cached_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self.types(include_data_context=True) + ) + ) + if not include_data_context: + return arrow_utils.drop_schema_columns( + self._cached_arrow_schema, + [DataContext.get_data_context_column()], + ) + return self._cached_arrow_schema @classmethod def _from_copy( @@ -261,6 +287,15 @@ def __init__( self._source_info = {**contained_source_info, **(source_info or {})} self._cached_source_info_table: pa.Table | None = None + self._cached_source_info_schema: pa.Schema | None = None + + @property + def _source_info_schema(self) -> pa.Schema: + if self._cached_source_info_schema is None: + self._cached_source_info_schema = pa.schema( + {f"{SOURCE_INFO_PREFIX}{k}": pa.large_string() for k in self.keys()} + ) + return self._cached_source_info_schema def as_table( self, @@ -274,18 +309,19 @@ def as_table( source_info_data = { f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items() } - source_info_schema = pa.schema( - {k: pa.large_string() for k in source_info_data} - ) self._cached_source_info_table = pa.Table.from_pylist( - [source_info_data], schema=source_info_schema + [source_info_data], schema=self._source_info_schema ) assert self._cached_source_info_table is not None, ( "Cached source info table should not be None" ) # subselect the corresponding _source_info as the columns present in the data table source_info_table = self._cached_source_info_table.select( - [f"{SOURCE_INFO_PREFIX}{k}" for k in table.column_names] + [ + f"{SOURCE_INFO_PREFIX}{k}" + for k in table.column_names + if k in self.keys() + ] ) table = arrow_utils.hstack_tables(table, source_info_table) return table @@ -308,6 +344,34 @@ def as_dict( dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value return dict_copy + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types(include_data_context=include_data_context) + if include_source: + for key in self.keys(): + schema[f"{SOURCE_INFO_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema(include_data_context=include_data_context) + if include_source: + return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) + return schema + def as_datagram(self, include_source: bool = False) -> DictDatagram: """ Convert the packet to a DictDatagram. 
@@ -319,11 +383,7 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: DictDatagram representation of the packet """ data = self.as_dict(include_source=include_source) - typespec = self.types() - # append source info to typespec if requested - if include_source: - for key in self.keys(): - typespec[f"{SOURCE_INFO_PREFIX}{key}"] = str + typespec = self.types(include_source=include_source) return DictDatagram( data, typespec=typespec, @@ -331,22 +391,6 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: data_context=self._data_context, ) - # def content_hash2(self) -> str: - # """ - # Calculate content hash excluding source information. - - # Returns: - # Hash string of the packet content - # """ - # # TODO: check if this is identical to DictDatagram.content_hash - # if self._cached_content_hash is None: - # self._cached_content_hash = self._arrow_hasher.hash_table( - # self.as_table(include_source=False), prefix_hasher_id=True - # ) - # return self._cached_content_hash - - # use keys() implementation from dict - def source_info(self) -> dict[str, str | None]: """ Return source information for all keys. @@ -364,76 +408,76 @@ def copy(self) -> Self: return instance -def prepare_system_data_tables( - table: pa.Table, - source_info: dict[str, str | None] | None = None, -) -> tuple[pa.Table, pa.Table]: - """ - Process a table to ensure proper source_info columns. - - Args: - table: Input PyArrow table - source_info: optional dictionary mapping column names to source info values. If present, - it will take precedence over existing source_info columns in the table. - - Returns: - tuple of table without any source info and another table only containing source info columns (with prefix) - """ - if source_info is None: - source_info = {} - - # Step 1: Separate source_info columns from regular columns - data_columns = [] - data_column_names = [] - existing_source_info = {} - - for i, name in enumerate(table.column_names): - if name.startswith(SOURCE_INFO_PREFIX): - # Extract the base column name - base_name = name.removeprefix(SOURCE_INFO_PREFIX) - existing_source_info[base_name] = table.column(i) - else: - data_columns.append(table.column(i)) - data_column_names.append(name) - - # Step 2: Create source_info columns for each regular column - source_info_columns = [] - source_info_column_names = [] - - # Create source_info columns for each regular column - num_rows = table.num_rows - - for col_name in data_column_names: - source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" - - # if col_name is in source_info, use that value - if col_name in source_info: - # Use value from source_info dictionary - source_value = source_info[col_name] - source_values = pa.array([source_value] * num_rows, type=pa.large_string()) - # if col_name is in existing_source_info, use that column - elif col_name in existing_source_info: - # Use existing source_info column, but convert to large_string - existing_col = existing_source_info[col_name] - if existing_col.type == pa.large_string(): - source_values = existing_col - else: - # Convert to large_string - source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore - - else: - # Use null values - source_values = pa.array([None] * num_rows, type=pa.large_string()) - - source_info_columns.append(source_values) - source_info_column_names.append(source_info_col_name) - - # Step 3: Create the final table - data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) - source_info_table: pa.Table = 
pa.Table.from_arrays( - source_info_columns, names=source_info_column_names - ) - return data_table, source_info_table +# def prepare_system_data_tables( +# table: pa.Table, +# source_info: dict[str, str | None] | None = None, +# ) -> tuple[pa.Table, pa.Table]: +# """ +# Process a table to ensure proper source_info columns. + +# Args: +# table: Input PyArrow table +# source_info: optional dictionary mapping column names to source info values. If present, +# it will take precedence over existing source_info columns in the table. + +# Returns: +# tuple of table without any source info and another table only containing source info columns (with prefix) +# """ +# if source_info is None: +# source_info = {} + +# # Step 1: Separate source_info columns from regular columns +# data_columns = [] +# data_column_names = [] +# existing_source_info = {} + +# for i, name in enumerate(table.column_names): +# if name.startswith(SOURCE_INFO_PREFIX): +# # Extract the base column name +# base_name = name.removeprefix(SOURCE_INFO_PREFIX) +# existing_source_info[base_name] = table.column(i) +# else: +# data_columns.append(table.column(i)) +# data_column_names.append(name) + +# # Step 2: Create source_info columns for each regular column +# source_info_columns = [] +# source_info_column_names = [] + +# # Create source_info columns for each regular column +# num_rows = table.num_rows + +# for col_name in data_column_names: +# source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" + +# # if col_name is in source_info, use that value +# if col_name in source_info: +# # Use value from source_info dictionary +# source_value = source_info[col_name] +# source_values = pa.array([source_value] * num_rows, type=pa.large_string()) +# # if col_name is in existing_source_info, use that column +# elif col_name in existing_source_info: +# # Use existing source_info column, but convert to large_string +# existing_col = existing_source_info[col_name] +# if existing_col.type == pa.large_string(): +# source_values = existing_col +# else: +# # Convert to large_string +# source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + +# else: +# # Use null values +# source_values = pa.array([None] * num_rows, type=pa.large_string()) + +# source_info_columns.append(source_values) +# source_info_column_names.append(source_info_col_name) + +# # Step 3: Create the final table +# data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) +# source_info_table: pa.Table = pa.Table.from_arrays( +# source_info_columns, names=source_info_column_names +# ) +# return data_table, source_info_table class ArrowDatagram: @@ -482,10 +526,12 @@ def __init__( self._data_context = DataContext.resolve_data_context(data_context) - schema = pa.schema({DataContext.get_data_context_column(): pa.large_string()}) - self._context_info_table = pa.Table.from_pylist( + data_context_schema = pa.schema( + {DataContext.get_data_context_column(): pa.large_string()} + ) + self._data_context_table = pa.Table.from_pylist( [{DataContext.get_data_context_column(): self._data_context.context_key}], - schema=schema, + schema=data_context_schema, ) # create semantic converter @@ -510,7 +556,7 @@ def data_context_key(self) -> str: def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" if include_data_context: - return arrow_utils.hstack_tables(self._table, self._context_info_table) + return arrow_utils.hstack_tables(self._table, self._data_context_table) return self._table 
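# --- Editor's illustrative sketch (not part of the patch) ---
# ArrowDatagram above builds a one-row data-context table once and horizontally
# stacks it onto the data table when include_data_context=True. A standalone
# illustration of that pattern; the column name "_orcapod_context_key" and the key
# value are assumptions for this example, and plain pyarrow is used in place of the
# project's arrow_utils.hstack_tables helper.
import pyarrow as pa

data = pa.table({"path": ["a.txt"], "size": [123]})

context_schema = pa.schema({"_orcapod_context_key": pa.large_string()})
context_table = pa.Table.from_pylist(
    [{"_orcapod_context_key": "std:v0.1.0:deadbeef"}], schema=context_schema
)

# horizontal stack: same row count, disjoint column names
combined = pa.Table.from_arrays(
    data.columns + context_table.columns,
    names=data.column_names + context_table.column_names,
)
print(combined.column_names)  # ['path', 'size', '_orcapod_context_key']
# --- end editor's sketch ---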
def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: @@ -546,13 +592,32 @@ def content_hash( def keys(self) -> tuple[str, ...]: return tuple(self._table.column_names) - def types(self) -> schemas.PythonSchema: + def types(self, include_data_context: bool = False) -> schemas.PythonSchema: """Return copy of the Python schema.""" if self._cached_python_schema is None: self._cached_python_schema = ( self._semantic_converter.from_arrow_to_python_schema(self._table.schema) ) - return self._cached_python_schema.copy() + schema = self._cached_python_schema.copy() + if include_data_context: + schema[DataContext.get_data_context_column()] = str + return schema + + def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + if include_data_context: + return arrow_utils.join_arrow_schemas( + self._table.schema, self._data_context_table.schema + ) + return self._table.schema @classmethod def _from_copy( @@ -653,7 +718,6 @@ def __init__( self, data: pa.Table, source_info: dict[str, str | None] | None = None, - skip_source_info_extraction: bool = False, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -665,19 +729,13 @@ def __init__( if source_info is None: source_info = {} - if not skip_source_info_extraction: - # normalize the table to ensure it has the expected source_info columns - data_table, self._source_info_table = prepare_system_data_tables( - data, source_info - ) - else: # assume that data already contains source info columns with appropriate prefixes - data_columns: tuple[str, ...] = tuple( - [c for c in data.column_names if not c.startswith(SOURCE_INFO_PREFIX)] - ) - source_columns = [f"{SOURCE_INFO_PREFIX}{c}" for c in data_columns] - # Add conversion to large_string type - data_table = data.select(data_columns) - self._source_info_table = data.select(source_columns) + # normalize the table to ensure it has the expected source_info columns + data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( + data, + {SOURCE_INFO_PREFIX: source_info}, + exclude_columns=[DataContext.get_data_context_column()], + ) + self._source_info_table = prefixed_tables[SOURCE_INFO_PREFIX] super().__init__( data_table, @@ -700,11 +758,45 @@ def as_table( table = arrow_utils.hstack_tables( table, self._source_info_table.select( - [f"{SOURCE_INFO_PREFIX}{c}" for c in table.column_names] + [ + f"{SOURCE_INFO_PREFIX}{c}" + for c in table.column_names + if c in self.keys() + ] ), ) return table + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types(include_data_context=include_data_context) + if include_source: + for key in self.keys(): + schema[f"{SOURCE_INFO_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. 
+ + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema(include_data_context=include_data_context) + if include_source: + return arrow_utils.join_arrow_schemas( + schema, self._source_info_table.schema + ) + return schema + def as_dict( self, include_data_context: bool = False, include_source: bool = False ) -> dict[str, DataValue]: @@ -753,7 +845,6 @@ def copy(self) -> Self: self.source_info(), semantic_converter=self._semantic_converter, data_context=self._data_context, - skip_source_info_extraction=True, ) new_packet._cached_source_info = self._cached_source_info new_packet._cached_python_dict = self._cached_python_dict diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index f77f7e1..09cf09f 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -44,10 +44,6 @@ def __init__( self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self.fixed_input_streams = fixed_input_streams - @property - def data_context_key(self) -> str: - return self._data_context.context_key - @property def data_context(self) -> DataContext: return self._data_context diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index a625a3a..cd06f34 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -20,6 +20,7 @@ from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter from orcapod.types import typespec_utils as tsutils +import pyarrow as pa logger = logging.getLogger(__name__) @@ -361,7 +362,7 @@ class WrappedPod(ActivatablePodBase): def __init__( self, - pod: FunctionPod, + pod: dp.Pod, fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, **kwargs, @@ -369,22 +370,6 @@ def __init__( super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self.pod = pod - @property - def data_context_key(self) -> str: - """ - Return the data context for the wrapped pod. - This is used to resolve semantic types and other context-specific information. - """ - return self.pod.data_context_key - - @property - def data_context(self) -> DataContext: - """ - Return the data context for the wrapped pod. - This is used to resolve semantic types and other context-specific information. - """ - return self.pod.data_context - @property def kernel_id(self) -> tuple[str, ...]: """ @@ -431,24 +416,24 @@ class CachedPod(WrappedPod): def __init__( self, - pod: FunctionPod, + pod: dp.Pod, result_store: ArrowDataStore, lineage_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] = (), - data_context: str | DataContext | None = None, **kwargs, ): super().__init__(pod, **kwargs) self.record_path_prefix = record_path_prefix self.result_store = result_store self.lineage_store = lineage_store + # unset data_context native to the object self.pod_hash = self.data_context.object_hasher.hash_to_hex( self.pod, prefix_hasher_id=True ) @property - def pod_id(self) -> tuple[str, ...]: + def kernel_id(self) -> tuple[str, ...]: """ Return the pod ID, which is the function name of the wrapped pod. This is used to identify the pod in the system. @@ -461,7 +446,7 @@ def record_path(self) -> tuple[str, ...]: Return the path to the record in the result store. This is used to store the results of the pod. 
""" - return self.record_path_prefix + self.pod_id + return self.record_path_prefix + self.kernel_id def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: output_packet = self.get_recorded_output_packet(packet) @@ -481,10 +466,19 @@ def record_packet( """ Record the output packet against the input packet in the result store. """ + data_table = output_packet.as_table( + include_data_context=True, include_source=True + ) + + data_table = data_table.append_column( + f"_input_packet{DataContext.get_data_context_column()}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + result_flag = self.result_store.record_data( self.record_path, input_packet.content_hash(), - output_packet.as_table(include_source=True), + data_table, ignore_duplicates=ignore_duplicates, ) if result_flag is None: @@ -507,8 +501,7 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non return None return ArrowPacket( - result_table, - semantic_converter=self.pod._output_semantic_converter, - data_context=self.data_context, - skip_source_info_extraction=True, + result_table.drop( + [f"_input_packet{DataContext.get_data_context_column()}"] + ), ) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index ebe0249..a5c2434 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -318,8 +318,8 @@ class ImmutableTableStream(StreamBase): def __init__( self, table: pa.Table, - source_info: dict[str, str | None] | None = None, tag_columns: Collection[str] = (), + source_info: dict[str, str | None] | None = None, source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), **kwargs, @@ -340,15 +340,19 @@ def __init__( prefix_info = {SOURCE_INFO_PREFIX: source_info} - table, prefix_tables = arrow_utils.prepare_prefixed_columns(table, prefix_info) - self._table = table - self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] - self._data_context_table = data_context_table - + # determine tag columns first and then exclude any source info self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) + table, prefix_tables = arrow_utils.prepare_prefixed_columns( + table, prefix_info, exclude_columns=self._tag_columns + ) + # now table should only contain tag columns and packet columns self._packet_columns = tuple( c for c in table.column_names if c not in tag_columns ) + self._table = table + self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] + self._data_context_table = data_context_table + if len(self._packet_columns) == 0: raise ValueError( "No packet columns found in the table. At least one packet column is required." 
@@ -565,27 +569,26 @@ def as_table( if self._cached_output_table is None: all_tags = [] all_packets = [] + tag_schema, packet_schema = None, None for tag, packet in self.iter_packets(): - # TODO: evaluate handling efficiency here + if tag_schema is None: + tag_schema = tag.arrow_schema() + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_data_context=True, + include_source=True, + ) all_tags.append(tag.as_dict()) all_packets.append( packet.as_dict(include_data_context=True, include_source=True) ) - all_tags: pa.Table = pa.Table.from_pylist(all_tags) - all_packets: pa.Table = pa.Table.from_pylist(all_packets) - # assert that column names do not overlap - overlapping_columns = set(all_tags.column_names) & set( - all_packets.column_names - ) - if overlapping_columns: - raise ValueError( - f"Column names overlap between tags and packets: {overlapping_columns}. Overlapping tag and packet columns are not supported yet." - ) - self._cached_output_table = pa.Table.from_arrays( - all_tags.columns + all_packets.columns, - names=all_tags.column_names + all_packets.column_names, + all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema) + all_packets: pa.Table = pa.Table.from_pylist( + all_packets, schema=packet_schema ) + + self._cached_output_table = arrow_utils.hstack_tables(all_tags, all_packets) assert self._cached_output_table is not None, ( "_cached_output_table should not be None here." ) diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 2e0e927..012edaa 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -39,7 +39,7 @@ def data_context_key(self) -> str: """ ... - def types(self) -> TypeSpec: + def types(self, include_data_context: bool = False) -> TypeSpec: """ Return the type specification for this datagram. @@ -51,6 +51,19 @@ def types(self) -> TypeSpec: """ ... + def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + The schema provides a structured representation of the datagram's + fields and their types, enabling efficient serialization and + deserialization with PyArrow. + + Returns: + pa.Schema: PyArrow schema representation of the datagram + """ + ... + def keys(self) -> Collection[str]: """ Return the available keys/fields in this datagram. @@ -203,6 +216,36 @@ def source_info(self) -> dict[str, str | None]: """ ... + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> TypeSpec: + """ + Return the type specification for this packet. + + Args: + include_source: If True, source information is included in the typespec + for debugging and lineage tracking + + Returns: + TypeSpec: Dictionary mapping field names to Python types + """ + ... + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this packet. + + Args: + include_source: If True, source information is included in the schema + for debugging and lineage tracking + + Returns: + pa.Schema: PyArrow schema representation of packet data + """ + ... + # def join(self, other: "Packet") -> "Packet": ... # def get_as(self, packet_type: PacketType) -> PacketType: ... @@ -539,20 +582,6 @@ class Kernel(ContentIdentifiable, Labelable, Protocol): full tracking) and testing/debugging (without side effects). 
""" - @property - def data_context_key(self) -> str: - """ - Return the data context key for this kernel. - - This key identifies the semantic type registry, arrow hasher, and other - contextual information needed to properly interpret and work with this - kernel across various operations (storage, visualization, processing, etc.). - - Returns: - str: Context key for proper kernel interpretation - """ - ... - @property def kernel_id(self) -> tuple[str, ...]: """ diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index d5ca902..618d7a4 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -12,7 +12,7 @@ def record_data( record_path: tuple[str, ...], record_id: str, data: pa.Table, - ignore_duplicates: bool = False, + ignore_duplicates: bool | None = None, ) -> str | None: ... def get_recorded_data( diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/stores/__init__.py index 573a316..434e2f4 100644 --- a/src/orcapod/stores/__init__.py +++ b/src/orcapod/stores/__init__.py @@ -1,14 +1,14 @@ -from .legacy.types import DataStore, ArrowDataStore -from .legacy.legacy_arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore -from .legacy.dict_data_stores import DirDataStore, NoOpDataStore -from .legacy.safe_dir_data_store import SafeDirDataStore +# from .legacy.types import DataStore, ArrowDataStore +# from .legacy.legacy_arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore +# from .legacy.dict_data_stores import DirDataStore, NoOpDataStore +# from .legacy.safe_dir_data_store import SafeDirDataStore -__all__ = [ - "DataStore", - "ArrowDataStore", - "DirDataStore", - "SafeDirDataStore", - "NoOpDataStore", - "MockArrowDataStore", - "SimpleParquetDataStore", -] +# __all__ = [ +# "DataStore", +# "ArrowDataStore", +# "DirDataStore", +# "SafeDirDataStore", +# "NoOpDataStore", +# "MockArrowDataStore", +# "SimpleParquetDataStore", +# ] diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index f8f0451..f04a7b7 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -385,15 +385,15 @@ def record_data( entry_id: str, data: pa.Table, force_flush: bool = False, - error_on_duplicate: bool | None = None, + ignore_duplicates: bool | None = None, ) -> pa.Table: self._validate_source_path(record_path) source_key = self._get_source_key(record_path) # Check for existing entry - if error_on_duplicate is None: - error_on_duplicate = self.duplicate_entry_behavior == "error" - if error_on_duplicate: + if ignore_duplicates is None: + ignore_duplicates = self.duplicate_entry_behavior != "error" + if not ignore_duplicates: pending_table = self._pending_batches[source_key].get(entry_id, None) if pending_table is not None: raise ValueError( @@ -480,8 +480,10 @@ def get_recorded_data( # check if entry_id is found in pending batches source_key = self._get_source_key(record_path) if entry_id in self._pending_batches[source_key]: - # Return the pending record directly - return self._pending_batches[source_key][entry_id] + # Return the pending record after removing the entry id column + return self._remove_entry_id_column( + self._pending_batches[source_key][entry_id] + ) delta_table = self._get_existing_delta_table(record_path) if delta_table is None: diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index f9a6d7f..5237eb3 100644 --- a/src/orcapod/utils/arrow_utils.py +++ 
b/src/orcapod/utils/arrow_utils.py @@ -167,6 +167,7 @@ def prepare_prefixed_columns( prefix_info: Collection[str] | Mapping[str, Any | None] | Mapping[str, Mapping[str, Any | None]], + exclude_columns: Collection[str] = (), ) -> tuple[pa.Table, dict[str, pa.Table]]: """ """ all_prefix_info = {} @@ -208,11 +209,13 @@ def prepare_prefixed_columns( prefixed_column_names = defaultdict(list) prefixed_columns = defaultdict(list) + target_column_names = [c for c in data_column_names if c not in exclude_columns] + for prefix, value_lut in all_prefix_info.items(): target_prefixed_column_names = prefixed_column_names[prefix] target_prefixed_columns = prefixed_columns[prefix] - for col_name in data_column_names: + for col_name in target_column_names: prefixed_col_name = f"{prefix}{col_name}" existing_columns = existing_prefixed_columns[prefix] @@ -248,3 +251,17 @@ def prepare_prefixed_columns( prefixed_columns[prefix], names=prefixed_column_names[prefix] ) return data_table, result_tables + + +def drop_schema_columns(schema: pa.Schema, columns: Collection[str]) -> pa.Schema: + """ + Drop specified columns from a PyArrow schema. + + Args: + schema (pa.Schema): The original schema. + columns (list[str]): List of column names to drop. + + Returns: + pa.Schema: New schema with specified columns removed. + """ + return pa.schema([field for field in schema if field.name not in columns]) From 8b84c02e68f84a9be71de2313593b721674fbffd Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 04:42:25 +0000 Subject: [PATCH 078/224] wip: further refinement of datagram implementations --- src/orcapod/__init__.py | 46 +- src/orcapod/data/__init__.py | 6 + src/orcapod/data/context.py | 10 - src/orcapod/data/datagrams.py | 859 ------- src/orcapod/data/datagrams/__init__.py | 13 + src/orcapod/data/datagrams/arrow_datagram.py | 867 +++++++ .../data/datagrams/arrow_tag_packet.py | 268 ++ src/orcapod/data/datagrams/base.py | 301 +++ src/orcapod/data/datagrams/dict_datagram.py | 835 ++++++ src/orcapod/data/datagrams/dict_tag_packet.py | 256 ++ src/orcapod/data/kernels.py | 5 + src/orcapod/data/old_datagrams.py | 2281 +++++++++++++++++ src/orcapod/data/pods.py | 156 +- src/orcapod/data/streams.py | 48 +- src/orcapod/data/system_constants.py | 25 + src/orcapod/hashing/arrow_hashers.py | 3 +- src/orcapod/hashing/object_hashers.py | 2 +- src/orcapod/protocols/data_protocols.py | 778 +++++- src/orcapod/protocols/hashing_protocols.py | 12 +- src/orcapod/protocols/store_protocols.py | 29 +- src/orcapod/stores/delta_lake_stores.py | 413 ++- src/orcapod/types/semantic_converter.py | 29 +- src/orcapod/types/semantic_types.py | 48 +- src/orcapod/types/typespec_utils.py | 9 +- src/orcapod/utils/arrow_utils.py | 9 +- 25 files changed, 5989 insertions(+), 1319 deletions(-) delete mode 100644 src/orcapod/data/datagrams.py create mode 100644 src/orcapod/data/datagrams/__init__.py create mode 100644 src/orcapod/data/datagrams/arrow_datagram.py create mode 100644 src/orcapod/data/datagrams/arrow_tag_packet.py create mode 100644 src/orcapod/data/datagrams/base.py create mode 100644 src/orcapod/data/datagrams/dict_datagram.py create mode 100644 src/orcapod/data/datagrams/dict_tag_packet.py create mode 100644 src/orcapod/data/old_datagrams.py create mode 100644 src/orcapod/data/system_constants.py diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index b4de8e1..b49b19c 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,45 +1,17 @@ from .data import DEFAULT_TRACKER_MANAGER +from .data.pods 
import function_pod, FunctionPod, CachedPod +from .data import streams +from .stores.delta_lake_stores import BasicDeltaTableArrowStore + no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking __all__ = [ "DEFAULT_TRACKER_MANAGER", "no_tracking", + "function_pod", + "FunctionPod", + "CachedPod", + "streams", + "BasicDeltaTableArrowStore", ] - -# from .core import operators, sources, streams -# from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator -# from . import hashing, stores -# from .core.operators import Join, MapPackets, MapTags, packet, tag -# from .core.pod import FunctionPod, function_pod -# from .core.sources import GlobSource -# from .stores import DirDataStore, SafeDirDataStore -# from .core.tracker import GraphTracker -# from .pipeline import Pipeline - -# DEFAULT_TRACKER = GraphTracker() -# DEFAULT_TRACKER.activate() - - -# __all__ = [ -# "hashing", -# "stores", -# "pod", -# "operators", -# "streams", -# "sources", -# "MapTags", -# "MapPackets", -# "Join", -# "tag", -# "packet", -# "FunctionPod", -# "function_pod", -# "GlobSource", -# "DirDataStore", -# "SafeDirDataStore", -# "DEFAULT_TRACKER", -# "SyncStreamFromLists", -# "SyncStreamFromGenerator", -# "Pipeline", -# ] diff --git a/src/orcapod/data/__init__.py b/src/orcapod/data/__init__.py index 6d7e206..eb005c1 100644 --- a/src/orcapod/data/__init__.py +++ b/src/orcapod/data/__init__.py @@ -1 +1,7 @@ from .trackers import DEFAULT_TRACKER_MANAGER +from .system_constants import orcapod_constants as constants + +__all__ = [ + "DEFAULT_TRACKER_MANAGER", + "constants", +] diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py index cc47cff..85261d2 100644 --- a/src/orcapod/data/context.py +++ b/src/orcapod/data/context.py @@ -6,9 +6,6 @@ from dataclasses import dataclass -DATA_CONTEXT_COLUMN = "_orcapod_context_key" - - @dataclass class DataContext: context_key: str @@ -16,13 +13,6 @@ class DataContext: arrow_hasher: hp.ArrowHasher object_hasher: hp.ObjectHasher - @staticmethod - def get_data_context_column() -> str: - """ - Returns the column name used to store the data context key in Arrow tables. - """ - return DATA_CONTEXT_COLUMN - @staticmethod def resolve_data_context(data_context: "str | DataContext | None") -> "DataContext": """ diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py deleted file mode 100644 index b506a56..0000000 --- a/src/orcapod/data/datagrams.py +++ /dev/null @@ -1,859 +0,0 @@ -""" -Data structures and utilities for working with datagrams in OrcaPod. - -This module provides classes and functions for handling packet-like data structures -that can represent data in various formats (Python dicts, Arrow tables, etc.) while -maintaining type information, source metadata, and semantic type conversion capability. - -Key classes: -- SemanticConverter: Converts between different data representations. Intended for internal use. -- DictDatagram: Immutable dict-based data structure -- PythonDictPacket: Python dict-based packet with source info -- ArrowPacket: Arrow table-based packet implementation -- PythonDictTag/ArrowTag: Tag implementations for data identification - -The module also provides utilities for schema validation, table operations, -and type conversions between semantic stores, Python stores, and Arrow tables. 
-""" - -from orcapod.types.core import DataValue -from typing import TypeAlias, Self -from collections.abc import Mapping, Collection -from orcapod.types import TypeSpec -from orcapod.types.semantic_converter import SemanticConverter -from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types import schemas -from orcapod.types import typespec_utils as tsutils -from orcapod.data.context import ( - DataContext, -) -import pyarrow as pa -import logging -from orcapod.utils import arrow_utils - - -# Constants used for source info keys -SOURCE_INFO_PREFIX = "_source_info_" - - -logger = logging.getLogger(__name__) -# A conveniece packet-like type that defines a value that can be -# converted to a packet. It's broader than Packet and a simple mapping -# from string keys to DataValue (e.g., int, float, str) can be regarded -# as PacketLike, allowing for more flexible interfaces. -# Anything that requires Packet-like data but without the strict features -# of a Packet should accept PacketLike. -# One should be careful when using PacketLike as a return type as it does not -# enforce the typespec or source_info, which are important for packet integrity. -PacketLike: TypeAlias = Mapping[str, DataValue] - -PythonStore: TypeAlias = Mapping[str, DataValue] - - -class ImmutableDict(Mapping[str, DataValue]): - """ - An immutable dictionary-like container for DataValues. - - Provides a read-only view of a dictionary mapping strings to DataValues, - implementing the Mapping protocol for compatibility with dict-like operations. - - Initialize with data from a mapping. - Args: - data: Source mapping to copy data from - """ - - def __init__(self, data: Mapping[str, DataValue]): - self._data = dict(data) - - def __getitem__(self, key: str) -> DataValue: - return self._data[key] - - def __iter__(self): - return iter(self._data) - - def __len__(self) -> int: - return len(self._data) - - def __repr__(self) -> str: - return self._data.__repr__() - - def __str__(self) -> str: - return self._data.__str__() - - -# TODO: Inherit from Mapping instead to provide immutable datagram -class DictDatagram(ImmutableDict): - """ - An immutable datagram implementation using a dictionary backend. - - Extends ImmutableDict to provide additional functionality for type handling, - semantic conversion, and Arrow table representation while maintaining - immutability of the underlying data. - - - Initialize DictDatagram with data and optional type information. 
- - Args: - data: Source data mapping - typespec: Optional type specification for fields - semantic_converter: Optional converter for semantic types - semantic_type_registry: Registry for semantic type lookup - arrow_hasher: Optional hasher for Arrow table content - """ - - def __init__( - self, - data: Mapping[str, DataValue], - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - super().__init__(data) - # normalize the data content and remove any source info keys - self._data_context = DataContext.resolve_data_context(data_context) - - # combine provided typespec info with inferred typespec from content - inferred_typespec = tsutils.get_typespec_from_dict(self, typespec) - self._python_schema = schemas.PythonSchema(inferred_typespec) - - # create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - self._python_schema.to_semantic_schema( - semantic_type_registry=self._data_context.semantic_type_registry - ), - ) - self.semantic_converter = semantic_converter - - self._cached_table: pa.Table | None = None - self._cached_content_hash: str | None = None - self._cached_arrow_schema: pa.Schema | None = None - - @property - def data_context_key(self) -> str: - """Return the context key of the datagram.""" - return self._data_context.context_key - - def as_table(self, include_data_context: bool = False) -> pa.Table: - """Convert the packet to an Arrow table.""" - - if self._cached_table is None: - self._cached_table = self.semantic_converter.from_python_to_arrow( - self.as_dict(include_data_context=True), - self.types(include_data_context=True), - ) - assert self._cached_table is not None, "Cached table should not be None" - if include_data_context: - return self._cached_table - - return self._cached_table.drop([DataContext.get_data_context_column()]) - - def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: - """Return dictionary representation of the datagram.""" - data = dict(self) - if include_data_context: - data[DataContext.get_data_context_column()] = self._data_context.context_key - return data - - def content_hash( - self, - ) -> str: - """ - Calculate and return content hash of the datagram. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_data_context=False), - prefix_hasher_id=True, - ) - return self._cached_content_hash - - def types(self, include_data_context: bool = False) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = self._python_schema.copy() - if include_data_context: - schema[DataContext.get_data_context_column()] = str - return schema - - def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. 
- - Args: - include_data_context: Whether to include data context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - if self._cached_arrow_schema is None: - self._cached_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self.types(include_data_context=True) - ) - ) - if not include_data_context: - return arrow_utils.drop_schema_columns( - self._cached_arrow_schema, - [DataContext.get_data_context_column()], - ) - return self._cached_arrow_schema - - @classmethod - def _from_copy( - cls, - data: Mapping[str, DataValue], - python_schema: schemas.PythonSchema, - semantic_converter: SemanticConverter, - data_context: DataContext, - ) -> Self: - """Create a new instance from copy without full initialization.""" - instance = cls.__new__(cls) - ImmutableDict.__init__(instance, data) - - # Set attributes directly - instance._python_schema = python_schema - instance.semantic_converter = semantic_converter - instance._data_context = data_context - instance._cached_table = None - instance._cached_content_hash = None - - return instance - - def copy(self) -> Self: - """Return a copy of the datagram.""" - return self._from_copy( - self, - self._python_schema.copy(), - self.semantic_converter, - self._data_context, - ) - - -class DictTag(DictDatagram): - """ - A simple tag implementation using Python dictionary. - - Represents a tag (metadata) as a dictionary that can be converted - to different representations like Arrow tables. - """ - - -class DictPacket(DictDatagram): - """ - Enhanced packet implementation with source information support. - - Extends DictDatagram to include source information tracking and - enhanced table conversion capabilities that can include or exclude - source metadata. - - Initialize packet with data and optional source information. - - Args: - data: Primary data content - source_info: Optional mapping of field names to source information - typespec: Optional type specification - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types. Defaults to system default registry. - arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. 
- """ - - def __init__( - self, - data: Mapping[str, DataValue], - source_info: Mapping[str, str | None] | None = None, - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - # normalize the data content and remove any source info keys - data_only = { - k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX) - } - contained_source_info = { - k.removeprefix(SOURCE_INFO_PREFIX): v - for k, v in data.items() - if k.startswith(SOURCE_INFO_PREFIX) - } - - super().__init__( - data_only, - typespec=typespec, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._source_info = {**contained_source_info, **(source_info or {})} - self._cached_source_info_table: pa.Table | None = None - self._cached_source_info_schema: pa.Schema | None = None - - @property - def _source_info_schema(self) -> pa.Schema: - if self._cached_source_info_schema is None: - self._cached_source_info_schema = pa.schema( - {f"{SOURCE_INFO_PREFIX}{k}": pa.large_string() for k in self.keys()} - ) - return self._cached_source_info_schema - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - """Convert the packet to an Arrow table.""" - table = super().as_table(include_data_context=include_data_context) - if include_source: - if self._cached_source_info_table is None: - source_info_data = { - f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items() - } - self._cached_source_info_table = pa.Table.from_pylist( - [source_info_data], schema=self._source_info_schema - ) - assert self._cached_source_info_table is not None, ( - "Cached source info table should not be None" - ) - # subselect the corresponding _source_info as the columns present in the data table - source_info_table = self._cached_source_info_table.select( - [ - f"{SOURCE_INFO_PREFIX}{k}" - for k in table.column_names - if k in self.keys() - ] - ) - table = arrow_utils.hstack_tables(table, source_info_table) - return table - - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: - """ - Return dictionary representation. - - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - dict_copy = super().as_dict(include_data_context=include_data_context) - if include_source: - for key, value in self.source_info().items(): - dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value - return dict_copy - - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types(include_data_context=include_data_context) - if include_source: - for key in self.keys(): - schema[f"{SOURCE_INFO_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. 
- - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema(include_data_context=include_data_context) - if include_source: - return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) - return schema - - def as_datagram(self, include_source: bool = False) -> DictDatagram: - """ - Convert the packet to a DictDatagram. - - Args: - include_source: Whether to include source info fields - - Returns: - DictDatagram representation of the packet - """ - data = self.as_dict(include_source=include_source) - typespec = self.types(include_source=include_source) - return DictDatagram( - data, - typespec=typespec, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. - - Returns: - Dictionary mapping field names to their source info - """ - return {key: self._source_info.get(key, None) for key in self.keys()} - - def copy(self) -> Self: - """Return a shallow copy of the packet.""" - instance = super().copy() - instance._source_info = self._source_info.copy() - instance._cached_source_info_table = self._cached_source_info_table - return instance - - -# def prepare_system_data_tables( -# table: pa.Table, -# source_info: dict[str, str | None] | None = None, -# ) -> tuple[pa.Table, pa.Table]: -# """ -# Process a table to ensure proper source_info columns. - -# Args: -# table: Input PyArrow table -# source_info: optional dictionary mapping column names to source info values. If present, -# it will take precedence over existing source_info columns in the table. 
- -# Returns: -# tuple of table without any source info and another table only containing source info columns (with prefix) -# """ -# if source_info is None: -# source_info = {} - -# # Step 1: Separate source_info columns from regular columns -# data_columns = [] -# data_column_names = [] -# existing_source_info = {} - -# for i, name in enumerate(table.column_names): -# if name.startswith(SOURCE_INFO_PREFIX): -# # Extract the base column name -# base_name = name.removeprefix(SOURCE_INFO_PREFIX) -# existing_source_info[base_name] = table.column(i) -# else: -# data_columns.append(table.column(i)) -# data_column_names.append(name) - -# # Step 2: Create source_info columns for each regular column -# source_info_columns = [] -# source_info_column_names = [] - -# # Create source_info columns for each regular column -# num_rows = table.num_rows - -# for col_name in data_column_names: -# source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" - -# # if col_name is in source_info, use that value -# if col_name in source_info: -# # Use value from source_info dictionary -# source_value = source_info[col_name] -# source_values = pa.array([source_value] * num_rows, type=pa.large_string()) -# # if col_name is in existing_source_info, use that column -# elif col_name in existing_source_info: -# # Use existing source_info column, but convert to large_string -# existing_col = existing_source_info[col_name] -# if existing_col.type == pa.large_string(): -# source_values = existing_col -# else: -# # Convert to large_string -# source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore - -# else: -# # Use null values -# source_values = pa.array([None] * num_rows, type=pa.large_string()) - -# source_info_columns.append(source_values) -# source_info_column_names.append(source_info_col_name) - -# # Step 3: Create the final table -# data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) -# source_info_table: pa.Table = pa.Table.from_arrays( -# source_info_columns, names=source_info_column_names -# ) -# return data_table, source_info_table - - -class ArrowDatagram: - """ - An immutable datagram implementation using a PyArrow Table backend. - TODO: handle RecordBatch in addition to table - - This basic datagram provides functionality for type handling, - semantic conversion, and dict-based content representation while maintaining - immutability of the underlying data. - - - Initialize ArrowDatagram with a PyArrow table. - - Args: - data: Source data mapping - typespec: Optional type specification for fields - semantic_converter: Optional converter for semantic types - semantic_type_registry: Registry for semantic type lookup - arrow_hasher: Optional hasher for Arrow table content - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - # normalize the table to ensure it contains proper source columns - if len(table) != 1: - raise ValueError( - "Table must contain exactly one row to be a valid datagram." 
- ) - - # TODO: add check for compatible types, especially of str being pa.large_string - table, data_context_table = arrow_utils.split_by_column_groups( - table, [DataContext.get_data_context_column()] - ) - - self._table = table - - if data_context is None and data_context_table is not None: - data_context = data_context_table[ - DataContext.get_data_context_column() - ].to_pylist()[0] - - self._data_context = DataContext.resolve_data_context(data_context) - - data_context_schema = pa.schema( - {DataContext.get_data_context_column(): pa.large_string()} - ) - self._data_context_table = pa.Table.from_pylist( - [{DataContext.get_data_context_column(): self._data_context.context_key}], - schema=data_context_schema, - ) - - # create semantic converter - # TODO: consider some validation of passed semantic_converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema( - self._table.schema, - self._data_context.semantic_type_registry, - ) - ) - self._semantic_converter = semantic_converter - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_python_dict: dict[str, DataValue] | None = None - self._cached_content_hash: str | None = None - - @property - def data_context_key(self) -> str: - """Return the context key of the datagram.""" - return self._data_context.context_key - - def as_table(self, include_data_context: bool = False) -> pa.Table: - """Convert the packet to an Arrow table.""" - if include_data_context: - return arrow_utils.hstack_tables(self._table, self._data_context_table) - return self._table - - def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: - """Return dictionary representation of the datagram.""" - if self._cached_python_dict is None: - self._cached_python_dict = self._semantic_converter.from_arrow_to_python( - self.as_table(include_data_context=False) - )[0] - assert self._cached_python_dict is not None, "Cached dict should not be None" - output = dict(self._cached_python_dict) - if include_data_context: - output[DataContext.get_data_context_column()] = ( - self._data_context.context_key - ) - return output - - def content_hash( - self, - ) -> str: - """ - Calculate and return content hash of the datagram. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_data_context=False), - prefix_hasher_id=True, - ) - return self._cached_content_hash - - def keys(self) -> tuple[str, ...]: - return tuple(self._table.column_names) - - def types(self, include_data_context: bool = False) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - if self._cached_python_schema is None: - self._cached_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema(self._table.schema) - ) - schema = self._cached_python_schema.copy() - if include_data_context: - schema[DataContext.get_data_context_column()] = str - return schema - - def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. 
- - Args: - include_data_context: Whether to include data context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - if include_data_context: - return arrow_utils.join_arrow_schemas( - self._table.schema, self._data_context_table.schema - ) - return self._table.schema - - @classmethod - def _from_copy( - cls, - table: pa.Table, - python_schema: schemas.PythonSchema, - semantic_converter: SemanticConverter, - arrow_hasher: hp.ArrowHasher, - ) -> Self: - """Create a new instance from copy without full initialization.""" - instance = cls.__new__(cls) - instance._table = table - instance._semantic_converter = semantic_converter - instance._data_context = arrow_hasher - - # Set attributes directly - instance._cached_content_hash = None - - return instance - - def copy(self) -> Self: - """Return a copy of the datagram.""" - new_datagram = self.__class__( - self._table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_datagram._cached_python_schema = self._cached_python_schema - new_datagram._cached_python_dict = self._cached_python_dict - new_datagram._cached_python_dict = self._cached_python_dict - return new_datagram - - def __repr__(self) -> str: - """Return string representation.""" - return f"{self.as_dict()}" - - -class ArrowTag(ArrowDatagram): - """ - A tag implementation using Arrow table backend. - - Represents a single-row Arrow table that can be converted to Python - dictionary representation while caching computed values for efficiency. - - Initialize with an Arrow table. - - Args: - table: Single-row Arrow table representing the tag - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(table) != 1: - raise ValueError( - "ArrowTag should only contain a single row, " - "as it represents a single tag." - ) - super().__init__( - table=table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - -class ArrowPacket(ArrowDatagram): - """ - Arrow table-based packet implementation with comprehensive features. - - A packet implementation that uses Arrow tables as the primary storage format, - providing efficient memory usage and columnar data operations while supporting - source information tracking and content hashing. - - - Initialize ArrowPacket with Arrow table and configuration. - - Args: - table: Single-row Arrow table representing the packet - source_info: Optional source information mapping - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - finger_print: Optional fingerprint for tracking - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - skip_source_info_extraction: Whether to skip source info processing - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - data: pa.Table, - source_info: dict[str, str | None] | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(data) != 1: - raise ValueError( - "ArrowPacket should only contain a single row, " - "as it represents a single packet." 
- ) - if source_info is None: - source_info = {} - - # normalize the table to ensure it has the expected source_info columns - data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( - data, - {SOURCE_INFO_PREFIX: source_info}, - exclude_columns=[DataContext.get_data_context_column()], - ) - self._source_info_table = prefixed_tables[SOURCE_INFO_PREFIX] - - super().__init__( - data_table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._cached_source_info: dict[str, str | None] | None = None - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_content_hash: str | None = None - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - table = super().as_table(include_data_context=include_data_context) - if include_source: - # add source_info only for existing data columns - table = arrow_utils.hstack_tables( - table, - self._source_info_table.select( - [ - f"{SOURCE_INFO_PREFIX}{c}" - for c in table.column_names - if c in self.keys() - ] - ), - ) - return table - - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types(include_data_context=include_data_context) - if include_source: - for key in self.keys(): - schema[f"{SOURCE_INFO_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema(include_data_context=include_data_context) - if include_source: - return arrow_utils.join_arrow_schemas( - schema, self._source_info_table.schema - ) - return schema - - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: - """ - Convert to dictionary representation. - - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - return_dict = super().as_dict(include_data_context=include_data_context) - if include_source: - return_dict.update( - {f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items()} - ) - return return_dict - - def as_datagram(self, include_source: bool = False) -> ArrowDatagram: - table = self.as_table(include_source=include_source) - return ArrowDatagram( - table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. 
- - Returns: - Copy of the dictionary mapping field names to their source info - """ - if self._cached_source_info is None: - self._cached_source_info = { - k.removeprefix(SOURCE_INFO_PREFIX): v - for k, v in self._source_info_table.to_pylist()[0].items() - } - return self._cached_source_info.copy() - - def copy(self) -> Self: - # TODO: restructure copy to allow for better inheritance and expansion - new_packet = self.__class__( - self.as_table(), - self.source_info(), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_packet._cached_source_info = self._cached_source_info - new_packet._cached_python_dict = self._cached_python_dict - new_packet._cached_python_schema = self._cached_python_schema - new_packet._cached_content_hash = self._cached_content_hash - - return new_packet - - -# a batch is a tuple of a tag and a list of packets -Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] -"""Type alias for a batch: a tuple containing a tag and collection of packets.""" diff --git a/src/orcapod/data/datagrams/__init__.py b/src/orcapod/data/datagrams/__init__.py new file mode 100644 index 0000000..0c255e3 --- /dev/null +++ b/src/orcapod/data/datagrams/__init__.py @@ -0,0 +1,13 @@ +from .arrow_datagram import ArrowDatagram +from .arrow_tag_packet import ArrowTag, ArrowPacket +from .dict_datagram import DictDatagram +from .dict_tag_packet import DictTag, DictPacket + +__all__ = [ + "ArrowDatagram", + "ArrowTag", + "ArrowPacket", + "DictDatagram", + "DictTag", + "DictPacket", +] diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py new file mode 100644 index 0000000..5ed5307 --- /dev/null +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -0,0 +1,867 @@ +import logging +from collections.abc import Collection, Iterator, Mapping +from typing import Any, Self + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.data.datagrams.base import BaseDatagram +from orcapod.types import schemas, typespec_utils +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +logger = logging.getLogger(__name__) + + +class ArrowDatagram(BaseDatagram): + """ + Immutable datagram implementation using PyArrow Table as storage backend. + + This implementation provides high-performance columnar data operations while + maintaining the datagram interface. It efficiently handles type conversions, + semantic processing, and interoperability with Arrow-based tools. + + The underlying table is split into separate components: + - Data table: Primary business data columns + - Meta table: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context table: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> table = pa.Table.from_pydict({ + ... "user_id": [123], + ... "name": ["Alice"], + ... "__pipeline_version": ["v2.1.0"], + ... "{orcapod.CONTEXT_KEY}": ["financial_v1"] + ... 
}) + >>> datagram = ArrowDatagram(table) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize ArrowDatagram from PyArrow Table. + + Args: + table: PyArrow Table containing the data. Must have exactly one row. + semantic_converter: Optional converter for semantic type handling. + If None, will be created based on the data context and table schema. + data_context: Context key string or DataContext object. + If None and table contains context column, will extract from table. + + Raises: + ValueError: If table doesn't contain exactly one row. + + Note: + The input table is automatically split into data, meta, and context + components based on column naming conventions. + """ + # Validate table has exactly one row for datagram + if len(table) != 1: + raise ValueError( + "Table must contain exactly one row to be a valid datagram." + ) + + # Split table into data, meta, and context components + context_columns = ( + [constants.CONTEXT_KEY] + if constants.CONTEXT_KEY in table.column_names + else [] + ) + meta_columns = [ + col for col in table.column_names if col.startswith(constants.META_PREFIX) + ] + + # Extract context table if present + if constants.CONTEXT_KEY in table.column_names and data_context is None: + context_table = table.select([constants.CONTEXT_KEY]) + data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] + + # Initialize base class with data context + super().__init__(data_context) + + # Split table into components + self._data_table = table.drop_columns(context_columns + meta_columns) + self._meta_table = table.select(meta_columns) if meta_columns else None + if len(self._data_table.column_names) == 0: + raise ValueError("Data table must contain at least one data column.") + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + self._data_table.schema, + self._data_context.semantic_type_registry, + ) + ) + self._semantic_converter = semantic_converter + + # Create data context table + data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) + self._data_context_table = pa.Table.from_pylist( + [{constants.CONTEXT_KEY: self._data_context.context_key}], + schema=data_context_schema, + ) + + # Initialize caches + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_dict: dict[str, DataValue] | None = None + self._cached_meta_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + def _core_info(self) -> dict[str, Any]: + core_info = { + "data_table": self._data_table, + "meta_table": self._meta_table, + "data_context_table": self._data_context_table, + "semantic_converter": self._semantic_converter, + "cached_python_schema": self._cached_python_schema, + "cached_python_dict": self._cached_python_dict, + "cached_meta_python_schema": self._cached_meta_python_schema, + "cached_content_hash": self._cached_content_hash, + } + return core_info + + def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: + new_copy = object.__new__(self.__class__) + new_copy._data_table = core_info["data_table"] + new_copy._meta_table = core_info["meta_table"] + new_copy._data_context_table = core_info["data_context_table"] + new_copy._semantic_converter = core_info["semantic_converter"] + 
new_copy._cached_python_schema = core_info["cached_python_schema"] + new_copy._cached_python_dict = core_info["cached_python_dict"] + new_copy._cached_meta_python_schema = core_info["cached_meta_python_schema"] + new_copy._cached_content_hash = core_info["cached_content_hash"] + return new_copy + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + if self._meta_table is None: + return () + return tuple(self._meta_table.column_names) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data_table.column_names: + raise KeyError(f"Data column '{key}' not found") + + return self._data_table[key].to_pylist()[0] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data_table.column_names + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data_table.column_names) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + if key in self._data_table.column_names: + return self.as_dict()[key] + return default + + # 3. Structural Information + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + # Start with data columns + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + result_keys = list(self._data_table.column_names) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. 
+ - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Get data schema (cached) + if self._cached_python_schema is None: + self._cached_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._data_table.schema + ) + ) + + schema = dict(self._cached_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if self._cached_meta_python_schema is None: + self._cached_meta_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._meta_table.schema + ) + ) + meta_schema = dict(self._cached_meta_python_schema) + if include_meta_columns is True: + schema.update(meta_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in meta_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + # order matters + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + all_schemas = [self._data_table.schema] + + # Add context schema if requested + if include_context: + # TODO: reassess the efficiency of this approach + all_schemas.append(self._data_context_table.schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_schema = self._meta_table.schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._meta_table.schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self._data_table, + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. 
Format Conversions (Export) + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Get data dict (cached) + if self._cached_python_dict is None: + self._cached_python_dict = self._semantic_converter.from_arrow_to_python( + self._data_table + )[0] + + result_dict = dict(self._cached_python_dict) + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta data if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_dict = self._meta_table.to_pylist()[0] + elif isinstance(include_meta_columns, Collection): + meta_dict = self._meta_table.to_pylist()[0] + # Include only meta columns matching prefixes + meta_dict = { + k: v + for k, v in meta_dict.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + if meta_dict is not None: + result_dict.update(meta_dict) + + return result_dict + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + all_tables = [self._data_table] + + # Add context if requested + if include_context: + all_tables.append(self._data_context_table) + + # Add meta columns if requested + if include_meta_columns and self._meta_table is not None: + meta_table = None + if include_meta_columns is True: + meta_table = self._meta_table + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + # ensure all given prefixes start with the meta prefix + prefixes = ( + f"{constants.META_PREFIX}{prefix}" + if not prefix.startswith(constants.META_PREFIX) + else prefix + for prefix in include_meta_columns + ) + + matched_cols = [ + col + for col in self._meta_table.column_names + if any(col.startswith(prefix) for prefix in prefixes) + ] + if matched_cols: + meta_table = self._meta_table.select(matched_cols) + else: + meta_table = None + + if meta_table is not None: + all_tables.append(meta_table) + + return arrow_utils.hstack_tables(*all_tables) + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get a meta column value. 
+ + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix) + default: Default value if not found + + Returns: + Meta column value + """ + if self._meta_table is None: + return default + + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + if key not in self._meta_table.column_names: + return default + + return self._meta_table[key].to_pylist()[0] + + def with_meta_columns(self, **meta_updates: DataValue) -> Self: + """ + Create a new ArrowDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New ArrowDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + meta_dict = {} + if self._meta_table is not None: + meta_dict = self._meta_table.to_pylist()[0] + + # Apply updates + meta_dict.update(prefixed_updates) + + # Create new meta table + new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None + + # Combine all tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: + """ + Create a new ArrowDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. + + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + + Returns: + New ArrowDatagram instance without specified meta columns + """ + if self._meta_table is None: + return self # No meta columns to drop + + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_table.column_names) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter meta columns + remaining_cols = [ + col for col in self._meta_table.column_names if col not in prefixed_keys + ] + + # Create new meta table + new_meta_table = ( + self._meta_table.select(remaining_cols) if remaining_cols else None + ) + + # Combine tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> Self: + """ + Create a new ArrowDatagram with only specified data columns. + Maintains immutability by returning a new instance. 
+ + Args: + *column_names: Data column names to keep + + Returns: + New ArrowDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data_table.column_names) + if missing_cols: + raise ValueError(f"Columns not found: {missing_cols}") + + new_data_table = self._data_table.select(list(column_names)) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: + """ + Create a new ArrowDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New ArrowDatagram instance without specified data columns + """ + + # Filter out specified data columns + missing = set(column_names) - set(self._data_table.column_names) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + # Filter data columns + remaining_cols = [ + col for col in self._data_table.column_names if col not in column_names + ] + + if not remaining_cols: + raise ValueError("Cannot drop all data columns") + + new_data_table = self._data_table.select(remaining_cols) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """ + Create a new ArrowDatagram with data columns renamed. + Maintains immutability by returning a new instance. + + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New ArrowDatagram instance with renamed data columns + """ + # Create new schema with renamed fields, preserving original types + new_fields = [] + for field in self._data_table.schema: + old_name = field.name + new_name = column_mapping.get(old_name, old_name) + new_field = pa.field(new_name, field.type) + new_fields.append(new_field) + + # Create new data table with renamed columns + new_schema = pa.schema(new_fields) + new_data_table = self._data_table.rename_columns( + [column_mapping.get(name, name) for name in self._data_table.column_names] + ).cast(new_schema) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def update(self, **updates: DataValue) -> Self: + """ + Create a new ArrowDatagram with specific column values updated. 
+ + Args: + **updates: Column names and their new values + + Returns: + New ArrowDatagram instance with updated values + + Raises: + KeyError: If any specified column doesn't exist + + Example: + # Convert relative path to absolute path + updated = datagram.update(file_path="/absolute/path/to/file.txt") + + # Update multiple values + updated = datagram.update(status="processed", file_path="/new/path") + """ + # Only update if there are columns to update + if not updates: + return self + + # Validate all columns exist + missing_cols = set(updates.keys()) - set(self._data_table.column_names) + if missing_cols: + raise KeyError( + f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}" + ) + + updates_typespec = schemas.PythonSchema( + {k: v for k, v in self.types().items() if k in updates} + ) + + update_table = self._semantic_converter.from_python_to_arrow( + updates, updates_typespec + ) + all_tables = [self._data_table.drop_columns(list(updates.keys())), update_table] + + if self._meta_table is not None: + all_tables.append(self._meta_table) + + return self.__class__( + table=arrow_utils.hstack_tables(*all_tables), + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """ + Create a new ArrowDatagram with new data columns added. + Maintains immutability by returning a new instance. + + Args: + column_updates: New data columns as a mapping + column_types: Optional type specifications for new columns + **kwargs: New data columns as keyword arguments + + Returns: + New ArrowDatagram instance with new data columns added + + Raises: + ValueError: If any column already exists (use update() instead) + """ + # Combine explicit updates with kwargs + + if not updates: + return self + + # Error if any column already exists + existing_overlaps = set(updates.keys()) & set(self._data_table.column_names) + if existing_overlaps: + raise ValueError( + f"Columns already exist: {sorted(existing_overlaps)}. " + f"Use update() to modify existing columns." + ) + + # TODO: consider simplifying this conversion logic + typespec = typespec_utils.get_typespec_from_dict(updates, column_types) + + updates_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_typespec( + typespec, self._data_context.semantic_type_registry + ) + ) + # TODO: cleanup the handling of typespec python schema and various conversion points + new_data_table = updates_converter.from_python_to_arrow(updates, typespec) + + # Combine with meta table for reconstruction + all_tables = [self._data_table, new_data_table] + if self._meta_table is not None: + all_tables.append(self._meta_table) + + combined_table = arrow_utils.hstack_tables(*all_tables) + + # prepare the joined converter + total_converter = self._semantic_converter.join(updates_converter) + + return self.__class__( + table=combined_table, + semantic_converter=total_converter, + data_context=self._data_context, + ) + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> Self: + """ + Create a new ArrowDatagram with a different data context key. + Maintains immutability by returning a new instance. 
+ + Args: + new_context_key: New data context key string + + Returns: + New ArrowDatagram instance with new context + """ + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + data_context=new_context_key, + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """Return a copy of the datagram.""" + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + new_datagram = self.__class__( + combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_content_hash = self._cached_content_hash + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + + Example: + >>> str(datagram) + "{'user_id': 123, 'name': 'Alice'}" + >>> print(datagram) + {'user_id': 123, 'name': 'Alice'} + """ + return str(self.as_dict()) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. + + Example: + >>> repr(datagram) + "ArrowDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')" + """ + data_dict = self.as_dict() + meta_count = len(self.meta_columns) + context_key = self.data_context_key + + return ( + f"{self.__class__.__name__}(" + f"data={data_dict}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py new file mode 100644 index 0000000..f776365 --- /dev/null +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -0,0 +1,268 @@ +import logging +from collections.abc import Collection +from typing import Self + + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.types import schemas +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +from orcapod.data.datagrams.arrow_datagram import ArrowDatagram + +logger = logging.getLogger(__name__) + + +class ArrowTag(ArrowDatagram): + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. 
+ + Args: + table: Single-row Arrow table representing the tag + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowTag should only contain a single row, " + "as it represents a single tag." + ) + super().__init__( + table=table, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + +class ArrowPacket(ArrowDatagram): + """ + Arrow table-based packet implementation with comprehensive features. + + A packet implementation that uses Arrow tables as the primary storage format, + providing efficient memory usage and columnar data operations while supporting + source information tracking and content hashing. + + + Initialize ArrowPacket with Arrow table and configuration. + + Args: + table: Single-row Arrow table representing the packet + source_info: Optional source information mapping + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + finger_print: Optional fingerprint for tracking + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + skip_source_info_extraction: Whether to skip source info processing + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + table: pa.Table, + source_info: dict[str, str | None] | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowPacket should only contain a single row, " + "as it represents a single packet." 
+ ) + if source_info is None: + source_info = {} + + # normalize the table to ensure it has the expected source_info columns + data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( + table, + {constants.SOURCE_PREFIX: source_info}, + exclude_columns=[constants.CONTEXT_KEY], + exclude_prefixes=[constants.META_PREFIX], + ) + self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] + + super().__init__( + data_table, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._cached_source_info: dict[str, str | None] | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> tuple[str, ...]: + keys = super().keys( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + keys += tuple(f"{constants.SOURCE_PREFIX}{k}" for k in self.keys()) + return keys + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + return arrow_utils.join_arrow_schemas( + schema, self._source_info_table.schema + ) + return schema + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: + """ + Convert to dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + return_dict = super().as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + return_dict.update( + { + f"{constants.SOURCE_PREFIX}{k}": v + for k, v in self.source_info().items() + } + ) + return return_dict + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + table = super().as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + # add source_info only for existing data columns + table = arrow_utils.hstack_tables(table, self._source_info_table) + return table + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> ArrowDatagram: + table = self.as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_source=include_source, + ) + return ArrowDatagram( + table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Copy of the dictionary mapping field names to their source info + """ + if self._cached_source_info is None: + self._cached_source_info = { + k.removeprefix(constants.SOURCE_PREFIX): v + for k, v in self._source_info_table.to_pylist()[0].items() + } + return self._cached_source_info.copy() + + def copy(self) -> Self: + # TODO: restructure copy to allow for better inheritance and expansion + new_packet = self.__class__( + self.as_table(), + self.source_info(), + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + new_packet._cached_source_info = self._cached_source_info + new_packet._cached_python_dict = self._cached_python_dict + new_packet._cached_python_schema = self._cached_python_schema + new_packet._cached_content_hash = self._cached_content_hash + + return new_packet diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py new file mode 100644 index 0000000..0ec1501 --- /dev/null +++ b/src/orcapod/data/datagrams/base.py @@ -0,0 +1,301 @@ +""" +Data structures and utilities for working with datagrams in OrcaPod. + +This module provides classes and functions for handling packet-like data structures +that can represent data in various formats (Python dicts, Arrow tables, etc.) while +maintaining type information, source metadata, and semantic type conversion capability. + +Key classes: +- SemanticConverter: Converts between different data representations. Intended for internal use. +- DictDatagram: Immutable dict-based data structure +- PythonDictPacket: Python dict-based packet with source info +- ArrowPacket: Arrow table-based packet implementation +- PythonDictTag/ArrowTag: Tag implementations for data identification + +The module also provides utilities for schema validation, table operations, +and type conversions between semantic stores, Python stores, and Arrow tables. 
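+
+Example (an illustrative sketch of the datagram interface described above, using
+the DictDatagram implementation from the sibling dict_datagram module; the field
+names shown are hypothetical):
+    >>> from orcapod.data.datagrams.dict_datagram import DictDatagram
+    >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"})
+    >>> renamed = datagram.rename({"name": "full_name"})
+    >>> "full_name" in renamed.keys()
+    True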
+""" + +import logging +from abc import ABC, abstractmethod +from collections.abc import Collection, Iterator, Mapping +from typing import Any, Self, TypeAlias + +import pyarrow as pa + +from orcapod.data.context import ( + DataContext, +) +from orcapod.types import TypeSpec +from orcapod.types.core import DataValue + +logger = logging.getLogger(__name__) + +# A conveniece packet-like type that defines a value that can be +# converted to a packet. It's broader than Packet and a simple mapping +# from string keys to DataValue (e.g., int, float, str) can be regarded +# as PacketLike, allowing for more flexible interfaces. +# Anything that requires Packet-like data but without the strict features +# of a Packet should accept PacketLike. +# One should be careful when using PacketLike as a return type as it does not +# enforce the typespec or source_info, which are important for packet integrity. +PacketLike: TypeAlias = Mapping[str, DataValue] + +PythonStore: TypeAlias = Mapping[str, DataValue] + + +class ImmutableDict(Mapping[str, DataValue]): + """ + An immutable dictionary-like container for DataValues. + + Provides a read-only view of a dictionary mapping strings to DataValues, + implementing the Mapping protocol for compatibility with dict-like operations. + + Initialize with data from a mapping. + Args: + data: Source mapping to copy data from + """ + + def __init__(self, data: Mapping[str, DataValue]): + self._data = dict(data) + + def __getitem__(self, key: str) -> DataValue: + return self._data[key] + + def __iter__(self): + return iter(self._data) + + def __len__(self) -> int: + return len(self._data) + + def __repr__(self) -> str: + return self._data.__repr__() + + def __str__(self) -> str: + return self._data.__str__() + + def __or__(self, other: Mapping[str, DataValue]) -> Self: + """ + Create a new ImmutableDict by merging with another mapping. + + Args: + other: Another mapping to merge with + + Returns: + A new ImmutableDict containing the combined data + """ + return self.__class__(self._data | dict(other)) + + +def contains_prefix_from(column: str, prefixes: Collection[str]) -> bool: + """ + Check if a column name matches any of the given prefixes. + + Args: + column: Column name to check + prefixes: Collection of prefixes to match against + + Returns: + True if the column starts with any of the prefixes, False otherwise + """ + for prefix in prefixes: + if column.startswith(prefix): + return True + return False + + +class BaseDatagram(ABC): + """ + Abstract base class for immutable datagram implementations. + + Provides shared functionality and enforces consistent interface across + different storage backends (dict, Arrow table, etc.). Concrete subclasses + must implement the abstract methods to handle their specific storage format. + + The base class only manages the data context key string - how that key + is interpreted and used is left to concrete implementations. + """ + + def __init__(self, data_context: DataContext | str | None = None) -> None: + """ + Initialize base datagram with data context. + + Args: + data_context: Context for semantic interpretation. Can be a string key + or a DataContext object, or None for default. + """ + self._data_context = DataContext.resolve_data_context(data_context) + + # 1. 
Core Properties (Identity & Structure) + @property + def data_context_key(self) -> str: + """Return the data context key.""" + return self._data_context.context_key + + @property + @abstractmethod + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + ... + + # 2. Dict-like Interface (Data Access) + @abstractmethod + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + ... + + @abstractmethod + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + ... + + @abstractmethod + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + ... + + @abstractmethod + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + ... + + # 3. Structural Information + @abstractmethod + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + ... + + @abstractmethod + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> TypeSpec: + """Return type specification for the datagram.""" + ... + + @abstractmethod + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """Return the PyArrow schema for this datagram.""" + ... + + @abstractmethod + def content_hash(self) -> str: + """Calculate and return content hash of the datagram.""" + ... + + # 4. Format Conversions (Export) + @abstractmethod + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" + ... + + @abstractmethod + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """Convert the datagram to an Arrow table.""" + ... + + # 5. Meta Column Operations + @abstractmethod + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """Get a meta column value.""" + ... + + @abstractmethod + def with_meta_columns(self, **updates: DataValue) -> Self: + """Create new datagram with updated meta columns.""" + ... + + @abstractmethod + def drop_meta_columns(self, *keys: str) -> Self: + """Create new datagram with specified meta columns removed.""" + ... + + # 6. Data Column Operations + @abstractmethod + def select(self, *column_names: str) -> Self: + """Create new datagram with only specified data columns.""" + ... + + @abstractmethod + def drop(self, *column_names: str) -> Self: + """Create new datagram with specified data columns removed.""" + ... + + @abstractmethod + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """Create new datagram with data columns renamed.""" + ... + + @abstractmethod + def update(self, **updates: DataValue) -> Self: + """Create new datagram with existing column values updated.""" + ... + + @abstractmethod + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """Create new datagram with additional data columns.""" + ... + + # 7. 
Context Operations + @abstractmethod + def with_context_key(self, new_context_key: str) -> Self: + """Create new datagram with different data context.""" + ... + + # 8. Utility Operations + @abstractmethod + def copy(self) -> Self: + """Create a shallow copy of the datagram.""" + ... + + @abstractmethod + def _core_info(self) -> dict[str, Any]: + """ + Return core information about the datagram. + This is meant to be used for internal purposes only and is not part of the public API. + It provides necessary information to create an efficient copy of the datagram + and in a manner that works across inheritance hierarchies. + + Returns: + Dictionary with all information necessary to recreate the datagram in a copy. + """ + ... + + @abstractmethod + def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: + """ + Create a new datagram instance from core information. + + Args: + core_info: Dictionary with core information about the datagram + + Returns: + New datagram instance + """ + ... diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py new file mode 100644 index 0000000..5ebd926 --- /dev/null +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -0,0 +1,835 @@ +import logging +from collections.abc import Collection, Iterator, Mapping +from typing import Self, cast + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.data.datagrams.base import BaseDatagram +from orcapod.types import TypeSpec, schemas +from orcapod.types import typespec_utils as tsutils +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +logger = logging.getLogger(__name__) + + +class DictDatagram(BaseDatagram): + """ + Immutable datagram implementation using dictionary as storage backend. + + This implementation uses composition (not inheritance from Mapping) to maintain + control over the interface while leveraging dictionary efficiency for data access. + Provides clean separation between data, meta, and context components. + + The underlying data is split into separate components: + - Data dict: Primary business data columns + - Meta dict: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> data = {{ + ... "user_id": 123, + ... "name": "Alice", + ... "__pipeline_version": "v2.1.0", + ... "{orcapod.CONTEXT_KEY}": "financial_v1" + ... }} + >>> datagram = DictDatagram(data) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + data: Mapping[str, DataValue], + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize DictDatagram from dictionary data. + + Args: + data: Source data mapping containing all column data. + typespec: Optional type specification for fields. + semantic_converter: Optional converter for semantic type handling. 
+ If None, will be created based on data context and inferred types. + data_context: Data context for semantic type resolution. + If None and data contains context column, will extract from data. + + Note: + The input data is automatically split into data, meta, and context + components based on column naming conventions. + """ + # Parse through data and extract different column types + data_columns = {} + meta_columns = {} + extracted_context = None + + for k, v in data.items(): + if k == constants.CONTEXT_KEY: + # Extract data context but keep it separate from meta data + if data_context is None: + extracted_context = v + # Don't store context in meta_data - it's managed separately + elif k.startswith(constants.META_PREFIX): + # Double underscore = meta metadata + meta_columns[k] = v + else: + # Everything else = user data (including _source_ and semantic types) + data_columns[k] = v + + # Initialize base class with data context + final_context = data_context or cast(str, extracted_context) + super().__init__(final_context) + + # Store data and meta components separately (immutable) + self._data = dict(data_columns) + self._meta_data = dict(meta_columns) + + # Combine provided typespec info with inferred typespec from content + # If the column value is None and no type spec is provided, defaults to str. + self._data_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._data, + typespec, + ) + ) + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + self._data_python_schema.to_semantic_schema( + semantic_type_registry=self._data_context.semantic_type_registry + ), + ) + self.semantic_converter = semantic_converter + + # Create schema for meta data + self._meta_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._meta_data, + typespec=typespec, + ) + ) + + # Initialize caches + self._cached_data_table: pa.Table | None = None + self._cached_meta_table: pa.Table | None = None + self._cached_content_hash: str | None = None + self._cached_data_arrow_schema: pa.Schema | None = None + self._cached_meta_arrow_schema: pa.Schema | None = None + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + return tuple(self._meta_data.keys()) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data: + raise KeyError(f"Data column '{key}' not found") + return self._data[key] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + return self._data.get(key, default) + + # 3. 
Structural Information + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + # Start with data columns + result_keys = list(self._data.keys()) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. + - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Start with data schema + schema = dict(self._data_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + schema.update(self._meta_python_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in self._meta_python_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Build data schema (cached) + if self._cached_data_arrow_schema is None: + self._cached_data_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._data_python_schema + ) + ) + + all_schemas = [self._cached_data_arrow_schema] + + # Add context schema if requested + if include_context: + context_schema = pa.schema([pa.field(constants.CONTEXT_KEY, pa.string())]) + all_schemas.append(context_schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + + if include_meta_columns is True: + meta_schema = self._cached_meta_arrow_schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._cached_meta_arrow_schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_meta_columns=False, include_context=False), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. Format Conversions (Export) + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + include_context = include_all_info or include_context + include_meta_columns = include_all_info or include_meta_columns + + result_dict = dict(self._data) # Start with user data + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta columns if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + # Include all meta columns + result_dict.update(self._meta_data) + elif isinstance(include_meta_columns, Collection): + # Include only meta columns matching prefixes + filtered_meta_data = { + k: v + for k, v in self._meta_data.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + result_dict.update(filtered_meta_data) + + return result_dict + + def _get_meta_arrow_table(self) -> pa.Table: + if self._cached_meta_table is None: + arrow_schema = self._get_meta_arrow_schema() + self._cached_meta_table = pa.Table.from_pylist( + [self._meta_data], + schema=arrow_schema, + ) + assert self._cached_meta_table is not None, ( + "Meta Arrow table should be initialized by now" + ) + return self._cached_meta_table + + def _get_meta_arrow_schema(self) -> pa.Schema: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + return self._cached_meta_arrow_schema + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + include_context = include_all_info or include_context + include_meta_columns = include_all_info or include_meta_columns + + # Build data table (cached) + if self._cached_data_table is None: + self._cached_data_table = self.semantic_converter.from_python_to_arrow( + self._data, + self._data_python_schema, + ) + assert self._cached_data_table is not None, ( + "Data Arrow table should be initialized by now" + ) + result_table = self._cached_data_table + + # Add context if requested + if include_context: + result_table = result_table.append_column( + constants.CONTEXT_KEY, + pa.array([self._data_context.context_key], type=pa.large_string()), + ) + + # Add meta columns if requested + meta_table = None + if include_meta_columns and self._meta_data: + meta_table = self._get_meta_arrow_table() + # Select appropriate meta columns + if isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + matched_cols = [ + col + for col in self._meta_data.keys() + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + if matched_cols: + meta_table = meta_table.select(matched_cols) + else: + meta_table = None + + # Combine tables if we have meta columns to add + if meta_table is not None: + result_table = arrow_utils.hstack_tables(result_table, meta_table) + + return result_table + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get meta column value with optional default. + + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). + default: Value to return if meta column doesn't exist. + + Returns: + Meta column value if exists, otherwise the default value. + """ + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + return self._meta_data.get(key, default) + + def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": + """ + Create a new DictDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New DictDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + new_meta_data = dict(self._meta_data) + new_meta_data.update(prefixed_updates) + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns( + self, *keys: str, ignore_missing: bool = False + ) -> "DictDatagram": + """ + Create a new DictDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. + + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + ignore_missing: If True, ignore missing meta columns without raising an error. 
+ + Raises: + KeyError: If any specified meta column to drop doesn't exist and ignore_missing=False. + + Returns: + New DictDatagram instance without specified meta columns + """ + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_data.keys()) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter out specified meta columns + new_meta_data = { + k: v for k, v in self._meta_data.items() if k not in prefixed_keys + } + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Filtered meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> "DictDatagram": + """ + Create a new DictDatagram with only specified data columns. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to keep + + Returns: + New DictDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data.keys()) + if missing_cols: + raise KeyError(f"Columns not found: {missing_cols}") + + # Keep only specified data columns + new_data = {k: v for k, v in self._data.items() if k in column_names} + + # Reconstruct full data dict for new instance + full_data = new_data # Selected user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagram": + """ + Create a new DictDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New DictDatagram instance without specified data columns + """ + # Filter out specified data columns + missing = set(column_names) - set(self._data.keys()) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + new_data = {k: v for k, v in self._data.items() if k not in column_names} + + if not new_data: + raise ValueError("Cannot drop all data columns") + + # Reconstruct full data dict for new instance + full_data = new_data # Filtered user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": + """ + Create a new DictDatagram with data columns renamed. + Maintains immutability by returning a new instance. 
+
+        Args:
+            column_mapping: Mapping from old column names to new column names
+
+        Returns:
+            New DictDatagram instance with renamed data columns
+        """
+        # Rename data columns according to mapping, preserving original types
+        new_data = {}
+        for old_name, value in self._data.items():
+            new_name = column_mapping.get(old_name, old_name)
+            new_data[new_name] = value
+
+        # Handle typespec updates for renamed columns
+        new_typespec = None
+        if self._data_python_schema:
+            existing_typespec = dict(self._data_python_schema)
+
+            # Rename types according to column mapping
+            renamed_typespec = {}
+            for old_name, old_type in existing_typespec.items():
+                new_name = column_mapping.get(old_name, old_name)
+                renamed_typespec[new_name] = old_type
+
+            new_typespec = renamed_typespec
+
+        # Reconstruct full data dict for new instance
+        full_data = new_data  # Renamed user data
+        full_data.update(self._meta_data)  # Keep existing meta data
+
+        return DictDatagram(
+            data=full_data,
+            typespec=new_typespec,
+            semantic_converter=self.semantic_converter,
+            data_context=self._data_context,
+        )
+
+    def update(self, **updates: DataValue) -> "DictDatagram":
+        """
+        Create a new DictDatagram with existing column values updated.
+        Maintains immutability by returning a new instance.
+
+        Args:
+            **updates: Column names and their new values (columns must exist)
+
+        Returns:
+            New DictDatagram instance with updated values
+
+        Raises:
+            KeyError: If any column doesn't exist (use with_columns() to add new columns)
+        """
+        if not updates:
+            return self
+
+        # Error if any column doesn't exist
+        missing_columns = set(updates.keys()) - set(self._data.keys())
+        if missing_columns:
+            raise KeyError(
+                f"Columns not found: {sorted(missing_columns)}. "
+                f"Use with_columns() to add new columns."
+            )
+
+        # Update existing columns
+        new_data = dict(self._data)
+        new_data.update(updates)
+
+        # Reconstruct full data dict for new instance
+        full_data = new_data  # Updated user data
+        full_data.update(self._meta_data)  # Keep existing meta data
+
+        return DictDatagram(
+            data=full_data,
+            semantic_converter=self.semantic_converter,  # Keep existing converter
+            data_context=self._data_context,
+        )
+
+    def with_columns(
+        self,
+        column_types: Mapping[str, type] | None = None,
+        **updates: DataValue,
+    ) -> "DictDatagram":
+        """
+        Create a new DictDatagram with new data columns added.
+        Maintains immutability by returning a new instance.
+
+        Args:
+            column_types: Optional type specifications for the new columns
+            **updates: New data columns as keyword arguments
+
+        Returns:
+            New DictDatagram instance with new data columns added
+
+        Raises:
+            ValueError: If any column already exists (use update() instead)
+        """
+        if not updates:
+            return self
+
+        # Error if any column already exists
+        existing_overlaps = set(updates.keys()) & set(self._data.keys())
+        if existing_overlaps:
+            raise ValueError(
+                f"Columns already exist: {sorted(existing_overlaps)}. "
+                f"Use update() to modify existing columns."
+ ) + + # Update user data with new columns + new_data = dict(self._data) + new_data.update(updates) + + # Create updated typespec - handle None values by defaulting to str + typespec = self.types() + if column_types is not None: + typespec.update(column_types) + + new_typespec = tsutils.get_typespec_from_dict( + new_data, + typespec=typespec, + ) + + # Reconstruct full data dict for new instance + full_data = new_data # Updated user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + typespec=new_typespec, + # semantic converter needs to be rebuilt for new columns + data_context=self._data_context, + ) + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> "DictDatagram": + """ + Create a new DictDatagram with a different data context key. + Maintains immutability by returning a new instance. + + Args: + new_context_key: New data context key string + + Returns: + New DictDatagram instance with new context + """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + return DictDatagram( + data=full_data, + data_context=new_context_key, # New context + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """ + Create a shallow copy of the datagram. + + Returns a new datagram instance with the same data and cached values. + This is more efficient than reconstructing from scratch when you need + an identical datagram instance. + + Returns: + New DictDatagram instance with copied data and caches. + """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + new_datagram = self.__class__( + full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_data_table = self._cached_data_table + new_datagram._cached_meta_table = self._cached_meta_table + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema + new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + """ + return str(self._data) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. 
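+
+        Example (illustrative only; the data values and context key shown here
+        are hypothetical):
+            DictDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=1, context='example_context')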
+        """
+        meta_count = len(self.meta_columns)
+        context_key = self.data_context_key
+
+        return (
+            f"{self.__class__.__name__}("
+            f"data={self._data}, "
+            f"meta_columns={meta_count}, "
+            f"context='{context_key}'"
+            f")"
+        )
diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py
new file mode 100644
index 0000000..92bf6aa
--- /dev/null
+++ b/src/orcapod/data/datagrams/dict_tag_packet.py
@@ -0,0 +1,256 @@
+import logging
+from collections.abc import Collection, Mapping
+from typing import Self
+
+import pyarrow as pa
+
+from orcapod.data.system_constants import orcapod_constants as constants
+from orcapod.data.context import DataContext
+from orcapod.data.datagrams.dict_datagram import DictDatagram
+from orcapod.types import TypeSpec, schemas
+from orcapod.types.core import DataValue
+from orcapod.types.semantic_converter import SemanticConverter
+from orcapod.utils import arrow_utils
+
+logger = logging.getLogger(__name__)
+
+
+class DictTag(DictDatagram):
+    """
+    A simple tag implementation using Python dictionary.
+
+    Represents a tag (metadata) as a dictionary that can be converted
+    to different representations like Arrow tables.
+    """
+
+
+class DictPacket(DictDatagram):
+    """
+    Enhanced packet implementation with source information support.
+
+    Extends DictDatagram to include source information tracking and
+    enhanced table conversion capabilities that can include or exclude
+    source metadata.
+
+    Initialize packet with data and optional source information.
+
+    Args:
+        data: Primary data content
+        source_info: Optional mapping of field names to source information
+        typespec: Optional type specification
+        semantic_converter: Optional semantic converter
+        data_context: Optional data context for semantic type resolution
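+
+    Example (an illustrative sketch; the field names and source path used below
+    are hypothetical):
+        >>> packet = DictPacket(
+        ...     {"user_id": 123, "name": "Alice"},
+        ...     source_info={"user_id": "users.csv", "name": "users.csv"},
+        ... )
+        >>> packet.source_info()["user_id"]
+        'users.csv'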
+ """ + + def __init__( + self, + data: Mapping[str, DataValue], + source_info: Mapping[str, str | None] | None = None, + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + # normalize the data content and remove any source info keys + data_only = { + k: v for k, v in data.items() if not k.startswith(constants.SOURCE_PREFIX) + } + contained_source_info = { + k.removeprefix(constants.SOURCE_PREFIX): v + for k, v in data.items() + if k.startswith(constants.SOURCE_PREFIX) + } + + super().__init__( + data_only, + typespec=typespec, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._source_info = {**contained_source_info, **(source_info or {})} + self._cached_source_info_table: pa.Table | None = None + self._cached_source_info_schema: pa.Schema | None = None + + @property + def _source_info_schema(self) -> pa.Schema: + if self._cached_source_info_schema is None: + self._cached_source_info_schema = pa.schema( + { + f"{constants.SOURCE_PREFIX}{k}": pa.large_string() + for k in self.keys() + } + ) + return self._cached_source_info_schema + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + table = super().as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + if self._cached_source_info_table is None: + source_info_data = { + f"{constants.SOURCE_PREFIX}{k}": v + for k, v in self.source_info().items() + } + self._cached_source_info_table = pa.Table.from_pylist( + [source_info_data], schema=self._source_info_schema + ) + assert self._cached_source_info_table is not None, ( + "Cached source info table should not be None" + ) + # subselect the corresponding _source_info as the columns present in the data table + source_info_table = self._cached_source_info_table.select( + [ + f"{constants.SOURCE_PREFIX}{k}" + for k in table.column_names + if k in self.keys() + ] + ) + table = arrow_utils.hstack_tables(table, source_info_table) + return table + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + dict_copy = super().as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + for key, value in self.source_info().items(): + dict_copy[f"{constants.SOURCE_PREFIX}{key}"] = value + return dict_copy + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> tuple[str, ...]: + """Return keys of the Python schema.""" + keys = super().keys( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + keys += tuple(f"{constants.SOURCE_PREFIX}{key}" for key in super().keys()) + return keys + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) + return schema + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> DictDatagram: + """ + Convert the packet to a DictDatagram. + + Args: + include_source: Whether to include source info fields + + Returns: + DictDatagram representation of the packet + """ + + data = self.as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_source=include_source, + ) + typespec = self.types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_source=include_source, + ) + return DictDatagram( + data, + typespec=typespec, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. 
+ + Returns: + Dictionary mapping field names to their source info + """ + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> Self: + """Return a shallow copy of the packet.""" + instance = super().copy() + instance._source_info = self._source_info.copy() + instance._cached_source_info_table = self._cached_source_info_table + return instance diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 09cf09f..58a920f 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -48,6 +48,11 @@ def __init__( def data_context(self) -> DataContext: return self._data_context + @property + def data_context_key(self) -> str: + """Return the data context key.""" + return self._data_context.context_key + @property @abstractmethod def kernel_id(self) -> tuple[str, ...]: ... diff --git a/src/orcapod/data/old_datagrams.py b/src/orcapod/data/old_datagrams.py new file mode 100644 index 0000000..a0386c8 --- /dev/null +++ b/src/orcapod/data/old_datagrams.py @@ -0,0 +1,2281 @@ +""" +Data structures and utilities for working with datagrams in OrcaPod. + +This module provides classes and functions for handling packet-like data structures +that can represent data in various formats (Python dicts, Arrow tables, etc.) while +maintaining type information, source metadata, and semantic type conversion capability. + +Key classes: +- SemanticConverter: Converts between different data representations. Intended for internal use. +- DictDatagram: Immutable dict-based data structure +- PythonDictPacket: Python dict-based packet with source info +- ArrowPacket: Arrow table-based packet implementation +- PythonDictTag/ArrowTag: Tag implementations for data identification + +The module also provides utilities for schema validation, table operations, +and type conversions between semantic stores, Python stores, and Arrow tables. +""" + +from hmac import new +import logging +from abc import ABC, abstractmethod +from collections.abc import Collection, Iterator, Mapping +from types import new_class +from typing import Self, TypeAlias, cast + +from matplotlib.pyplot import arrow +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.protocols import data_protocols as dp +from orcapod.protocols import hashing_protocols as hp +from orcapod.types import TypeSpec, schemas, typespec_utils +from orcapod.types import typespec_utils as tsutils +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +logger = logging.getLogger(__name__) + +# A conveniece packet-like type that defines a value that can be +# converted to a packet. It's broader than Packet and a simple mapping +# from string keys to DataValue (e.g., int, float, str) can be regarded +# as PacketLike, allowing for more flexible interfaces. +# Anything that requires Packet-like data but without the strict features +# of a Packet should accept PacketLike. +# One should be careful when using PacketLike as a return type as it does not +# enforce the typespec or source_info, which are important for packet integrity. +PacketLike: TypeAlias = Mapping[str, DataValue] + +PythonStore: TypeAlias = Mapping[str, DataValue] + + +class ImmutableDict(Mapping[str, DataValue]): + """ + An immutable dictionary-like container for DataValues. 
+ + Provides a read-only view of a dictionary mapping strings to DataValues, + implementing the Mapping protocol for compatibility with dict-like operations. + + Initialize with data from a mapping. + Args: + data: Source mapping to copy data from + """ + + def __init__(self, data: Mapping[str, DataValue]): + self._data = dict(data) + + def __getitem__(self, key: str) -> DataValue: + return self._data[key] + + def __iter__(self): + return iter(self._data) + + def __len__(self) -> int: + return len(self._data) + + def __repr__(self) -> str: + return self._data.__repr__() + + def __str__(self) -> str: + return self._data.__str__() + + def __or__(self, other: Mapping[str, DataValue]) -> Self: + """ + Create a new ImmutableDict by merging with another mapping. + + Args: + other: Another mapping to merge with + + Returns: + A new ImmutableDict containing the combined data + """ + return self.__class__(self._data | dict(other)) + + +def contains_prefix_from(column: str, prefixes: Collection[str]) -> bool: + """ + Check if a column name matches any of the given prefixes. + + Args: + column: Column name to check + prefixes: Collection of prefixes to match against + + Returns: + True if the column starts with any of the prefixes, False otherwise + """ + for prefix in prefixes: + if column.startswith(prefix): + return True + return False + + +class BaseDatagram(ABC): + """ + Abstract base class for immutable datagram implementations. + + Provides shared functionality and enforces consistent interface across + different storage backends (dict, Arrow table, etc.). Concrete subclasses + must implement the abstract methods to handle their specific storage format. + + The base class only manages the data context key string - how that key + is interpreted and used is left to concrete implementations. + """ + + def __init__(self, data_context: DataContext | str | None = None) -> None: + """ + Initialize base datagram with data context. + + Args: + data_context: Context for semantic interpretation. Can be a string key + or a DataContext object, or None for default. + """ + self._data_context = DataContext.resolve_data_context(data_context) + + # 1. Core Properties (Identity & Structure) + @property + def data_context_key(self) -> str: + """Return the data context key.""" + return self._data_context.context_key + + @property + @abstractmethod + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + ... + + # 2. Dict-like Interface (Data Access) + @abstractmethod + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + ... + + @abstractmethod + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + ... + + @abstractmethod + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + ... + + @abstractmethod + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + ... + + # 3. Structural Information + @abstractmethod + def keys( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + ... + + @abstractmethod + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> TypeSpec: + """Return type specification for the datagram.""" + ... 
+ + @abstractmethod + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """Return the PyArrow schema for this datagram.""" + ... + + @abstractmethod + def content_hash(self) -> str: + """Calculate and return content hash of the datagram.""" + ... + + # 4. Format Conversions (Export) + @abstractmethod + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" + ... + + @abstractmethod + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """Convert the datagram to an Arrow table.""" + ... + + # 5. Meta Column Operations + @abstractmethod + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """Get a meta column value.""" + ... + + @abstractmethod + def with_meta_columns(self, **updates: DataValue) -> Self: + """Create new datagram with updated meta columns.""" + ... + + @abstractmethod + def drop_meta_columns(self, *keys: str) -> Self: + """Create new datagram with specified meta columns removed.""" + ... + + # 6. Data Column Operations + @abstractmethod + def select(self, *column_names: str) -> Self: + """Create new datagram with only specified data columns.""" + ... + + @abstractmethod + def drop(self, *column_names: str) -> Self: + """Create new datagram with specified data columns removed.""" + ... + + @abstractmethod + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """Create new datagram with data columns renamed.""" + ... + + @abstractmethod + def update(self, **updates: DataValue) -> Self: + """Create new datagram with existing column values updated.""" + ... + + @abstractmethod + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """Create new datagram with additional data columns.""" + ... + + # 7. Context Operations + @abstractmethod + def with_context_key(self, new_context_key: str) -> Self: + """Create new datagram with different data context.""" + ... + + # 8. Utility Operations + @abstractmethod + def copy(self) -> Self: + """Create a shallow copy of the datagram.""" + ... + + +class DictDatagram(BaseDatagram): + """ + Immutable datagram implementation using dictionary as storage backend. + + This implementation uses composition (not inheritance from Mapping) to maintain + control over the interface while leveraging dictionary efficiency for data access. + Provides clean separation between data, meta, and context components. + + The underlying data is split into separate components: + - Data dict: Primary business data columns + - Meta dict: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> data = {{ + ... "user_id": 123, + ... "name": "Alice", + ... "__pipeline_version": "v2.1.0", + ... "{orcapod.CONTEXT_KEY}": "financial_v1" + ... 
}} + >>> datagram = DictDatagram(data) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + data: Mapping[str, DataValue], + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize DictDatagram from dictionary data. + + Args: + data: Source data mapping containing all column data. + typespec: Optional type specification for fields. + semantic_converter: Optional converter for semantic type handling. + If None, will be created based on data context and inferred types. + data_context: Data context for semantic type resolution. + If None and data contains context column, will extract from data. + + Note: + The input data is automatically split into data, meta, and context + components based on column naming conventions. + """ + # Parse through data and extract different column types + data_columns = {} + meta_columns = {} + extracted_context = None + + for k, v in data.items(): + if k == constants.CONTEXT_KEY: + # Extract data context but keep it separate from meta data + if data_context is None: + extracted_context = v + # Don't store context in meta_data - it's managed separately + elif k.startswith(constants.META_PREFIX): + # Double underscore = meta metadata + meta_columns[k] = v + else: + # Everything else = user data (including _source_ and semantic types) + data_columns[k] = v + + # Initialize base class with data context + final_context = data_context or cast(str, extracted_context) + super().__init__(final_context) + + # Store data and meta components separately (immutable) + self._data = dict(data_columns) + self._meta_data = dict(meta_columns) + + # Combine provided typespec info with inferred typespec from content + # If the column value is None and no type spec is provided, defaults to str. + self._data_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._data, + typespec, + ) + ) + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + self._data_python_schema.to_semantic_schema( + semantic_type_registry=self._data_context.semantic_type_registry + ), + ) + self.semantic_converter = semantic_converter + + # Create schema for meta data + self._meta_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._meta_data, + typespec=typespec, + ) + ) + + # Initialize caches + self._cached_data_table: pa.Table | None = None + self._cached_meta_table: pa.Table | None = None + self._cached_content_hash: str | None = None + self._cached_data_arrow_schema: pa.Schema | None = None + self._cached_meta_arrow_schema: pa.Schema | None = None + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + return tuple(self._meta_data.keys()) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data: + raise KeyError(f"Data column '{key}' not found") + return self._data[key] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + return self._data.get(key, default) + + # 3. 
Structural Information + def keys( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + # Start with data columns + result_keys = list(self._data.keys()) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. + - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + # Start with data schema + schema = dict(self._data_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + schema.update(self._meta_python_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in self._meta_python_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + # Build data schema (cached) + if self._cached_data_arrow_schema is None: + self._cached_data_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._data_python_schema + ) + ) + + all_schemas = [self._cached_data_arrow_schema] + + # Add context schema if requested + if include_context: + context_schema = pa.schema([pa.field(constants.CONTEXT_KEY, pa.string())]) + all_schemas.append(context_schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + + if include_meta_columns is True: + meta_schema = self._cached_meta_arrow_schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._cached_meta_arrow_schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_meta_columns=False, include_context=False), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. Format Conversions (Export) + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + result_dict = dict(self._data) # Start with user data + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta columns if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + # Include all meta columns + result_dict.update(self._meta_data) + elif isinstance(include_meta_columns, Collection): + # Include only meta columns matching prefixes + filtered_meta_data = { + k: v + for k, v in self._meta_data.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + result_dict.update(filtered_meta_data) + + return result_dict + + def _get_meta_arrow_table(self) -> pa.Table: + if self._cached_meta_table is None: + arrow_schema = self._get_meta_arrow_schema() + self._cached_meta_table = pa.Table.from_pylist( + [self._meta_data], + schema=arrow_schema, + ) + assert self._cached_meta_table is not None, ( + "Meta Arrow table should be initialized by now" + ) + return self._cached_meta_table + + def _get_meta_arrow_schema(self) -> pa.Schema: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + return self._cached_meta_arrow_schema + + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + # Build data table (cached) + if self._cached_data_table is None: + self._cached_data_table = self.semantic_converter.from_python_to_arrow( + self._data, + self._data_python_schema, + ) + assert self._cached_data_table is not None, ( + "Data Arrow table should be initialized by now" + ) + result_table = self._cached_data_table + + # Add context if requested + if include_context: + result_table = result_table.append_column( + constants.CONTEXT_KEY, + pa.array([self._data_context.context_key], type=pa.large_string()), + ) + + # Add meta columns if requested + meta_table = None + if include_meta_columns and self._meta_data: + meta_table = self._get_meta_arrow_table() + # Select appropriate meta columns + if isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + matched_cols = [ + col + for col in self._meta_data.keys() + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + if matched_cols: + meta_table = meta_table.select(matched_cols) + else: + meta_table = None + + # Combine tables if we have meta columns to add + if meta_table is not None: + result_table = arrow_utils.hstack_tables(result_table, meta_table) + + return result_table + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get meta column value with optional default. 
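+
+        For illustration (a minimal sketch with hypothetical values; the '__'
+        prefix may be given or omitted):
+
+            >>> dg = DictDatagram({"user_id": 123, "__pipeline_version": "v2.1.0"})
+            >>> dg.get_meta_value("pipeline_version")
+            'v2.1.0'
+            >>> dg.get_meta_value("missing", default="n/a")
+            'n/a'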
+ + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). + default: Value to return if meta column doesn't exist. + + Returns: + Meta column value if exists, otherwise the default value. + """ + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + return self._meta_data.get(key, default) + + def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": + """ + Create a new DictDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New DictDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + new_meta_data = dict(self._meta_data) + new_meta_data.update(prefixed_updates) + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns( + self, *keys: str, ignore_missing: bool = False + ) -> "DictDatagram": + """ + Create a new DictDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. + + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + ignore_missing: If True, ignore missing meta columns without raising an error. + + Raises: + KeyError: If any specified meta column to drop doesn't exist and ignore_missing=False. + + Returns: + New DictDatagram instance without specified meta columns + """ + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_data.keys()) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter out specified meta columns + new_meta_data = { + k: v for k, v in self._meta_data.items() if k not in prefixed_keys + } + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Filtered meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> "DictDatagram": + """ + Create a new DictDatagram with only specified data columns. + Maintains immutability by returning a new instance. 
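+
+        For illustration (a minimal sketch with hypothetical columns):
+
+            >>> dg = DictDatagram({"user_id": 123, "name": "Alice"})
+            >>> dg.select("user_id").keys()
+            ('user_id',)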
+ + Args: + *column_names: Data column names to keep + + Returns: + New DictDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data.keys()) + if missing_cols: + raise KeyError(f"Columns not found: {missing_cols}") + + # Keep only specified data columns + new_data = {k: v for k, v in self._data.items() if k in column_names} + + # Reconstruct full data dict for new instance + full_data = new_data # Selected user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagram": + """ + Create a new DictDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New DictDatagram instance without specified data columns + """ + # Filter out specified data columns + missing = set(column_names) - set(self._data.keys()) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + new_data = {k: v for k, v in self._data.items() if k not in column_names} + + if not new_data: + raise ValueError("Cannot drop all data columns") + + # Reconstruct full data dict for new instance + full_data = new_data # Filtered user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": + """ + Create a new DictDatagram with data columns renamed. + Maintains immutability by returning a new instance. + + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New DictDatagram instance with renamed data columns + """ + # Rename data columns according to mapping, preserving original types + new_data = {} + for old_name, value in self._data.items(): + new_name = column_mapping.get(old_name, old_name) + new_data[new_name] = value + + # Handle typespec updates for renamed columns + new_typespec = None + if self._data_python_schema: + existing_typespec = dict(self._data_python_schema) + + # Rename types according to column mapping + renamed_typespec = {} + for old_name, old_type in existing_typespec.items(): + new_name = column_mapping.get(old_name, old_name) + renamed_typespec[new_name] = old_type + + new_typespec = renamed_typespec + + # Reconstruct full data dict for new instance + full_data = new_data # Renamed user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + typespec=new_typespec, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def update(self, **updates: DataValue) -> "DictDatagram": + """ + Create a new DictDatagram with existing column values updated. + Maintains immutability by returning a new instance. 
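+
+        For illustration (a minimal sketch with hypothetical values):
+
+            >>> dg = DictDatagram({"user_id": 123, "name": "Alice"})
+            >>> dg.update(name="Alice Smith")["name"]
+            'Alice Smith'
+            >>> dg["name"]  # the original datagram is unchanged
+            'Alice'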
+ + Args: + **updates: Column names and their new values (columns must exist) + + Returns: + New DictDatagram instance with updated values + + Raises: + KeyError: If any column doesn't exist (use with_columns() to add new columns) + """ + if not updates: + return self + + # Error if any column doesn't exist + missing_columns = set(updates.keys()) - set(self._data.keys()) + if missing_columns: + raise KeyError( + f"Columns not found: {sorted(missing_columns)}. " + f"Use with_columns() to add new columns." + ) + + # Update existing columns + new_data = dict(self._data) + new_data.update(updates) + + # Reconstruct full data dict for new instance + full_data = new_data # Updated user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, # Keep existing converter + data_context=self._data_context, + ) + + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> "DictDatagram": + """ + Create a new DictDatagram with new data columns added. + Maintains immutability by returning a new instance. + + Args: + column_updates: New data columns as a mapping + column_types: Optional type specifications for new columns + **kwargs: New data columns as keyword arguments + + Returns: + New DictDatagram instance with new data columns added + + Raises: + ValueError: If any column already exists (use update() instead) + """ + # Combine explicit updates with kwargs + + if not updates: + return self + + # Error if any column already exists + existing_overlaps = set(updates.keys()) & set(self._data.keys()) + if existing_overlaps: + raise ValueError( + f"Columns already exist: {sorted(existing_overlaps)}. " + f"Use update() to modify existing columns." + ) + + # Update user data with new columns + new_data = dict(self._data) + new_data.update(updates) + + # Create updated typespec - handle None values by defaulting to str + typespec = self.types() + if column_types is not None: + typespec.update(column_types) + + new_typespec = tsutils.get_typespec_from_dict( + new_data, + typespec=typespec, + ) + + # Reconstruct full data dict for new instance + full_data = new_data # Updated user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + typespec=new_typespec, + # semantic converter needs to be rebuilt for new columns + data_context=self._data_context, + ) + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> "DictDatagram": + """ + Create a new DictDatagram with a different data context key. + Maintains immutability by returning a new instance. + + Args: + new_context_key: New data context key string + + Returns: + New DictDatagram instance with new context + """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + return DictDatagram( + data=full_data, + data_context=new_context_key, # New context + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """ + Create a shallow copy of the datagram. + + Returns a new datagram instance with the same data and cached values. + This is more efficient than reconstructing from scratch when you need + an identical datagram instance. + + Returns: + New DictDatagram instance with copied data and caches. 
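+
+        Example (illustrative, with hypothetical values):
+
+            >>> dg = DictDatagram({"user_id": 123})
+            >>> dup = dg.copy()
+            >>> dup is dg
+            False
+            >>> dup.as_dict() == dg.as_dict()
+            True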
+ """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + new_datagram = self.__class__( + full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_data_table = self._cached_data_table + new_datagram._cached_meta_table = self._cached_meta_table + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema + new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + """ + return str(self._data) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. + """ + meta_count = len(self.meta_columns) + context_key = self.data_context_key + + return ( + f"DictDatagram(" + f"data={self._data}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) + + +class ArrowDatagram(BaseDatagram): + """ + Immutable datagram implementation using PyArrow Table as storage backend. + + This implementation provides high-performance columnar data operations while + maintaining the datagram interface. It efficiently handles type conversions, + semantic processing, and interoperability with Arrow-based tools. + + The underlying table is split into separate components: + - Data table: Primary business data columns + - Meta table: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context table: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> table = pa.Table.from_pydict({ + ... "user_id": [123], + ... "name": ["Alice"], + ... "__pipeline_version": ["v2.1.0"], + ... "{orcapod.CONTEXT_KEY}": ["financial_v1"] + ... }) + >>> datagram = ArrowDatagram(table) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize ArrowDatagram from PyArrow Table. + + Args: + table: PyArrow Table containing the data. Must have exactly one row. + semantic_converter: Optional converter for semantic type handling. + If None, will be created based on the data context and table schema. + data_context: Context key string or DataContext object. + If None and table contains context column, will extract from table. + + Raises: + ValueError: If table doesn't contain exactly one row. + + Note: + The input table is automatically split into data, meta, and context + components based on column naming conventions. 
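+
+        Example (illustrative; shows the single-row requirement):
+
+            >>> import pyarrow as pa
+            >>> ArrowDatagram(pa.table({"x": [1, 2]}))  # two rows -> rejected
+            Traceback (most recent call last):
+                ...
+            ValueError: Table must contain exactly one row to be a valid datagram.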
+ """ + # Validate table has exactly one row for datagram + if len(table) != 1: + raise ValueError( + "Table must contain exactly one row to be a valid datagram." + ) + + # Split table into data, meta, and context components + context_columns = [constants.CONTEXT_KEY] + meta_columns = [ + col for col in table.column_names if col.startswith(constants.META_PREFIX) + ] + + # Extract context table if present + if constants.CONTEXT_KEY in table.column_names and data_context is None: + context_table = table.select([constants.CONTEXT_KEY]) + data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] + + # Initialize base class with data context + super().__init__(data_context) + + # Split table into components + self._data_table = table.drop(context_columns + meta_columns) + self._meta_table = table.select(meta_columns) if meta_columns else None + if len(self._data_table.column_names) == 0: + raise ValueError("Data table must contain at least one data column.") + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + self._data_table.schema, + self._data_context.semantic_type_registry, + ) + ) + self._semantic_converter = semantic_converter + + # Create data context table + data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) + self._data_context_table = pa.Table.from_pylist( + [{constants.CONTEXT_KEY: self._data_context.context_key}], + schema=data_context_schema, + ) + + # Initialize caches + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_dict: dict[str, DataValue] | None = None + self._cached_meta_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + if self._meta_table is None: + return () + return tuple(self._meta_table.column_names) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data_table.column_names: + raise KeyError(f"Data column '{key}' not found") + + return self._data_table[key].to_pylist()[0] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data_table.column_names + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data_table.column_names) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + if key in self._data_table.column_names: + return self.as_dict()[key] + return default + + # 3. 
Structural Information + def keys( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + # Start with data columns + result_keys = list(self._data_table.column_names) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. + - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + # Get data schema (cached) + if self._cached_python_schema is None: + self._cached_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._data_table.schema + ) + ) + + schema = dict(self._cached_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if self._cached_meta_python_schema is None: + self._cached_meta_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._meta_table.schema + ) + ) + meta_schema = dict(self._cached_meta_python_schema) + if include_meta_columns is True: + schema.update(meta_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in meta_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + all_schemas = [self._data_table.schema] + + # Add context schema if requested + if include_context: + # TODO: reassess the efficiency of this approach + all_schemas.append(self._data_context_table.schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_schema = self._meta_table.schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._meta_table.schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self._data_table, + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. Format Conversions (Export) + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + # Get data dict (cached) + if self._cached_python_dict is None: + self._cached_python_dict = self._semantic_converter.from_arrow_to_python( + self._data_table + )[0] + + result_dict = dict(self._cached_python_dict) + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta data if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_dict = self._meta_table.to_pylist()[0] + elif isinstance(include_meta_columns, Collection): + meta_dict = self._meta_table.to_pylist()[0] + # Include only meta columns matching prefixes + meta_dict = { + k: v + for k, v in meta_dict.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + if meta_dict is not None: + result_dict.update(meta_dict) + + return result_dict + + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + all_tables = [self._data_table] + + # Add context if requested + if include_context: + all_tables.append(self._data_context_table) + + # Add meta columns if requested + if include_meta_columns and self._meta_table is not None: + meta_table = None + if include_meta_columns is True: + meta_table = self._meta_table + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + matched_cols = [ + col + for col in self._meta_table.column_names + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + if matched_cols: + meta_table = self._meta_table.select(matched_cols) + else: + meta_table = None + + if meta_table is not None: + all_tables.append(meta_table) + + return arrow_utils.hstack_tables(*all_tables) + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get a meta column value. + + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix) + default: Default value if not found + + Returns: + Meta column value + """ + if self._meta_table is None: + return default + + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + if key not in self._meta_table.column_names: + return default + + return self._meta_table[key].to_pylist()[0] + + def with_meta_columns(self, **meta_updates: DataValue) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New ArrowDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + meta_dict = {} + if self._meta_table is not None: + meta_dict = self._meta_table.to_pylist()[0] + + # Apply updates + meta_dict.update(prefixed_updates) + + # Create new meta table + new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None + + # Combine all tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns( + self, *keys: str, ignore_missing: bool = True + ) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. 
+ + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + + Returns: + New ArrowDatagram instance without specified meta columns + """ + if self._meta_table is None: + return self # No meta columns to drop + + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_table.column_names) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter meta columns + remaining_cols = [ + col for col in self._meta_table.column_names if col not in prefixed_keys + ] + + # Create new meta table + new_meta_table = ( + self._meta_table.select(remaining_cols) if remaining_cols else None + ) + + # Combine tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with only specified data columns. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to keep + + Returns: + New ArrowDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data_table.column_names) + if missing_cols: + raise ValueError(f"Columns not found: {missing_cols}") + + new_data_table = self._data_table.select(list(column_names)) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New ArrowDatagram instance without specified data columns + """ + + # Filter out specified data columns + missing = set(column_names) - set(self._data_table.column_names) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + # Filter data columns + remaining_cols = [ + col for col in self._data_table.column_names if col not in column_names + ] + + if not remaining_cols: + raise ValueError("Cannot drop all data columns") + + new_data_table = self._data_table.select(remaining_cols) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with data columns renamed. + Maintains immutability by returning a new instance. 
+ + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New ArrowDatagram instance with renamed data columns + """ + # Create new schema with renamed fields, preserving original types + new_fields = [] + for field in self._data_table.schema: + old_name = field.name + new_name = column_mapping.get(old_name, old_name) + new_field = pa.field(new_name, field.type) + new_fields.append(new_field) + + # Create new data table with renamed columns + new_schema = pa.schema(new_fields) + new_data_table = self._data_table.rename_columns( + [column_mapping.get(name, name) for name in self._data_table.column_names] + ).cast(new_schema) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def update(self, **updates: DataValue) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with specific column values updated. + + Args: + **updates: Column names and their new values + + Returns: + New ArrowDatagram instance with updated values + + Raises: + KeyError: If any specified column doesn't exist + + Example: + # Convert relative path to absolute path + updated = datagram.update(file_path="/absolute/path/to/file.txt") + + # Update multiple values + updated = datagram.update(status="processed", file_path="/new/path") + """ + # Only update if there are columns to update + if not updates: + return self + + # Validate all columns exist + missing_cols = set(updates.keys()) - set(self._data_table.column_names) + if missing_cols: + raise KeyError( + f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}" + ) + + updates_typespec = schemas.PythonSchema( + {k: v for k, v in self.types().items() if k in updates} + ) + + update_table = self._semantic_converter.from_python_to_arrow( + updates, updates_typespec + ) + all_tables = [self._data_table.drop(list(updates.keys())), update_table] + + if self._meta_table is not None: + all_tables.append(self._meta_table) + + return ArrowDatagram( + table=arrow_utils.hstack_tables(*all_tables), + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with new data columns added. + Maintains immutability by returning a new instance. + + Args: + column_updates: New data columns as a mapping + column_types: Optional type specifications for new columns + **kwargs: New data columns as keyword arguments + + Returns: + New ArrowDatagram instance with new data columns added + + Raises: + ValueError: If any column already exists (use update() instead) + """ + # Combine explicit updates with kwargs + + if not updates: + return self + + # Error if any column already exists + existing_overlaps = set(updates.keys()) & set(self._data_table.column_names) + if existing_overlaps: + raise ValueError( + f"Columns already exist: {sorted(existing_overlaps)}. " + f"Use update() to modify existing columns." 
+ ) + + # TODO: consider simplifying this conversion logic + typespec = typespec_utils.get_typespec_from_dict(updates, column_types) + + updates_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_typespec( + typespec, self._data_context.semantic_type_registry + ) + ) + # TODO: cleanup the handling of typespec python schema and various conversion points + new_data_table = updates_converter.from_python_to_arrow(updates, typespec) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + # prepare the joined converter + total_converter = self._semantic_converter.join(updates_converter) + + return ArrowDatagram( + table=combined_table, + semantic_converter=total_converter, + data_context=self._data_context, + ) + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with a different data context key. + Maintains immutability by returning a new instance. + + Args: + new_context_key: New data context key string + + Returns: + New ArrowDatagram instance with new context + """ + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + data_context=new_context_key, + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """Return a copy of the datagram.""" + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + new_datagram = self.__class__( + combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_content_hash = self._cached_content_hash + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + + Example: + >>> str(datagram) + "{'user_id': 123, 'name': 'Alice'}" + >>> print(datagram) + {'user_id': 123, 'name': 'Alice'} + """ + return str(self.as_dict()) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. + + Example: + >>> repr(datagram) + "ArrowDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')" + """ + data_dict = self.as_dict() + meta_count = len(self.meta_columns) + context_key = self.data_context_key + + return ( + f"ArrowDatagram(" + f"data={data_dict}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) + + +class DictTag(DictDatagram): + """ + A simple tag implementation using Python dictionary. 
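+
+    For illustration (a minimal sketch with hypothetical tag values):
+
+        >>> tag = DictTag({"subject_id": "s01", "session": 3})
+        >>> tag.as_dict()
+        {'subject_id': 's01', 'session': 3}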
+ + Represents a tag (metadata) as a dictionary that can be converted + to different representations like Arrow tables. + """ + + +class DictPacket(DictDatagram): + """ + Enhanced packet implementation with source information support. + + Extends DictDatagram to include source information tracking and + enhanced table conversion capabilities that can include or exclude + source metadata. + + Initialize packet with data and optional source information. + + Args: + data: Primary data content + source_info: Optional mapping of field names to source information + typespec: Optional type specification + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types. Defaults to system default registry. + arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. + """ + + def __init__( + self, + data: Mapping[str, DataValue], + source_info: Mapping[str, str | None] | None = None, + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + # normalize the data content and remove any source info keys + data_only = { + k: v for k, v in data.items() if not k.startswith(constants.SOURCE_PREFIX) + } + contained_source_info = { + k.removeprefix(constants.SOURCE_PREFIX): v + for k, v in data.items() + if k.startswith(constants.SOURCE_PREFIX) + } + + super().__init__( + data_only, + typespec=typespec, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._source_info = {**contained_source_info, **(source_info or {})} + self._cached_source_info_table: pa.Table | None = None + self._cached_source_info_schema: pa.Schema | None = None + + @property + def _source_info_schema(self) -> pa.Schema: + if self._cached_source_info_schema is None: + self._cached_source_info_schema = pa.schema( + { + f"{constants.SOURCE_PREFIX}{k}": pa.large_string() + for k in self.keys() + } + ) + return self._cached_source_info_schema + + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + table = super().as_table( + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_source: + if self._cached_source_info_table is None: + source_info_data = { + f"{constants.SOURCE_PREFIX}{k}": v + for k, v in self.source_info().items() + } + self._cached_source_info_table = pa.Table.from_pylist( + [source_info_data], schema=self._source_info_schema + ) + assert self._cached_source_info_table is not None, ( + "Cached source info table should not be None" + ) + # subselect the corresponding _source_info as the columns present in the data table + source_info_table = self._cached_source_info_table.select( + [ + f"{constants.SOURCE_PREFIX}{k}" + for k in table.column_names + if k in self.keys() + ] + ) + table = arrow_utils.hstack_tables(table, source_info_table) + return table + + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation. 
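+
+        For illustration (a minimal sketch with a hypothetical packet; source
+        columns use the '_source_' prefix):
+
+            >>> pkt = DictPacket({"sample_id": "s01"}, source_info={"sample_id": "loader:v1"})
+            >>> pkt.as_dict()
+            {'sample_id': 's01'}
+            >>> pkt.as_dict(include_source=True)
+            {'sample_id': 's01', '_source_sample_id': 'loader:v1'}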
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + dict_copy = super().as_dict( + include_meta_columns=include_meta_columns, include_context=include_context + ) + if include_source: + for key, value in self.source_info().items(): + dict_copy[f"{constants.SOURCE_PREFIX}{key}"] = value + return dict_copy + + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types( + include_meta_columns=include_meta_columns, include_context=include_context + ) + if include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_meta_columns=include_meta_columns, include_context=include_context + ) + if include_source: + return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) + return schema + + def as_datagram( + self, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> DictDatagram: + """ + Convert the packet to a DictDatagram. + + Args: + include_source: Whether to include source info fields + + Returns: + DictDatagram representation of the packet + """ + data = self.as_dict( + include_meta_columns=include_meta_columns, include_source=include_source + ) + typespec = self.types(include_source=include_source) + return DictDatagram( + data, + typespec=typespec, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Dictionary mapping field names to their source info + """ + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> Self: + """Return a shallow copy of the packet.""" + instance = super().copy() + instance._source_info = self._source_info.copy() + instance._cached_source_info_table = self._cached_source_info_table + return instance + + +class ArrowTag(ArrowDatagram): + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. + + Args: + table: Single-row Arrow table representing the tag + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowTag should only contain a single row, " + "as it represents a single tag." + ) + super().__init__( + table=table, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + +class ArrowPacket(ArrowDatagram): + """ + Arrow table-based packet implementation with comprehensive features. 
+ + A packet implementation that uses Arrow tables as the primary storage format, + providing efficient memory usage and columnar data operations while supporting + source information tracking and content hashing. + + + Initialize ArrowPacket with Arrow table and configuration. + + Args: + table: Single-row Arrow table representing the packet + source_info: Optional source information mapping + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + finger_print: Optional fingerprint for tracking + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + skip_source_info_extraction: Whether to skip source info processing + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + data: pa.Table, + source_info: dict[str, str | None] | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + if len(data) != 1: + raise ValueError( + "ArrowPacket should only contain a single row, " + "as it represents a single packet." + ) + if source_info is None: + source_info = {} + + # normalize the table to ensure it has the expected source_info columns + data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( + data, + {constants.SOURCE_PREFIX: source_info}, + exclude_columns=[constants.CONTEXT_KEY], + ) + self._source_info_table = prefixed_tables[constants.SOURCE_INFO_PREFIX] + + super().__init__( + data_table, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._cached_source_info: dict[str, str | None] | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + table = super().as_table(include_data_context=include_data_context) + if include_source: + # add source_info only for existing data columns + table = arrow_utils.hstack_tables( + table, + self._source_info_table.select( + [ + f"{constants.SOURCE_INFO_PREFIX}{c}" + for c in table.column_names + if c in self.keys() + ] + ), + ) + return table + + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types(include_data_context=include_data_context) + if include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_INFO_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema(include_data_context=include_data_context) + if include_source: + return arrow_utils.join_arrow_schemas( + schema, self._source_info_table.schema + ) + return schema + + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: + """ + Convert to dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + return_dict = super().as_dict(include_data_context=include_data_context) + if include_source: + return_dict.update( + { + f"{constants.SOURCE_INFO_PREFIX}{k}": v + for k, v in self.source_info().items() + } + ) + return return_dict + + def as_datagram(self, include_source: bool = False) -> ArrowDatagram: + table = self.as_table(include_source=include_source) + return ArrowDatagram( + table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Copy of the dictionary mapping field names to their source info + """ + if self._cached_source_info is None: + self._cached_source_info = { + k.removeprefix(constants.SOURCE_INFO_PREFIX): v + for k, v in self._source_info_table.to_pylist()[0].items() + } + return self._cached_source_info.copy() + + def copy(self) -> Self: + # TODO: restructure copy to allow for better inheritance and expansion + new_packet = self.__class__( + self.as_table(), + self.source_info(), + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + new_packet._cached_source_info = self._cached_source_info + new_packet._cached_python_dict = self._cached_python_dict + new_packet._cached_python_schema = self._cached_python_schema + new_packet._cached_content_hash = self._cached_content_hash + + return new_packet + + +# a batch is a tuple of a tag and a list of packets +Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] +"""Type alias for a batch: a tuple containing a tag and collection of packets.""" diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index cd06f34..f22b9fe 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -20,6 +20,8 @@ from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter from orcapod.types import typespec_utils as tsutils +from orcapod.utils import arrow_utils +from orcapod.data.system_constants import orcapod_constants as constants import pyarrow as pa logger = logging.getLogger(__name__) @@ -255,7 +257,10 @@ def __init__( @property def kernel_id(self) -> tuple[str, ...]: - return (self.function_name,) + return ( + self.function_name, + self.data_context.object_hasher.hash_to_hex(self), + ) def input_packet_types(self) -> PythonSchema: """ @@ -284,6 +289,8 @@ def __str__(self) -> str: return f"FunctionPod:{func_sig}" def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | None]: + v: dp.Packet = DictPacket({}) + print(v) if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -311,9 +318,12 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" ) - # TODO: add source info based on this function call + output_data = {k: v for k, v in zip(self.output_keys, output_values)} + source_info = {k: ":".join(self.kernel_id + (k,)) for k in output_data} + output_packet = DictPacket( {k: v for k, v in zip(self.output_keys, output_values)}, + source_info=source_info, typespec=self.output_packet_types(), semantic_converter=self._output_semantic_converter, data_context=self._data_context, @@ -365,9 +375,17 @@ def __init__( pod: dp.Pod, 
fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, + data_context: str | DataContext | None = None, **kwargs, ) -> None: - super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) + if data_context is None: + data_context = pod.data_context_key + super().__init__( + fixed_input_streams=fixed_input_streams, + label=label, + data_context=data_context, + **kwargs, + ) self.pod = pod @property @@ -414,32 +432,27 @@ class CachedPod(WrappedPod): This is useful for pods that are expensive to compute and can benefit from caching. """ + # name of the column in the tag store that contains the packet hash + PACKET_HASH_COLUMN = f"{constants.META_PREFIX}packet_hash" + def __init__( self, pod: dp.Pod, result_store: ArrowDataStore, - lineage_store: ArrowDataStore | None, + pipeline_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] = (), **kwargs, ): super().__init__(pod, **kwargs) self.record_path_prefix = record_path_prefix self.result_store = result_store - self.lineage_store = lineage_store + self.pipeline_store = pipeline_store # unset data_context native to the object self.pod_hash = self.data_context.object_hasher.hash_to_hex( self.pod, prefix_hasher_id=True ) - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Return the pod ID, which is the function name of the wrapped pod. - This is used to identify the pod in the system. - """ - return self.pod.kernel_id + (self.pod_hash,) - @property def record_path(self) -> tuple[str, ...]: """ @@ -448,14 +461,65 @@ def record_path(self) -> tuple[str, ...]: """ return self.record_path_prefix + self.kernel_id - def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: + def call( + self, + tag: dp.Tag, + packet: dp.Packet, + skip_recording: bool = False, + ) -> tuple[dp.Tag, dp.Packet | None]: output_packet = self.get_recorded_output_packet(packet) + if output_packet is None: + tag, output_packet = self.pod.call(tag, packet) + if output_packet is not None and not skip_recording: + self.record_packet(packet, output_packet) + if output_packet is not None: - return tag, output_packet - output_tag, output_packet = self.pod.call(tag, packet) - if output_packet is not None: - self.record_packet(packet, output_packet) - return output_tag, output_packet + self.add_pipeline_record(tag, input_packet=packet) + return tag, output_packet + + def add_pipeline_record(self, tag: dp.Tag, input_packet: dp.Packet) -> None: + if self.pipeline_store is None: + # no pipeline store configured, skip recording + return + # combine dp.Tag with packet content hash to compute entry hash + tag_with_hash = tag.as_table().append_column( + self.PACKET_HASH_COLUMN, + pa.array([input_packet.content_hash()], type=pa.large_string()), + ) + entry_id = self.data_context.arrow_hasher.hash_table( + tag_with_hash, prefix_hasher_id=True + ) + + existing_record = self.pipeline_store.get_record_by_id( + self.record_path, + entry_id, + ) + + if existing_record is not None: + # if the record already exists, return it + return + + # no record matching, so construct the full record + + input_packet_info = ( + input_packet.as_table( + include_source=True, + ) + .append_column( + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + .drop(input_packet.keys()) + ) + + combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) + + self.pipeline_store.add_record( + self.record_path, + entry_id, + 
combined_record, + ignore_duplicates=False, + ) def record_packet( self, @@ -466,16 +530,9 @@ def record_packet( """ Record the output packet against the input packet in the result store. """ - data_table = output_packet.as_table( - include_data_context=True, include_source=True - ) + data_table = output_packet.as_table(include_context=True, include_source=True) - data_table = data_table.append_column( - f"_input_packet{DataContext.get_data_context_column()}", - pa.array([input_packet.data_context_key], type=pa.large_string()), - ) - - result_flag = self.result_store.record_data( + result_flag = self.result_store.add_record( self.record_path, input_packet.content_hash(), data_table, @@ -494,14 +551,47 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non Retrieve the output packet from the result store based on the input packet. If the output packet is not found, return None. """ - result_table = self.result_store.get_recorded_data( + result_table = self.result_store.get_record_by_id( self.record_path, input_packet.content_hash() ) if result_table is None: return None - return ArrowPacket( - result_table.drop( - [f"_input_packet{DataContext.get_data_context_column()}"] - ), + return ArrowPacket(result_table) + + def _get_all_records(self) -> "pa.Table | None": + results = self.result_store.get_all_records( + self.record_path, record_id_column=self.PACKET_HASH_COLUMN + ) + + if self.pipeline_store is None: + raise ValueError( + "Pipeline store is not configured, cannot retrieve tag info" + ) + taginfo = self.pipeline_store.get_all_records( + self.record_path, + ) + + if results is None or taginfo is None: + return None + + tag_columns = [ + c + for c in taginfo.column_names + if not c.startswith(constants.META_PREFIX) + and not c.startswith(constants.SOURCE_PREFIX) + ] + + packet_columns = [ + c for c in results.column_names if c != self.PACKET_HASH_COLUMN + ] + + # TODO: do not hardcode the join keys + joined_info = taginfo.join( + results, + self.PACKET_HASH_COLUMN, + join_type="inner", ) + + joined_info = joined_info.select([*tag_columns, *packet_columns]) + return joined_info diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index a5c2434..b8ce85d 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,23 +1,25 @@ -from orcapod.protocols import data_protocols as dp +import logging +import warnings +from abc import ABC, abstractmethod +from collections.abc import Collection, Iterator +from datetime import datetime, timezone +from itertools import repeat +from typing import Any, Literal + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.context import DataContext from orcapod.data.datagrams import ( ArrowPacket, ArrowTag, DictTag, - SemanticConverter, - SOURCE_INFO_PREFIX, ) -from orcapod.utils import arrow_utils -from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec, schemas -import pyarrow as pa -from collections.abc import Iterator, Collection -from abc import ABC, abstractmethod -from datetime import timezone, datetime -from typing import Any, Literal -import logging -import warnings -from itertools import repeat +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils # TODO: consider using this instead of making copy of dicts # from types 
import MappingProxyType @@ -327,18 +329,14 @@ def __init__( super().__init__(source=source, upstreams=upstreams, **kwargs) table, data_context_table = arrow_utils.split_by_column_groups( - table, [DataContext.get_data_context_column()] + table, [constants.CONTEXT_KEY] ) if data_context_table is None: data_context_table = pa.table( - { - DataContext.get_data_context_column(): pa.nulls( - len(table), pa.large_string() - ) - } + {constants.CONTEXT_KEY: pa.nulls(len(table), pa.large_string())} ) - prefix_info = {SOURCE_INFO_PREFIX: source_info} + prefix_info = {constants.SOURCE_PREFIX: source_info} # determine tag columns first and then exclude any source info self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) @@ -350,7 +348,7 @@ def __init__( c for c in table.column_names if c not in tag_columns ) self._table = table - self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] + self._source_info_table = prefix_tables[constants.SOURCE_PREFIX] self._data_context_table = data_context_table if len(self._packet_columns) == 0: @@ -575,12 +573,12 @@ def as_table( tag_schema = tag.arrow_schema() if packet_schema is None: packet_schema = packet.arrow_schema( - include_data_context=True, + include_context=True, include_source=True, ) all_tags.append(tag.as_dict()) all_packets.append( - packet.as_dict(include_data_context=True, include_source=True) + packet.as_dict(include_context=True, include_source=True) ) all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema) @@ -595,9 +593,9 @@ def as_table( drop_columns = [] if not include_source: - drop_columns.extend(f"{SOURCE_INFO_PREFIX}{c}" for c in self.keys()[1]) + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) if not include_data_context: - drop_columns.append(DataContext.get_data_context_column()) + drop_columns.append(constants.CONTEXT_KEY) output_table = self._cached_output_table.drop(drop_columns) diff --git a/src/orcapod/data/system_constants.py b/src/orcapod/data/system_constants.py new file mode 100644 index 0000000..de1bebc --- /dev/null +++ b/src/orcapod/data/system_constants.py @@ -0,0 +1,25 @@ +# Constants used for source info keys +SYSTEM_COLUMN_PREFIX = "__" +SOURCE_INFO_PREFIX = "_source_" + +DATA_CONTEXT_KEY = "_context_key" + + +class SystemConstant: + def __init__(self, global_prefix: str = ""): + self._global_prefix = global_prefix + + @property + def META_PREFIX(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}" + + @property + def SOURCE_PREFIX(self) -> str: + return f"{self._global_prefix}{SOURCE_INFO_PREFIX}" + + @property + def CONTEXT_KEY(self) -> str: + return f"{self._global_prefix}{DATA_CONTEXT_KEY}" + + +orcapod_constants = SystemConstant() diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index a89ab4e..695ffe8 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -1,7 +1,6 @@ import hashlib from typing import Any import pyarrow as pa -import polars as pl import json from orcapod.protocols.hashing_protocols import SemanticTypeHasher, StringCacher from orcapod.hashing import arrow_serialization @@ -214,6 +213,8 @@ def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: # normalize all string to large strings by passing through polars # TODO: consider cleaner approach in the future + import polars as pl + sorted_table = pl.DataFrame(sorted_table).to_arrow() # Step 3: Serialize using Arrow IPC format diff --git 
a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 97568f5..2a92f69 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -1,4 +1,4 @@ -from orcapod.protocols.hashing_protocols import FunctionInfoExtractor, ObjectHasher +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor from orcapod.hashing import legacy_core from orcapod.hashing import hash_utils from typing import Any diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 012edaa..968d70e 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1,23 +1,43 @@ -from typing import Protocol, Any, ContextManager -from orcapod.types import DataValue, TypeSpec -from orcapod.protocols.hashing_protocols import ContentIdentifiable -from collections.abc import Iterator, Collection -import pyarrow as pa +from collections.abc import Collection, Iterator, Mapping from datetime import datetime +from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.types import DataValue, TypeSpec + +if TYPE_CHECKING: + import pyarrow as pa class Datagram(Protocol): """ - Base protocol for all data containers in Orcapod. + Protocol for immutable datagram containers in Orcapod. Datagrams are the fundamental units of data that flow through the system. - They provide a unified interface for data access and conversion, ensuring - consistent behavior across different data types and sources. + They provide a unified interface for data access, conversion, and manipulation, + ensuring consistent behavior across different storage backends (dict, Arrow table, etc.). + + Each datagram contains: + - **Data columns**: The primary business data (user_id, name, etc.) + - **Meta columns**: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes (__processed_at, etc.) + - **Context column**: Data context information ({orcapod.CONTEXT_KEY}) + + Future Packet subclass will also include: + - **Source info columns**: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) + + When exposing to external tools without field metadata support, semantic types + are encoded as `_{semantic_type}_` prefixes (_path_config_file, _id_user_name, etc.). + + All operations are immutable - methods return new datagram instances rather than + modifying existing ones. - TypeSpec is a dict[str, type] mapping field names to their Python types, - enabling type checking and validation throughout the computational graph. + Example: + >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"}) + >>> updated = datagram.update(name="Alice Smith") + >>> filtered = datagram.select("user_id", "name") + >>> table = datagram.as_table() """ + # 1. Core Properties (Identity & Structure) @property def data_context_key(self) -> str: """ @@ -27,91 +47,527 @@ def data_context_key(self) -> str: contextual information needed to properly interpret and work with this datagram across various operations (storage, visualization, processing, etc.). - Context key formats: - - Standard contexts: "std:v1.2.3:fingerprint" - - Custom contexts: "custom:user_provided_id" + Returns: + str: Context key for proper datagram interpretation + """ + ... + + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names (with {orcapod.META_PREFIX} ('__') prefix).""" + ... + + # 2. 
Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """ + Get data column value by key. + + Provides dict-like access to data columns only. Meta columns + are not accessible through this method (use `get_meta_value()` instead). - Concrete implementation can make use of this context key to ensure necessary background - informaton / object is available for correct processing of the datagram. + Args: + key: Data column name. Returns: - str: Context key for proper datagram interpretation + The value stored in the specified data column. + + Raises: + KeyError: If the column doesn't exist in data columns. + + Example: + >>> datagram["user_id"] + 123 + >>> datagram["name"] + 'Alice' """ ... - def types(self, include_data_context: bool = False) -> TypeSpec: + def __contains__(self, key: str) -> bool: """ - Return the type specification for this datagram. + Check if data column exists. - The TypeSpec maps field names to their Python types, enabling - type checking and validation throughout the system. + Args: + key: Column name to check. Returns: - TypeSpec: Dictionary mapping field names to Python types + True if column exists in data columns, False otherwise. + + Example: + >>> "user_id" in datagram + True + >>> "nonexistent" in datagram + False + """ + ... + + def __iter__(self) -> Iterator[str]: + """ + Iterate over data column names. + + Provides for-loop support over column names, enabling natural iteration + patterns without requiring conversion to dict. + + Yields: + Data column names in no particular order. + + Example: + >>> for column in datagram: + ... value = datagram[column] + ... print(f"{column}: {value}") """ ... - def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + def get(self, key: str, default: DataValue = None) -> DataValue: """ - Return the PyArrow schema for this datagram. + Get data column value with default fallback. - The schema provides a structured representation of the datagram's - fields and their types, enabling efficient serialization and - deserialization with PyArrow. + Args: + key: Data column name. + default: Value to return if column doesn't exist. Returns: - pa.Schema: PyArrow schema representation of the datagram + Column value if exists, otherwise the default value. + + Example: + >>> datagram.get("user_id") + 123 + >>> datagram.get("missing", "default") + 'default' """ ... - def keys(self) -> Collection[str]: + # 3. Structural Information + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: """ - Return the available keys/fields in this datagram. + Return tuple of column names. - This provides a way to inspect the structure of the datagram - without accessing the actual data values. + Provides access to column names with filtering options for different + column types. Default returns only data column names. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. Returns: - Collection[str]: Available field names + Tuple of column names based on inclusion criteria. 
+ + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') """ ... - def as_table(self, include_data_context: bool = False) -> pa.Table: + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> TypeSpec: """ - Convert to PyArrow Table format. + Return type specification mapping field names to Python types. - Provides a standardized way to convert datagram content to - a columnar format suitable for analysis and processing. + The TypeSpec enables type checking and validation throughout the system. + + Args: + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. Returns: - pa.Table: PyArrow table representation + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } """ ... - def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> "pa.Schema": """ - Convert to dictionary format. + Return PyArrow schema representation. - Provides a simple key-value representation of the datagram - content, useful for debugging and simple data access. + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. + + Args: + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. Returns: - dict[str, DataValue]: Dictionary representation + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] """ ... def content_hash(self) -> str: """ - Return a hash of the packet content for caching/comparison. + Return deterministic hash of datagram content. + + The hash should reflect the data content, typically excluding meta columns + and context. Used for caching, comparison, and deduplication. For exact details of + hash computation, refer to the implementation in the specific datagram class/subclass. + + Returns: + Deterministic content hash string. + + Note: + Two datagrams with identical data columns will have the same hash, + even if they differ in meta columns or context. + + Example: + >>> datagram.content_hash() + 'sha256:abc123def456...' + """ + ... + + # 4. 
Format Conversions (Export) + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Convert datagram to dictionary format. + + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context key. + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + + + Returns: + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) + """ + ... + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> "pa.Table": + """ + Convert datagram to PyArrow Table format. + + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + + Returns: + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" + """ + ... + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get meta column value with optional default. + + Meta columns store operational metadata and use {orcapod.META_PREFIX} ('__') prefixes. + This method handles both prefixed and unprefixed key formats. + + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). + default: Value to return if meta column doesn't exist. + + Returns: + Meta column value if exists, otherwise the default value. + + Example: + >>> datagram.get_meta_value("pipeline_version") # Auto-prefixed + 'v2.1.0' + >>> datagram.get_meta_value("__pipeline_version") # Already prefixed + 'v2.1.0' + >>> datagram.get_meta_value("missing", "default") + 'default' + """ + ... + + def with_meta_columns(self, **updates: DataValue) -> Self: + """ + Create new datagram with updated meta columns. + + Adds or updates operational metadata while preserving all data columns. + Keys are automatically prefixed with {orcapod.META_PREFIX} ('__') if needed. + + Args: + **updates: Meta column updates as keyword arguments. + + Returns: + New datagram instance with updated meta columns. + + Example: + >>> tracked = datagram.with_meta_columns( + ... processed_by="pipeline_v2", + ... 
timestamp="2024-01-15T10:30:00Z" + ... ) + """ + ... + + def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: + """ + Create new datagram with specified meta columns removed. + + Args: + *keys: Meta column keys to remove (prefixes optional). + ignore_missing: If True, ignore missing columns without raising an error. + + + Returns: + New datagram instance without specified meta columns. + + Raises: + KeryError: If any specified meta column to drop doesn't exist and ignore_missing=False. + + Example: + >>> cleaned = datagram.drop_meta_columns("old_source", "temp_debug") + """ + ... + + # 6. Data Column Operations + def select(self, *column_names: str) -> Self: + """ + Create new datagram with only specified data columns. + + Args: + *column_names: Data column names to keep. + + + Returns: + New datagram instance with only specified data columns. All other columns including + meta columns and context are preserved. + + Raises: + KeyError: If any specified column doesn't exist. + + Example: + >>> subset = datagram.select("user_id", "name", "email") + """ + ... + + def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: + """ + Create new datagram with specified data columns removed. Note that this does not + remove meta columns or context column. Refer to `drop_meta_columns()` for dropping + specific meta columns. Context key column can never be dropped but a modified copy + can be created with a different context key using `with_data_context()`. + + Args: + *column_names: Data column names to remove. + ignore_missing: If True, ignore missing columns without raising an error. + + Returns: + New datagram instance without specified data columns. + + Raises: + KeryError: If any specified column to drop doesn't exist and ignore_missing=False. + + Example: + >>> filtered = datagram.drop("temp_field", "debug_info") + """ + ... + + def rename( + self, + column_mapping: Mapping[str, str], + ) -> Self: + """ + Create new datagram with data columns renamed. + + Args: + column_mapping: Mapping from old names to new names. + + Returns: + New datagram instance with renamed data columns. + + Example: + >>> renamed = datagram.rename( + ... {"old_id": "user_id", "old_name": "full_name"}, + ... column_types={"user_id": int} + ... ) + """ + ... + + def update(self, **updates: DataValue) -> Self: + """ + Create new datagram with existing column values updated. + + Updates values in existing data columns. Will error if any specified + column doesn't exist - use with_columns() to add new columns. + + Args: + **updates: Column names and their new values. + + Returns: + New datagram instance with updated values. + + Raises: + KeyError: If any specified column doesn't exist. + + Example: + >>> updated = datagram.update( + ... file_path="/new/absolute/path.txt", + ... status="processed" + ... ) + """ + ... + + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """ + Create new datagram with additional data columns. + + Adds new data columns to the datagram. Will error if any specified + column already exists - use update() to modify existing columns. + + Args: + column_types: Optional type specifications for new columns. If not provided, the column type is + inferred from the provided values. If value is None, the column type defaults to `str`. + **kwargs: New columns as keyword arguments. + + Returns: + New datagram instance with additional data columns. 
+ + Raises: + ValueError: If any specified column already exists. + + Example: + >>> expanded = datagram.with_columns( + ... status="active", + ... score=95.5, + ... column_types={"score": float} + ... ) + """ + ... + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> Self: + """ + Create new datagram with different context key. + + Changes the semantic interpretation context while preserving all data. + The context key affects how columns are processed and converted. + + Args: + new_context_key: New context key string. + + Returns: + New datagram instance with updated context key. + + Note: + How the context is interpreted depends on the datagram implementation. + Semantic processing may be rebuilt for the new context. + + Example: + >>> financial_datagram = datagram.with_context_key("financial_v1") + """ + ... + + # 8. Utility Operations + def copy(self) -> Self: + """ + Create a shallow copy of the datagram. + + Returns a new datagram instance with the same data and cached values. + This is more efficient than reconstructing from scratch when you need + an identical datagram instance. + + Returns: + New datagram instance with copied data and caches. + + Example: + >>> copied = datagram.copy() + >>> copied is datagram # False - different instance + False + """ + ... - This hash should be deterministic and based only on the packet content, - not on source information or metadata. Used for: - - Caching computation results - - Detecting data changes - - Deduplication operations + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. Returns: - str: Deterministic hash of packet content + Dictionary-style string representation of data columns only. + """ + ... + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information for debugging. + + Returns: + Detailed representation with type and metadata information. """ ... @@ -157,98 +613,218 @@ class Packet(Datagram, Protocol): data flow: Tags provide context, Packets provide content. """ - def as_table( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Table: + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> tuple[str, ...]: """ - Convert the packet to a PyArrow Table. + Return tuple of column names. + + Provides access to column names with filtering options for different + column types. Default returns only data column names. Args: - include_source: If True, source information is included in the table - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + include_source: Whether to include source info fields. + Returns: - pa.Table: PyArrow table representation of packet data + Tuple of column names based on inclusion criteria. 
+ + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') """ ... - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> TypeSpec: """ - Convert the packet to a dictionary. + Return type specification mapping field names to Python types. + + The TypeSpec enables type checking and validation throughout the system. Args: - include_source: If True, source information is included in the dictionary - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. + include_source: Whether to include source info fields. Returns: - dict[str, DataValue]: Dictionary representation of packet data + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } """ ... - def as_datagram(self, include_source: bool = False) -> Datagram: + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> "pa.Schema": """ - Convert the packet to a Datagram. + Return PyArrow schema representation. + + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. Args: - include_source: If True, source information is included in the datagram - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + include_source: Whether to include source info fields. + Returns: - Datagram: Datagram representation of packet data + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] """ ... - def source_info(self) -> dict[str, str | None]: + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: """ - Return metadata about the packet's source/origin. + Convert datagram to dictionary format. 
+ + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context key. + include_source: Whether to include source info fields. - Provides debugging and lineage information about where the packet - originated. May include information like: - - File paths for file-based sources - - Database connection strings - - API endpoints - - Processing pipeline information Returns: - dict[str, str | None]: Source metadata for debugging/lineage + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) """ ... - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> TypeSpec: + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> "pa.Table": """ - Return the type specification for this packet. + Convert datagram to PyArrow Table format. + + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. Args: - include_source: If True, source information is included in the typespec - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_source: Whether to include source info columns in the schema. Returns: - TypeSpec: Dictionary mapping field names to Python types + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" """ ... - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> Datagram: """ - Return the PyArrow schema for this packet. + Convert the packet to a Datagram. Args: - include_source: If True, source information is included in the schema - for debugging and lineage tracking + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
Returns: - pa.Schema: PyArrow schema representation of packet data + Datagram: Datagram representation of packet data """ ... - # def join(self, other: "Packet") -> "Packet": ... + def source_info(self) -> dict[str, str | None]: + """ + Return metadata about the packet's source/origin. + + Provides debugging and lineage information about where the packet + originated. May include information like: + - File paths for file-based sources + - Database connection strings + - API endpoints + - Processing pipeline information - # def get_as(self, packet_type: PacketType) -> PacketType: ... + Returns: + dict[str, str | None]: Source information for each data column as key-value pairs. + """ + ... class PodFunction(Protocol): @@ -468,7 +1044,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: + ) -> "pa.Table": """ Convert the entire stream to a PyArrow Table. @@ -596,6 +1172,20 @@ def kernel_id(self) -> tuple[str, ...]: """ ... + @property + def data_context_key(self) -> str: + """ + Return the context key for this kernel's data processing. + + The context key is used to interpret how data columns should be + processed and converted. It provides semantic meaning to the data + being processed by this kernel. + + Returns: + str: Context key for this kernel's data processing + """ + ... + def __call__( self, *streams: Stream, label: str | None = None, **kwargs ) -> LiveStream: diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 1767509..16c96cd 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -1,11 +1,13 @@ """Hash strategy protocols for dependency injection.""" from collections.abc import Callable -from typing import Any, Protocol, runtime_checkable +from typing import Any, Protocol, runtime_checkable, TYPE_CHECKING import uuid from orcapod.types import TypeSpec, PathLike -import pyarrow as pa + +if TYPE_CHECKING: + import pyarrow as pa @runtime_checkable @@ -101,7 +103,7 @@ class ArrowHasher(Protocol): def get_hasher_id(self) -> str: ... - def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... + def hash_table(self, table: "pa.Table", prefix_hasher_id: bool = True) -> str: ... class StringCacher(Protocol): @@ -134,8 +136,8 @@ def hasher_id(self) -> str: def hash_column( self, - column: pa.Array, - ) -> pa.Array: + column: "pa.Array", + ) -> "pa.Array": """Hash a column with this semantic type and return the hash bytes.""" ... diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index 618d7a4..d51ead8 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -1,13 +1,10 @@ -from typing import Collection, Protocol, TYPE_CHECKING -from orcapod.protocols import data_protocols as dp +from typing import Protocol +from collections.abc import Collection import pyarrow as pa -if TYPE_CHECKING: - import polars as pl - class ArrowDataStore(Protocol): - def record_data( + def add_record( self, record_path: tuple[str, ...], record_id: str, @@ -15,13 +12,26 @@ def record_data( ignore_duplicates: bool | None = None, ) -> str | None: ... - def get_recorded_data( + def add_records( + self, + record_path: tuple[str, ...], + records: pa.Table, + record_id_column: str | None = None, + ignore_duplicates: bool | None = None, + ) -> list[str]: ... 
+ + def get_record_by_id( self, record_path: tuple[str, ...], record_id: str, + record_id_column: str | None = None, ) -> pa.Table | None: ... - def get_all_records(self, record_path: tuple[str, ...]) -> pa.Table | None: + def get_all_records( + self, + record_path: tuple[str, ...], + record_id_column: str | None = None, + ) -> pa.Table | None: """Retrieve all records for a given path as a stream.""" ... @@ -29,6 +39,5 @@ def get_records_by_ids( self, record_path: tuple[str, ...], record_ids: Collection[str], - add_entry_id_column: bool | str = False, - preseve_input_order: bool = False, + record_id_column: str | None = None, ) -> pa.Table: ... diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index f04a7b7..218c0e0 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -1,5 +1,4 @@ import pyarrow as pa -import pyarrow.dataset as ds import polars as pl from pathlib import Path from typing import Any @@ -7,6 +6,7 @@ from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError from collections import defaultdict +from orcapod.data import constants # Module-level logger @@ -28,6 +28,8 @@ class BasicDeltaTableArrowStore: - ("year", "month", "day", "experiment") -> year/month/day/experiment/ """ + RECORD_ID_COLUMN = f"{constants.META_PREFIX}record_id" + def __init__( self, base_path: str | Path, @@ -41,8 +43,8 @@ def __init__( Args: base_path: Base directory path where Delta tables will be stored - duplicate_entry_behavior: How to handle duplicate entry_ids: - - 'error': Raise ValueError when entry_id already exists + duplicate_entry_behavior: How to handle duplicate record_ids: + - 'error': Raise ValueError when record_id already exists - 'overwrite': Replace existing entry with new data create_base_path: Whether to create the base path if it doesn't exist max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) @@ -87,15 +89,15 @@ def flush(self) -> None: except Exception as e: logger.error(f"Error during flush: {e}") - def flush_batch(self, source_path: tuple[str, ...]) -> None: + def flush_batch(self, record_path: tuple[str, ...]) -> None: """ Flush pending batch for a specific source path. 
Args: - source_path: Tuple of path components + record_path: Tuple of path components """ logger.debug("Flushing triggered!!") - source_key = self._get_source_key(source_path) + source_key = self._get_source_key(record_path) if ( source_key not in self._pending_batches @@ -111,11 +113,11 @@ def flush_batch(self, source_path: tuple[str, ...]) -> None: # Combine all tables in the batch combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() - table_path = self._get_table_path(source_path) + table_path = self._get_table_path(record_path) table_path.mkdir(parents=True, exist_ok=True) # Check if table exists - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: # TODO: reconsider mode="overwrite" here @@ -130,27 +132,31 @@ def flush_batch(self, source_path: tuple[str, ...]) -> None: else: if self.duplicate_entry_behavior == "overwrite": # Get entry IDs from the batch - entry_ids = combined_table.column("__entry_id").to_pylist() - unique_entry_ids = list(set(entry_ids)) + record_ids = combined_table.column( + self.RECORD_ID_COLUMN + ).to_pylist() + unique_record_ids = list(set(record_ids)) # Delete existing records with these IDs - if unique_entry_ids: - entry_ids_str = "', '".join(unique_entry_ids) - delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + if unique_record_ids: + record_ids_str = "', '".join(unique_record_ids) + delete_predicate = ( + f"{self.RECORD_ID_COLUMN} IN ('{record_ids_str}')" + ) try: delta_table.delete(delete_predicate) logger.debug( - f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + f"Deleted {len(unique_record_ids)} existing records from {source_key}" ) except Exception as e: logger.debug( f"No existing records to delete from {source_key}: {e}" ) - # otherwise, only insert if same entry_id does not exist yet + # otherwise, only insert if same record_id does not exist yet delta_table.merge( source=combined_table, - predicate="target.__entry_id = source.__entry_id", + predicate=f"target.{self.RECORD_ID_COLUMN} = source.{self.RECORD_ID_COLUMN}", source_alias="source", target_alias="target", ).when_not_matched_insert_all().execute() @@ -174,9 +180,9 @@ def flush_all_batches(self) -> None: # TODO: capture and re-raise exceptions at the end for source_key in source_keys: - source_path = tuple(source_key.split("/")) + record_path = tuple(source_key.split("/")) try: - self.flush_batch(source_path) + self.flush_batch(record_path) except Exception as e: logger.error(f"Error flushing batch for {source_key}: {e}") @@ -184,27 +190,27 @@ def __del__(self): """Cleanup when object is destroyed.""" self.flush() - def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + def _validate_record_path(self, record_path: tuple[str, ...]) -> None: # TODO: consider removing this as path creation can be tried directly """ Validate source path components. 
Args: - source_path: Tuple of path components + record_path: Tuple of path components Raises: ValueError: If path is invalid """ - if not source_path: + if not record_path: raise ValueError("Source path cannot be empty") - if len(source_path) > self.max_hierarchy_depth: + if len(record_path) > self.max_hierarchy_depth: raise ValueError( - f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" + f"Source path depth {len(record_path)} exceeds maximum {self.max_hierarchy_depth}" ) # Validate path components - for i, component in enumerate(source_path): + for i, component in enumerate(record_path): if not component or not isinstance(component, str): raise ValueError( f"Source path component {i} is invalid: {repr(component)}" @@ -217,31 +223,31 @@ def _validate_source_path(self, source_path: tuple[str, ...]) -> None: f"Source path component contains invalid characters: {repr(component)}" ) - def _get_source_key(self, source_path: tuple[str, ...]) -> str: + def _get_source_key(self, record_path: tuple[str, ...]) -> str: """Generate cache key for source storage.""" - return "/".join(source_path) + return "/".join(record_path) - def _get_table_path(self, source_path: tuple[str, ...]) -> Path: + def _get_table_path(self, record_path: tuple[str, ...]) -> Path: """Get the filesystem path for a given source path.""" path = self.base_path - for subpath in source_path: + for subpath in record_path: path = path / subpath return path def _get_existing_delta_table( - self, source_path: tuple[str, ...] + self, record_path: tuple[str, ...] ) -> DeltaTable | None: """ Get or create a Delta table, handling schema initialization properly. Args: - source_path: Tuple of path components + record_path: Tuple of path components Returns: DeltaTable instance or None if table doesn't exist """ - source_key = self._get_source_key(source_path) - table_path = self._get_table_path(source_path) + source_key = self._get_source_key(record_path) + table_path = self._get_table_path(record_path) # Check cache first if dt := self._delta_table_cache.get(source_key): @@ -263,75 +269,79 @@ def _get_existing_delta_table( del self._delta_table_cache[source_key] return None - def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: - """Ensure the table has an __entry_id column.""" - if "__entry_id" not in arrow_data.column_names: - # Add entry_id column at the beginning - key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) - arrow_data = arrow_data.add_column(0, "__entry_id", key_array) + def _ensure_record_id_column( + self, arrow_data: pa.Table, record_id: str + ) -> pa.Table: + """Ensure the table has an record id column.""" + if self.RECORD_ID_COLUMN not in arrow_data.column_names: + # Add record_id column at the beginning + key_array = pa.array([record_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) return arrow_data - def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: - """Remove the __entry_id column if it exists.""" - if "__entry_id" in arrow_data.column_names: + def _remove_record_id_column(self, arrow_data: pa.Table) -> pa.Table: + """Remove the record id column if it exists.""" + if self.RECORD_ID_COLUMN in arrow_data.column_names: column_names = arrow_data.column_names indices_to_keep = [ - i for i, name in enumerate(column_names) if name != "__entry_id" + i + for i, name in enumerate(column_names) + if name != self.RECORD_ID_COLUMN ] arrow_data = 
arrow_data.select(indices_to_keep) return arrow_data - def _handle_entry_id_column( - self, arrow_data: pa.Table, add_entry_id_column: bool | str = False + def _handle_record_id_column( + self, arrow_data: pa.Table, record_id_column: str | None = None ) -> pa.Table: """ - Handle entry_id column based on add_entry_id_column parameter. + Handle record_id column based on add_record_id_column parameter. Args: - arrow_data: Arrow table with __entry_id column - add_entry_id_column: Control entry ID column inclusion: - - False: Remove __entry_id column - - True: Keep __entry_id column as is - - str: Rename __entry_id column to custom name - """ - if add_entry_id_column is False: - # Remove the __entry_id column - return self._remove_entry_id_column(arrow_data) - elif isinstance(add_entry_id_column, str): - # Rename __entry_id to custom name - if "__entry_id" in arrow_data.column_names: - schema = arrow_data.schema - new_names = [ - add_entry_id_column if name == "__entry_id" else name - for name in schema.names - ] - return arrow_data.rename_columns(new_names) - # If add_entry_id_column is True, keep __entry_id as is - return arrow_data + arrow_data: Arrow table with record id column + record_id_column: Control entry ID column inclusion: - def _create_entry_id_filter(self, entry_id: str) -> list: + """ + if not record_id_column: + # Remove the record id column + return self._remove_record_id_column(arrow_data) + + # Rename record id column + if self.RECORD_ID_COLUMN in arrow_data.column_names: + schema = arrow_data.schema + new_names = [ + record_id_column if name == self.RECORD_ID_COLUMN else name + for name in schema.names + ] + return arrow_data.rename_columns(new_names) + else: + raise ValueError( + f"Record ID column '{self.RECORD_ID_COLUMN}' not found in the table and cannot be renamed." + ) + + def _create_record_id_filter(self, record_id: str) -> list: """ Create a proper filter expression for Delta Lake. Args: - entry_id: The entry ID to filter by + record_id: The entry ID to filter by Returns: List containing the filter expression for Delta Lake """ - return [("__entry_id", "=", entry_id)] + return [(self.RECORD_ID_COLUMN, "=", record_id)] - def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: + def _create_record_ids_filter(self, record_ids: list[str]) -> list: """ Create a proper filter expression for multiple entry IDs. 
Args: - entry_ids: List of entry IDs to filter by + record_ids: List of entry IDs to filter by Returns: List containing the filter expression for Delta Lake """ - return [("__entry_id", "in", entry_ids)] + return [(self.RECORD_ID_COLUMN, "in", record_ids)] def _read_table_with_filter( self, @@ -349,7 +359,7 @@ def _read_table_with_filter( Arrow table with preserved schema """ # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading - dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + dataset = delta_table.to_pyarrow_dataset(as_large_types=True) if filters: # Apply filters at dataset level for better performance import pyarrow.compute as pc @@ -379,36 +389,36 @@ def _read_table_with_filter( return dataset.to_table() - def record_data( + def add_record( self, record_path: tuple[str, ...], - entry_id: str, + record_id: str, data: pa.Table, - force_flush: bool = False, ignore_duplicates: bool | None = None, + force_flush: bool = False, ) -> pa.Table: - self._validate_source_path(record_path) + self._validate_record_path(record_path) source_key = self._get_source_key(record_path) # Check for existing entry if ignore_duplicates is None: ignore_duplicates = self.duplicate_entry_behavior != "error" if not ignore_duplicates: - pending_table = self._pending_batches[source_key].get(entry_id, None) + pending_table = self._pending_batches[source_key].get(record_id, None) if pending_table is not None: raise ValueError( - f"Entry '{entry_id}' already exists in pending batch for {source_key}. " + f"Entry '{record_id}' already exists in pending batch for {source_key}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." ) - existing_record = self.get_recorded_data(record_path, entry_id, flush=False) + existing_record = self.get_record_by_id(record_path, record_id, flush=False) if existing_record is not None: raise ValueError( - f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. " + f"Entry '{record_id}' already exists in {'/'.join(record_path)}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." 
) - # Add entry_id column to the data - data_with_entry_id = self._ensure_entry_id_column(data, entry_id) + # Add record_id column to the data + data_with_record_id = self._ensure_record_id_column(data, record_id) if force_flush: # Write immediately @@ -419,25 +429,25 @@ def record_data( if delta_table is None: # Create new table - save original schema first - write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") + write_deltalake(str(table_path), data_with_record_id, mode="overwrite") logger.debug(f"Created new Delta table for {source_key}") else: if self.duplicate_entry_behavior == "overwrite": try: delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" ) logger.debug( - f"Deleted existing record {entry_id} from {source_key}" + f"Deleted existing record {record_id} from {source_key}" ) except Exception as e: logger.debug( - f"No existing record to delete for {entry_id}: {e}" + f"No existing record to delete for {record_id}: {e}" ) write_deltalake( table_path, - data_with_entry_id, + data_with_record_id, mode="append", schema_mode="merge", ) @@ -446,28 +456,41 @@ def record_data( self._delta_table_cache[source_key] = DeltaTable(str(table_path)) else: # Add to the batch for later flushing - self._pending_batches[source_key][entry_id] = data_with_entry_id + self._pending_batches[source_key][record_id] = data_with_record_id batch_size = len(self._pending_batches[source_key]) # Check if we need to flush if batch_size >= self.batch_size: self.flush_batch(record_path) - logger.debug(f"Added record {entry_id} to {source_key}") + logger.debug(f"Added record {record_id} to {source_key}") return data - def get_recorded_data( + def add_records( self, record_path: tuple[str, ...], - entry_id: str, + records: pa.Table, + record_id_column: str | None = None, + ignore_duplicates: bool | None = None, + ) -> list[str]: + raise NotImplementedError( + "add_records is not implemented in BasicDeltaTableArrowStore yet. " + "Use add_record for single record insertion." + ) + + def get_record_by_id( + self, + record_path: tuple[str, ...], + record_id: str, + record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: """ - Get a specific record by entry_id with schema preservation. + Get a specific record by record_id with schema preservation. 
Args: - source_path: Tuple of path components - entry_id: Unique identifier for the record + record_path: Tuple of path components + record_id: Unique identifier for the record Returns: Arrow table for the record or None if not found @@ -475,14 +498,14 @@ def get_recorded_data( if flush: self.flush_batch(record_path) - self._validate_source_path(record_path) + self._validate_record_path(record_path) - # check if entry_id is found in pending batches + # check if record_id is found in pending batches source_key = self._get_source_key(record_path) - if entry_id in self._pending_batches[source_key]: + if record_id in self._pending_batches[source_key]: # Return the pending record after removing the entry id column - return self._remove_entry_id_column( - self._pending_batches[source_key][entry_id] + return self._remove_record_id_column( + self._pending_batches[source_key][record_id] ) delta_table = self._get_existing_delta_table(record_path) @@ -491,25 +514,25 @@ def get_recorded_data( try: # Use schema-preserving read - filter_expr = self._create_entry_id_filter(entry_id) + filter_expr = self._create_record_id_filter(record_id) result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None - # Remove the __entry_id column before returning - return self._remove_entry_id_column(result) + # Handle (remove/rename) the record id column before returning + return self._handle_record_id_column(result, record_id_column) except Exception as e: logger.error( - f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" + f"Error getting record {record_id} from {'/'.join(record_path)}: {e}" ) raise e def get_all_records( self, record_path: tuple[str, ...], - add_entry_id_column: bool | str = False, + record_id_column: str | None = None, retrieve_pending: bool = True, flush: bool = False, ) -> pa.Table | None: @@ -517,11 +540,8 @@ def get_all_records( Retrieve all records for a given source path as a single table with schema preservation. 
Args: - source_path: Tuple of path components - add_entry_id_column: Control entry ID column inclusion: - - False: Don't include entry ID column (default) - - True: Include entry ID column as "__entry_id" - - str: Include entry ID column with custom name + record_path: Tuple of path components + record_id_column: If not None or empty, record id is returned in the result with the specified column name Returns: Arrow table containing all records with original schema, or None if no records found @@ -530,16 +550,16 @@ def get_all_records( if flush: self.flush_batch(record_path) - self._validate_source_path(record_path) + self._validate_record_path(record_path) collected_tables = [] if retrieve_pending: # Check if there are pending records in the batch - for entry_id, arrow_table in self._pending_batches[ + for record_id, arrow_table in self._pending_batches[ self._get_source_key(record_path) ].items(): collected_tables.append( - self._ensure_entry_id_column(arrow_table, entry_id) + self._ensure_record_id_column(arrow_table, record_id) ) delta_table = self._get_existing_delta_table(record_path) @@ -558,44 +578,25 @@ def get_all_records( if collected_tables: total_table = pa.concat_tables(collected_tables) - # Handle entry_id column based on parameter - return self._handle_entry_id_column(total_table, add_entry_id_column) + # Handle record_id column based on parameter + return self._handle_record_id_column(total_table, record_id_column) return None - # def get_all_records_as_polars( - # self, source_path: tuple[str, ...], flush: bool = True - # ) -> pl.LazyFrame | None: - # """ - # Retrieve all records for a given source path as a single Polars LazyFrame. - - # Args: - # source_path: Tuple of path components - - # Returns: - # Polars LazyFrame containing all records, or None if no records found - # """ - # all_records = self.get_all_records(source_path, flush=flush) - # if all_records is None: - # return None - # # TODO: take care of converting semantics to Python objects - # return pl.LazyFrame(all_records.as_table()) - def get_records_by_ids( self, - source_path: tuple[str, ...], - entry_ids: list[str] | pl.Series | pa.Array, - add_entry_id_column: bool | str = False, - preserve_input_order: bool = False, + record_path: tuple[str, ...], + record_ids: list[str] | pl.Series | pa.Array, + record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: """ Retrieve records by entry IDs as a single table with schema preservation. 
Args: - source_path: Tuple of path components - entry_ids: Entry IDs to retrieve - add_entry_id_column: Control entry ID column inclusion + record_path: Tuple of path components + record_ids: Entry IDs to retrieve + add_record_id_column: Control entry ID column inclusion preserve_input_order: If True, return results in input order with nulls for missing Returns: @@ -603,99 +604,49 @@ def get_records_by_ids( """ if flush: - self.flush_batch(source_path) + self.flush_batch(record_path) - self._validate_source_path(source_path) + self._validate_record_path(record_path) # Convert input to list of strings for consistency - if isinstance(entry_ids, list): - if not entry_ids: + if isinstance(record_ids, list): + if not record_ids: return None - entry_ids_list = entry_ids - elif isinstance(entry_ids, pl.Series): - if len(entry_ids) == 0: + record_ids_list = record_ids + elif isinstance(record_ids, pl.Series): + if len(record_ids) == 0: return None - entry_ids_list = entry_ids.to_list() - elif isinstance(entry_ids, pa.Array): - if len(entry_ids) == 0: + record_ids_list = record_ids.to_list() + elif isinstance(record_ids, (pa.Array, pa.ChunkedArray)): + if len(record_ids) == 0: return None - entry_ids_list = entry_ids.to_pylist() + record_ids_list = record_ids.to_pylist() else: raise TypeError( - f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + f"record_ids must be list[str], pl.Series, or pa.Array, got {type(record_ids)}" ) - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: return None try: # Use schema-preserving read with filters - filter_expr = self._create_entry_ids_filter(entry_ids_list) + filter_expr = self._create_record_ids_filter(record_ids_list) result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None - if preserve_input_order: - raise NotImplementedError("Preserve input order is not yet implemented") - # Need to reorder results and add nulls for missing entries - import pandas as pd - - df = result.to_pandas() - df = df.set_index("__entry_id") - - # Create a DataFrame with the desired order, filling missing with NaN - ordered_df = df.reindex(entry_ids_list) - - # Convert back to Arrow - result = pa.Table.from_pandas(ordered_df.reset_index()) - - # Handle entry_id column based on parameter - return self._handle_entry_id_column(result, add_entry_id_column) + # Handle record_id column based on parameter + return self._handle_record_id_column(result, record_id_column) except Exception as e: logger.error( - f"Error getting records by IDs from {'/'.join(source_path)}: {e}" + f"Error getting records by IDs from {'/'.join(record_path)}: {e}" ) return None - # def get_records_by_ids_as_polars( - # self, - # source_path: tuple[str, ...], - # entry_ids: list[str] | pl.Series | pa.Array, - # add_entry_id_column: bool | str = False, - # preserve_input_order: bool = False, - # flush: bool = False, - # ) -> pl.LazyFrame | None: - # """ - # Retrieve records by entry IDs as a single Polars LazyFrame. 
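
Taken together, the renamed retrieval methods read roughly as follows. This is an illustrative sketch only: the store class name and constructor arguments are assumed here (they are not part of these hunks), while the three method calls match the signatures in the diff above.

    # assumed construction of the Delta-table-backed store this module defines
    store = DeltaTableArrowStore(base_path="/tmp/orcapod_records")
    path = ("experiments", "run_01")

    # one record by id; the internal record-id column is stripped before returning
    record = store.get_recorded_data(path, "abc123")

    # every record under the path, keeping ids under a caller-chosen column name
    all_records = store.get_all_records(path, record_id_column="record_id")

    # a subset by id; list[str], pl.Series, and pa.Array/ChunkedArray are accepted
    subset = store.get_records_by_ids(path, ["abc123", "def456"], record_id_column="record_id")
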
- - # Args: - # source_path: Tuple of path components - # entry_ids: Entry IDs to retrieve - # add_entry_id_column: Control entry ID column inclusion - # preserve_input_order: If True, return results in input order with nulls for missing - - # Returns: - # Polars LazyFrame containing all found records, or None if no records found - # """ - # arrow_result = self.get_records_by_ids( - # source_path, - # entry_ids, - # add_entry_id_column, - # preserve_input_order, - # flush=flush, - # ) - - # if arrow_result is None: - # return None - - # # Convert to Polars LazyFrame - # return pl.LazyFrame(arrow_result) - - # Additional utility methods - def get_pending_batch_info(self) -> dict[str, int]: """ Get information about pending batches. @@ -738,23 +689,23 @@ def _scan_directory(current_path: Path, path_components: tuple[str, ...]): _scan_directory(self.base_path, ()) return sources - def delete_source(self, source_path: tuple[str, ...]) -> bool: + def delete_source(self, record_path: tuple[str, ...]) -> bool: """ Delete an entire source (all records for a source path). Args: - source_path: Tuple of path components + record_path: Tuple of path components Returns: True if source was deleted, False if it didn't exist """ - self._validate_source_path(source_path) + self._validate_record_path(record_path) # Flush any pending batches first - self.flush_batch(source_path) + self.flush_batch(record_path) - table_path = self._get_table_path(source_path) - source_key = self._get_source_key(source_path) + table_path = self._get_table_path(record_path) + source_key = self._get_source_key(record_path) if not table_path.exists(): return False @@ -776,64 +727,64 @@ def delete_source(self, source_path: tuple[str, ...]) -> bool: logger.error(f"Error deleting source {source_key}: {e}") return False - def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: + def delete_record(self, record_path: tuple[str, ...], record_id: str) -> bool: """ Delete a specific record. 
Args: - source_path: Tuple of path components - entry_id: ID of the record to delete + record_path: Tuple of path components + record_id: ID of the record to delete Returns: True if record was deleted, False if it didn't exist """ - self._validate_source_path(source_path) + self._validate_record_path(record_path) # Flush any pending batches first - self.flush_batch(source_path) + self.flush_batch(record_path) - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: return False try: # Check if record exists using proper filter - filter_expr = self._create_entry_id_filter(entry_id) + filter_expr = self._create_record_id_filter(record_id) existing = self._read_table_with_filter(delta_table, filters=filter_expr) if len(existing) == 0: return False # Delete the record using SQL-style predicate (this is correct for delete operations) delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" ) # Update cache - source_key = self._get_source_key(source_path) + source_key = self._get_source_key(record_path) self._delta_table_cache[source_key] = delta_table - logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") + logger.debug(f"Deleted record {record_id} from {'/'.join(record_path)}") return True except Exception as e: logger.error( - f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" + f"Error deleting record {record_id} from {'/'.join(record_path)}: {e}" ) return False - def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: + def get_table_info(self, record_path: tuple[str, ...]) -> dict[str, Any] | None: """ Get metadata information about a Delta table. 
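
The delete predicate in delete_record above is passed to Delta Lake as a plain SQL string, so any single quote inside the record id has to be doubled; chr(39) is simply the quote character written so it can sit inside the f-string. A standalone check of that escaping, using a placeholder column name since the actual value of RECORD_ID_COLUMN is not shown in this hunk:

    record_id = "run's-42"
    escaped = record_id.replace(chr(39), chr(39) + chr(39))
    predicate = f"record_id = '{escaped}'"
    assert predicate == "record_id = 'run''s-42'"
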
Args: - source_path: Tuple of path components + record_path: Tuple of path components Returns: Dictionary with table metadata, or None if table doesn't exist """ - self._validate_source_path(source_path) + self._validate_record_path(record_path) - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: return None @@ -841,15 +792,15 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: # Get basic info schema = delta_table.schema() history = delta_table.history() - source_key = self._get_source_key(source_path) + source_key = self._get_source_key(record_path) # Add pending batch info pending_info = self.get_pending_batch_info() pending_count = pending_info.get(source_key, 0) return { - "path": str(self._get_table_path(source_path)), - "source_path": source_path, + "path": str(self._get_table_path(record_path)), + "record_path": record_path, "schema": schema, "version": delta_table.version(), "num_files": len(delta_table.files()), @@ -859,5 +810,5 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: } except Exception as e: - logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") + logger.error(f"Error getting table info for {'/'.join(record_path)}: {e}") return None diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py index 118b110..817c249 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -1,6 +1,6 @@ from orcapod.types.semantic_types import PythonArrowConverter from orcapod.types.schemas import PythonSchema, SemanticSchema -from orcapod.types import typespec_utils as tsutils +from orcapod.types import TypeSpec, typespec_utils as tsutils from typing import Any, Self from collections.abc import Mapping @@ -29,9 +29,11 @@ def __init__( ): self._converter_lut = converter_lut - def from_python_to_arrow_schema(self, python_schema: PythonSchema) -> pa.Schema: + def from_python_to_arrow_schema(self, python_schema: TypeSpec) -> pa.Schema: """Convert a Python schema to an Arrow schema""" - return python_schema.to_arrow_schema(converters=self._converter_lut) + return PythonSchema(python_schema).to_arrow_schema( + converters=self._converter_lut + ) def from_arrow_to_python_schema(self, arrow_schema: pa.Schema) -> PythonSchema: """Convert an Arrow schema to a Python schema""" @@ -40,7 +42,7 @@ def from_arrow_to_python_schema(self, arrow_schema: pa.Schema) -> PythonSchema: ) def from_python_to_arrow( - self, python_data: Mapping[str, Any], python_schema: PythonSchema | None = None + self, python_data: Mapping[str, Any], python_schema: TypeSpec | None = None ) -> pa.Table: """Convert a dictionary of Python values to Arrow arrays""" if python_schema is None: @@ -85,3 +87,22 @@ def from_arrow_to_python(self, arrow_data: pa.Table) -> list[dict[str, Any]]: def as_dict(self) -> dict[str, PythonArrowConverter]: """Return the converter lookup table as a dictionary.""" return self._converter_lut.copy() + + def join(self, other: Self, strict: bool = False) -> Self: + """Join two SemanticConverters by merging their converter lookup tables.""" + if not isinstance(other, SemanticConverter): + raise TypeError("Can only join with another SemanticConverter.") + + new_converter_lut = self._converter_lut.copy() + for key, converter in other._converter_lut.items(): + if key in new_converter_lut: + if strict: + raise ValueError( + f"Key '{key}' already exists in 
the converter lookup table. Cannot overwrite in strict mode." + ) + logger.warning( + f"Key '{key}' already exists in the converter lookup table. Overwriting with new converter." + ) + new_converter_lut[key] = converter + + return self.__class__(new_converter_lut) diff --git a/src/orcapod/types/semantic_types.py b/src/orcapod/types/semantic_types.py index 169da69..258617a 100644 --- a/src/orcapod/types/semantic_types.py +++ b/src/orcapod/types/semantic_types.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from pathlib import Path import pyarrow as pa + from collections.abc import Collection @@ -344,7 +345,7 @@ def to_canonical_from_arrow(self, value: pa.Array) -> list[T]: def from_canonical_to_arrow( self, value: T, target_type: pa.DataType | None = None - ) -> Any: + ) -> pa.Array: """Convert from canonical to Arrow representation using explicit Arrow DataType""" if target_type is None: @@ -438,7 +439,45 @@ def get_semantic_type_for_python_type( self, python_type: type ) -> SemanticType | None: """Get a semantic type by Python type""" - return self._python_to_semantic_lut.get(python_type) + + # check if it's directly registered + semantic_type = self._python_to_semantic_lut.get(python_type) + if semantic_type is None: + # check if it's a subclass + for ( + registered_type, + registered_semantic_type, + ) in self._python_to_semantic_lut.items(): + if issubclass(python_type, registered_type): + return registered_semantic_type + return semantic_type + + def get_arrow_type_for_semantic_type( + self, semantic_type_name: str + ) -> pa.DataType | None: + """Get the default Arrow DataType for a semantic type by name""" + semantic_type = self._semantic_type_lut.get(semantic_type_name) + if semantic_type: + return semantic_type.get_default_arrow_type() + return None + + def get_arrow_type_for_python_type( + self, python_type: type + ) -> tuple[str | None, pa.DataType] | None: + """Get the default Arrow DataType for a Python type""" + semantic_type = self.get_semantic_type_for_python_type(python_type) + if semantic_type: + return semantic_type.name, semantic_type.get_default_arrow_type() + return None + + def from_python_to_arrow(self, python_value: Any) -> tuple[str | None, Any]: + """Convert a Python value to Arrow-targetting representation using the semantic type registry""" + semantic_type = self.get_semantic_type_for_python_type(type(python_value)) + if semantic_type: + return semantic_type.name, semantic_type.convert_python_to_arrow( + python_value + ) + return None, python_value def get_semantic_type(self, name: str) -> SemanticType | None: """Get a semantic type by name""" @@ -448,11 +487,10 @@ def list_semantic_types(self) -> list[SemanticType]: """Get all registered semantic types""" return list(self._semantic_type_lut.values()) - def supports_python_type(self, python_type: type) -> bool: - """Check if registry supports the given Python type""" + def registered_with_semantic_type(self, python_type: type) -> bool: + """Check if registry has the Python type registered with a semantic type""" return python_type in self._python_to_semantic_lut - # Python-specific registry methods def supports_semantic_and_arrow_type( self, semantic_type_name: str, arrow_type: pa.DataType ) -> bool: diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index 9f66654..609a6a0 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -214,7 +214,9 @@ def extract_function_typespecs( return param_info, inferred_output_types -def 
get_typespec_from_dict(data: Mapping, typespec: TypeSpec | None = None) -> TypeSpec: +def get_typespec_from_dict( + data: Mapping, typespec: TypeSpec | None = None, default=str +) -> TypeSpec: """ Returns a TypeSpec for the given dictionary. The TypeSpec is a mapping from field name to Python type. If typespec is provided, then @@ -222,7 +224,10 @@ def get_typespec_from_dict(data: Mapping, typespec: TypeSpec | None = None) -> T """ if typespec is None: typespec = {} - return {key: typespec.get(key, type(value)) for key, value in data.items()} + return { + key: typespec.get(key, type(value) if value is not None else default) + for key, value in data.items() + } def get_compatible_type(type1: Any, type2: Any) -> Any: diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 5237eb3..0947499 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -1,7 +1,6 @@ # TODO: move this to a separate module from collections import defaultdict -from matplotlib.pylab import f import pyarrow as pa from collections.abc import Mapping, Collection from typing import Any @@ -168,6 +167,7 @@ def prepare_prefixed_columns( | Mapping[str, Any | None] | Mapping[str, Mapping[str, Any | None]], exclude_columns: Collection[str] = (), + exclude_prefixes: Collection[str] = (), ) -> tuple[pa.Table, dict[str, pa.Table]]: """ """ all_prefix_info = {} @@ -209,7 +209,12 @@ def prepare_prefixed_columns( prefixed_column_names = defaultdict(list) prefixed_columns = defaultdict(list) - target_column_names = [c for c in data_column_names if c not in exclude_columns] + target_column_names = [ + c + for c in data_column_names + if not any(c.startswith(prefix) for prefix in exclude_prefixes) + and c not in exclude_columns + ] for prefix, value_lut in all_prefix_info.items(): target_prefixed_column_names = prefixed_column_names[prefix] From 8429611924167df4bc4d928da1592b38118af417 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 22 Jul 2025 07:37:45 +0000 Subject: [PATCH 079/224] fix: handling of schema when merging tables --- src/orcapod/data/datagrams/arrow_datagram.py | 216 ++++++------------ src/orcapod/data/datagrams/base.py | 29 +-- src/orcapod/data/pods.py | 2 +- src/orcapod/data/trackers.py | 3 +- .../pipeline/{nodes.py => legacy_nodes.py} | 0 .../{pipeline.py => legacy_pipeline.py} | 0 src/orcapod/types/semantic_converter.py | 10 + src/orcapod/utils/arrow_utils.py | 27 ++- 8 files changed, 98 insertions(+), 189 deletions(-) rename src/orcapod/pipeline/{nodes.py => legacy_nodes.py} (100%) rename src/orcapod/pipeline/{pipeline.py => legacy_pipeline.py} (100%) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 5ed5307..5eb158c 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -1,14 +1,14 @@ import logging from collections.abc import Collection, Iterator, Mapping -from typing import Any, Self +from typing import Self import pyarrow as pa -from orcapod.data.system_constants import orcapod_constants as constants from orcapod.data.context import ( DataContext, ) from orcapod.data.datagrams.base import BaseDatagram +from orcapod.data.system_constants import orcapod_constants as constants from orcapod.types import schemas, typespec_utils from orcapod.types.core import DataValue from orcapod.types.semantic_converter import SemanticConverter @@ -125,31 +125,6 @@ def __init__( self._cached_meta_python_schema: schemas.PythonSchema | None = None self._cached_content_hash: str | None = None - def _core_info(self) -> dict[str, Any]: - core_info = { - "data_table": self._data_table, - "meta_table": self._meta_table, - "data_context_table": self._data_context_table, - "semantic_converter": self._semantic_converter, - "cached_python_schema": self._cached_python_schema, - "cached_python_dict": self._cached_python_dict, - "cached_meta_python_schema": self._cached_meta_python_schema, - "cached_content_hash": self._cached_content_hash, - } - return core_info - - def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: - new_copy = object.__new__(self.__class__) - new_copy._data_table = core_info["data_table"] - new_copy._meta_table = core_info["meta_table"] - new_copy._data_context_table = core_info["data_context_table"] - new_copy._semantic_converter = core_info["semantic_converter"] - new_copy._cached_python_schema = core_info["cached_python_schema"] - new_copy._cached_python_dict = core_info["cached_python_dict"] - new_copy._cached_meta_python_schema = core_info["cached_meta_python_schema"] - new_copy._cached_content_hash = core_info["cached_content_hash"] - return new_copy - # 1. 
Core Properties (Identity & Structure) @property def meta_columns(self) -> tuple[str, ...]: @@ -492,6 +467,8 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: k = constants.META_PREFIX + k prefixed_updates[k] = v + new_datagram = self.copy(include_cache=False) + # Start with existing meta data meta_dict = {} if self._meta_table is not None: @@ -501,18 +478,10 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: meta_dict.update(prefixed_updates) # Create new meta table - new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None - - # Combine all tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, + new_datagram._meta_table = ( + pa.Table.from_pylist([meta_dict]) if meta_dict else None ) + return new_datagram def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: """ @@ -541,26 +510,10 @@ def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" ) - # Filter meta columns - remaining_cols = [ - col for col in self._meta_table.column_names if col not in prefixed_keys - ] + new_datagram = self.copy(include_cache=False) + new_datagram._meta_table = self._meta_table.drop_columns(prefixed_keys) - # Create new meta table - new_meta_table = ( - self._meta_table.select(remaining_cols) if remaining_cols else None - ) - - # Combine tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + return new_datagram # 6. 
Data Column Operations def select(self, *column_names: str) -> Self: @@ -579,18 +532,10 @@ def select(self, *column_names: str) -> Self: if missing_cols: raise ValueError(f"Columns not found: {missing_cols}") - new_data_table = self._data_table.select(list(column_names)) + new_datagram = self.copy(include_cache=False) + new_datagram._data_table = new_datagram._data_table.select(column_names) - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + return new_datagram def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: """ @@ -610,27 +555,12 @@ def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: raise KeyError( f"Following columns do not exist and cannot be dropped: {sorted(missing)}" ) + column_names = tuple(c for c in column_names if self._data_table.columns) - # Filter data columns - remaining_cols = [ - col for col in self._data_table.column_names if col not in column_names - ] - - if not remaining_cols: - raise ValueError("Cannot drop all data columns") - - new_data_table = self._data_table.select(remaining_cols) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + new_datagram = self.copy(include_cache=False) + new_datagram._data_table = self._data_table.drop_columns(column_names) + # TODO: consider dropping extra semantic columns if they are no longer needed + return new_datagram def rename(self, column_mapping: Mapping[str, str]) -> Self: """ @@ -644,30 +574,22 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: New ArrowDatagram instance with renamed data columns """ # Create new schema with renamed fields, preserving original types - new_fields = [] - for field in self._data_table.schema: - old_name = field.name - new_name = column_mapping.get(old_name, old_name) - new_field = pa.field(new_name, field.type) - new_fields.append(new_field) - - # Create new data table with renamed columns - new_schema = pa.schema(new_fields) - new_data_table = self._data_table.rename_columns( - [column_mapping.get(name, name) for name in self._data_table.column_names] - ).cast(new_schema) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, + if not column_mapping: + return self + + new_names = [column_mapping.get(k, k) for k in self._data_table.column_names] + + new_datagram = self.copy(include_cache=False) + new_datagram._data_table = new_datagram._data_table.rename_columns(new_names) + + # apply the same rename to the converters + new_datagram._semantic_converter = self._semantic_converter.rename( + column_mapping ) + return new_datagram + def update(self, **updates: DataValue) -> Self: """ Create a new ArrowDatagram with specific column values updated. 
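
Because rename() now renames the converter keys alongside the data columns, the converter lookup table keeps tracking the columns it describes. A dict-level sketch of the rename/join bookkeeping, with converter objects replaced by plain strings and purely illustrative key names:

    lut = {"image_path": "path<->large_string", "label": "str<->string"}

    # rename(): keys follow the column mapping so later lookups still resolve
    mapping = {"image_path": "img"}
    renamed = {mapping.get(k, k): v for k, v in lut.items()}
    assert renamed == {"img": "path<->large_string", "label": "str<->string"}

    # join(strict=False): overlapping keys are overwritten by the joined-in converter
    extra = {"label": "int<->int64", "mask_path": "path<->large_string"}
    joined = {**lut, **extra}
    assert joined["label"] == "int<->int64"
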
@@ -699,23 +621,19 @@ def update(self, **updates: DataValue) -> Self: f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}" ) + new_datagram = self.copy(include_cache=False) + updates_typespec = schemas.PythonSchema( {k: v for k, v in self.types().items() if k in updates} ) - update_table = self._semantic_converter.from_python_to_arrow( updates, updates_typespec ) - all_tables = [self._data_table.drop_columns(list(updates.keys())), update_table] - - if self._meta_table is not None: - all_tables.append(self._meta_table) + new_datagram._data_table = arrow_utils.hstack_tables( + self._data_table.drop_columns(list(updates.keys())), update_table + ).select(self._data_table.column_names) # adjsut the order to match original - return self.__class__( - table=arrow_utils.hstack_tables(*all_tables), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + return new_datagram def with_columns( self, @@ -742,7 +660,7 @@ def with_columns( if not updates: return self - # Error if any column already exists + # Error if any of the columns already exists existing_overlaps = set(updates.keys()) & set(self._data_table.column_names) if existing_overlaps: raise ValueError( @@ -750,7 +668,11 @@ def with_columns( f"Use update() to modify existing columns." ) + # create a copy and perform in-place updates + new_datagram = self.copy() + # TODO: consider simplifying this conversion logic + # prepare update's table typespec = typespec_utils.get_typespec_from_dict(updates, column_types) updates_converter = SemanticConverter.from_semantic_schema( @@ -761,21 +683,16 @@ def with_columns( # TODO: cleanup the handling of typespec python schema and various conversion points new_data_table = updates_converter.from_python_to_arrow(updates, typespec) - # Combine with meta table for reconstruction - all_tables = [self._data_table, new_data_table] - if self._meta_table is not None: - all_tables.append(self._meta_table) - - combined_table = arrow_utils.hstack_tables(*all_tables) + # perform in-place update + new_datagram._data_table = arrow_utils.hstack_tables( + new_datagram._data_table, new_data_table + ) # prepare the joined converter - total_converter = self._semantic_converter.join(updates_converter) - - return self.__class__( - table=combined_table, - semantic_converter=total_converter, - data_context=self._data_context, + new_datagram._semantic_converter = self._semantic_converter.join( + updates_converter ) + return new_datagram # 7. Context Operations def with_context_key(self, new_context_key: str) -> Self: @@ -789,6 +706,7 @@ def with_context_key(self, new_context_key: str) -> Self: Returns: New ArrowDatagram instance with new context """ + # TODO: consider if there is a more efficient way to handle context # Combine all tables for reconstruction combined_table = self._data_table if self._meta_table is not None: @@ -801,23 +719,25 @@ def with_context_key(self, new_context_key: str) -> Self: ) # 8. 
Utility Operations - def copy(self) -> Self: + def copy(self, include_cache: bool = True) -> Self: """Return a copy of the datagram.""" - # Combine all tables for reconstruction - combined_table = self._data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - new_datagram = self.__class__( - combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_python_schema = self._cached_python_schema - new_datagram._cached_python_dict = self._cached_python_dict - new_datagram._cached_content_hash = self._cached_content_hash + new_datagram = super().copy() + + new_datagram._data_table = self._data_table + new_datagram._meta_table = self._meta_table + new_datagram._data_context = self._data_context + new_datagram._semantic_converter = self._semantic_converter + + if include_cache: + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_meta_python_schema = self._cached_meta_python_schema + else: + new_datagram._cached_python_schema = None + new_datagram._cached_python_dict = None + new_datagram._cached_content_hash = None + new_datagram._cached_meta_python_schema = None return new_datagram diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index 0ec1501..f253995 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -269,33 +269,6 @@ def with_context_key(self, new_context_key: str) -> Self: ... # 8. Utility Operations - @abstractmethod def copy(self) -> Self: """Create a shallow copy of the datagram.""" - ... - - @abstractmethod - def _core_info(self) -> dict[str, Any]: - """ - Return core information about the datagram. - This is meant to be used for internal purposes only and is not part of the public API. - It provides necessary information to create an efficient copy of the datagram - and in a manner that works across inheritance hierarchies. - - Returns: - Dictionary with all information necessary to recreate the datagram in a copy. - """ - ... - - @abstractmethod - def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: - """ - Create a new datagram instance from core information. - - Args: - core_info: Dictionary with core information about the datagram - - Returns: - New datagram instance - """ - ... + return object.__new__(self.__class__) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index f22b9fe..8662903 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -366,7 +366,7 @@ def identity_structure(self, *streams: dp.Stream) -> Any: class WrappedPod(ActivatablePodBase): """ - A wrapper for a pod that allows it to be used as a kernel. + A wrapper for an existing pod, allowing for additional functionality or modifications without changing the original pod. This class is meant to serve as a base class for other pods that need to wrap existing pods. 
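
The copy-based refactor above boils down to one idiom: take a shallow copy with the caches cleared, swap the copy's private tables, and return it, so the underlying Arrow buffers are shared while stale cached schemas and hashes get recomputed lazily. A self-contained miniature of that idiom (MiniDatagram is a toy stand-in, not the real ArrowDatagram):

    import pyarrow as pa

    class MiniDatagram:
        def __init__(self, table: pa.Table):
            self._data_table = table
            self._cached_hash = None  # expensive; filled in lazily elsewhere

        def copy(self, include_cache: bool = True) -> "MiniDatagram":
            new = object.__new__(MiniDatagram)  # skip __init__, reuse buffers
            new._data_table = self._data_table
            new._cached_hash = self._cached_hash if include_cache else None
            return new

        def rename(self, mapping: dict[str, str]) -> "MiniDatagram":
            new = self.copy(include_cache=False)  # cached hash would be stale
            new._data_table = self._data_table.rename_columns(
                [mapping.get(c, c) for c in self._data_table.column_names]
            )
            return new

    d = MiniDatagram(pa.table({"a": [1, 2]}))
    assert d.rename({"a": "b"})._data_table.column_names == ["b"]
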
""" diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 5ad2a55..0f6ef94 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,8 +1,9 @@ from orcapod.protocols import data_protocols as dp, hashing_protocols as hp from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict +from collections.abc import Generator from abc import ABC, abstractmethod -from typing import Any, ContextManager, Generator +from typing import Any from contextlib import contextmanager diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/legacy_nodes.py similarity index 100% rename from src/orcapod/pipeline/nodes.py rename to src/orcapod/pipeline/legacy_nodes.py diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/legacy_pipeline.py similarity index 100% rename from src/orcapod/pipeline/pipeline.py rename to src/orcapod/pipeline/legacy_pipeline.py diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py index 817c249..889d8a2 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -106,3 +106,13 @@ def join(self, other: Self, strict: bool = False) -> Self: new_converter_lut[key] = converter return self.__class__(new_converter_lut) + + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """Rename columns in the converter lookup table.""" + new_converter_lut = {} + new_converter_lut = { + column_mapping.get(key, key): converter + for key, converter in self._converter_lut.items() + } + + return self.__class__(new_converter_lut) diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 0947499..700fa3e 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -43,20 +43,25 @@ def hstack_tables(*tables: pa.Table) -> pa.Table: "All tables must have the same number of rows for horizontal stacking." ) - # create combined column names - all_column_names = [] - all_columns = [] + # create combined schema + all_fields = [] all_names = set() - for i, table in enumerate(tables): - if overlap := set(table.column_names).intersection(all_names): - raise ValueError( - f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" - ) - all_names.update(table.column_names) - all_column_names += table.column_names + for table in tables: + for field in table.schema: + if field.name in all_names: + raise ValueError( + f"Duplicate column name '{field.name}' found in input tables." + ) + all_fields.append(field) + all_names.add(field.name) + combined_schmea = pa.schema(all_fields) + + # create combined columns + all_columns = [] + for table in tables: all_columns += table.columns - return pa.Table.from_arrays(all_columns, names=all_column_names) + return pa.Table.from_arrays(all_columns, schema=combined_schmea) def check_arrow_schema_compatibility( From 228f4697d502113cd0b622ecfb706f30d9471da4 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 22 Jul 2025 21:59:58 +0000 Subject: [PATCH 080/224] refactor: clean up unused imports and move old code into renamed module --- src/orcapod/data/base.py | 3 - src/orcapod/data/context.py | 28 +- src/orcapod/data/datagram_store.py | 890 --------- src/orcapod/data/old_datagrams.py | 2281 ---------------------- src/orcapod/data/operators.py | 9 +- src/orcapod/errors.py | 5 + src/orcapod/hashing/versioned_hashers.py | 1 - src/orcapod/pipeline/__init__.py | 8 +- src/orcapod/pipeline/legacy_pipeline.py | 2 +- src/orcapod/protocols/store_protocols.py | 4 + src/orcapod/protocols/types.py | 51 - 11 files changed, 30 insertions(+), 3252 deletions(-) delete mode 100644 src/orcapod/data/datagram_store.py delete mode 100644 src/orcapod/data/old_datagrams.py create mode 100644 src/orcapod/errors.py delete mode 100644 src/orcapod/protocols/types.py diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index f8788e1..dec4f06 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -1,9 +1,6 @@ -from abc import ABC, abstractmethod from typing import Any from orcapod.protocols import hashing_protocols as hp -from orcapod.types import TypeSpec from orcapod.hashing.defaults import get_default_object_hasher -import pyarrow as pa import logging diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py index 85261d2..20bc43a 100644 --- a/src/orcapod/data/context.py +++ b/src/orcapod/data/context.py @@ -1,4 +1,3 @@ -from typing import Self from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import default_registry from orcapod.protocols import hashing_protocols as hp @@ -22,27 +21,19 @@ def resolve_data_context(data_context: "str | DataContext | None") -> "DataConte return orcapod_system_data_context_manager.resolve_context(data_context) -default_data_context = DataContext( - "std:v0.1.0:default", - default_registry, - get_default_arrow_hasher(), - get_default_object_hasher(), -) - - class DataContextManager(dict[str, DataContext]): - def register_context(self, DataContext): + def register_context(self, data_context: DataContext): """ Register a new DataContext instance. Args: - DataContext: The DataContext instance to register. + data_context: The DataContext instance to register. """ - if DataContext.context_key in self: + if data_context.context_key in self: raise ValueError( - f"DataContext with key {DataContext.context_key} already exists." + f"DataContext with key {data_context.context_key} already exists." 
) - self[DataContext.context_key] = DataContext + self[data_context.context_key] = data_context def resolve_context(self, context_info: str | DataContext | None) -> DataContext: if isinstance(context_info, DataContext): @@ -56,5 +47,14 @@ def resolve_context(self, context_info: str | DataContext | None) -> DataContext raise ValueError(f"DataContext with key {context_info} not found.") + +default_data_context = DataContext( + "std:v0.1.0:default", + default_registry, + get_default_arrow_hasher(), + get_default_object_hasher(), +) + + orcapod_system_data_context_manager = DataContextManager() orcapod_system_data_context_manager.register_context(default_data_context) diff --git a/src/orcapod/data/datagram_store.py b/src/orcapod/data/datagram_store.py deleted file mode 100644 index 72d082c..0000000 --- a/src/orcapod/data/datagram_store.py +++ /dev/null @@ -1,890 +0,0 @@ -# class DatagramStore(Protocol): -# def record_datagram( -# self, -# record_path: tuple[str, ...], -# datagram: dp.Datagram, -# ignore_duplicates: bool = False, -# ) -> str | None: ... - -# def record_stream( -# self, -# record_path: tuple[str, ...], -# stream: dp.Stream, -# ignore_duplicates: bool = False, -# ) -> None: ... - -# def get_recorded_datagram( -# self, -# record_path: tuple[str, ...], -# record_id: str, -# ) -> dp.Datagram | None: ... - -# def get_all_records(self, record_path: tuple[str, ...]) -> dp.Stream | None: -# """Retrieve all records for a given path as a stream.""" -# ... - -# def get_all_records_as_polars( -# self, record_path: tuple[str, ...] -# ) -> pl.DataFrame | None: -# """Retrieve all records for a given path as a Polars stream.""" -# ... - -# def get_records_by_ids( -# self, -# record_path: tuple[str, ...], -# entry_ids: Collection[str], -# add_entry_id_column: bool | str = False, -# preseve_input_order: bool = False, -# ) -> dp.Stream: ... - - -import pyarrow as pa -import pyarrow.compute as pc -import pyarrow.dataset as ds -import polars as pl -from pathlib import Path -from typing import Any -import logging -from deltalake import DeltaTable, write_deltalake -from deltalake.exceptions import TableNotFoundError -from collections import defaultdict -from orcapod.data.datagrams import ArrowDatagram, SemanticTypeRegistry -from orcapod.data.streams import ImmutableTableStream -from orcapod.hashing import get_default_arrow_hasher -from orcapod.hashing.types import ArrowHasher -from orcapod.protocols import data_protocols as dp -from orcapod.types import default_registry - - -# Module-level logger -logger = logging.getLogger(__name__) - - -class DeltaTableArrowStore: - """ - Delta Table-based Arrow data store with flexible hierarchical path support and schema preservation. - - Uses tuple-based source paths for robust parameter handling: - - ("source_name", "source_id") -> source_name/source_id/ - - ("org", "project", "dataset") -> org/project/dataset/ - - ("year", "month", "day", "experiment") -> year/month/day/experiment/ - """ - - def __init__( - self, - base_path: str | Path, - duplicate_entry_behavior: str = "error", - create_base_path: bool = True, - max_hierarchy_depth: int = 10, - batch_size: int = 100, - ): - """ - Initialize the DeltaTableArrowDataStore. 
- - Args: - base_path: Base directory path where Delta tables will be stored - duplicate_entry_behavior: How to handle duplicate entry_ids: - - 'error': Raise ValueError when entry_id already exists - - 'overwrite': Replace existing entry with new data - create_base_path: Whether to create the base path if it doesn't exist - max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) - batch_size: Number of records to batch before writing to Delta table - auto_flush_interval: Time in seconds to auto-flush pending batches (0 to disable) - """ - # Validate duplicate behavior - if duplicate_entry_behavior not in ["error", "overwrite"]: - raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") - - self.duplicate_entry_behavior = duplicate_entry_behavior - self.base_path = Path(base_path) - self.max_hierarchy_depth = max_hierarchy_depth - self.batch_size = batch_size - - if create_base_path: - self.base_path.mkdir(parents=True, exist_ok=True) - elif not self.base_path.exists(): - raise ValueError( - f"Base path {self.base_path} does not exist and create_base_path=False" - ) - - # Cache for Delta tables to avoid repeated initialization - self._delta_table_cache: dict[str, DeltaTable] = {} - - # Batch management - self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) - - logger.info( - f"Initialized DeltaTableArrowDataStore at {self.base_path} " - f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " - f"batch_size={batch_size}, as" - ) - - def flush(self) -> None: - """ - Flush all pending batches immediately. - - This method is called to ensure all pending data is written to the Delta tables. - """ - try: - self.flush_all_batches() - except Exception as e: - logger.error(f"Error during flush: {e}") - - def flush_batch(self, source_path: tuple[str, ...]) -> None: - """ - Flush pending batch for a specific source path. 
- - Args: - source_path: Tuple of path components - """ - logger.debug("Flushing triggered!!") - source_key = self._get_source_key(source_path) - - if ( - source_key not in self._pending_batches - or not self._pending_batches[source_key] - ): - return - - # Get all pending records - pending_tables = self._pending_batches[source_key] - self._pending_batches[source_key] = {} - - try: - # Combine all tables in the batch - combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() - - table_path = self._get_table_path(source_path) - table_path.mkdir(parents=True, exist_ok=True) - - # Check if table exists - delta_table = self._get_existing_delta_table(source_path) - - if delta_table is None: - # TODO: reconsider mode="overwrite" here - write_deltalake( - table_path, - combined_table, - mode="overwrite", - ) - logger.debug( - f"Created new Delta table for {source_key} with {len(combined_table)} records" - ) - else: - if self.duplicate_entry_behavior == "overwrite": - # Get entry IDs from the batch - entry_ids = combined_table.column("__entry_id").to_pylist() - unique_entry_ids = list(set(entry_ids)) - - # Delete existing records with these IDs - if unique_entry_ids: - entry_ids_str = "', '".join(unique_entry_ids) - delete_predicate = f"__entry_id IN ('{entry_ids_str}')" - try: - delta_table.delete(delete_predicate) - logger.debug( - f"Deleted {len(unique_entry_ids)} existing records from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing records to delete from {source_key}: {e}" - ) - - # otherwise, only insert if same entry_id does not exist yet - delta_table.merge( - source=combined_table, - predicate="target.__entry_id = source.__entry_id", - source_alias="source", - target_alias="target", - ).when_not_matched_insert_all().execute() - - logger.debug( - f"Appended batch of {len(combined_table)} records to {source_key}" - ) - - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - # Put the tables back in the pending queue - self._pending_batches[source_key] = pending_tables - raise - - def flush_all_batches(self) -> None: - """Flush all pending batches.""" - source_keys = list(self._pending_batches.keys()) - - # TODO: capture and re-raise exceptions at the end - for source_key in source_keys: - source_path = tuple(source_key.split("/")) - try: - self.flush_batch(source_path) - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - - def __del__(self): - """Cleanup when object is destroyed.""" - self.flush() - - def _validate_source_path(self, source_path: tuple[str, ...]) -> None: - # TODO: consider removing this as path creation can be tried directly - """ - Validate source path components. 
- - Args: - source_path: Tuple of path components - - Raises: - ValueError: If path is invalid - """ - if not source_path: - raise ValueError("Source path cannot be empty") - - if len(source_path) > self.max_hierarchy_depth: - raise ValueError( - f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" - ) - - # Validate path components - for i, component in enumerate(source_path): - if not component or not isinstance(component, str): - raise ValueError( - f"Source path component {i} is invalid: {repr(component)}" - ) - - # Check for filesystem-unsafe characters - unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] - if any(char in component for char in unsafe_chars): - raise ValueError( - f"Source path component contains invalid characters: {repr(component)}" - ) - - def _get_source_key(self, source_path: tuple[str, ...]) -> str: - """Generate cache key for source storage.""" - return "/".join(source_path) - - def _get_table_path(self, source_path: tuple[str, ...]) -> Path: - """Get the filesystem path for a given source path.""" - path = self.base_path - for subpath in source_path: - path = path / subpath - return path - - def _get_existing_delta_table( - self, source_path: tuple[str, ...] - ) -> DeltaTable | None: - """ - Get or create a Delta table, handling schema initialization properly. - - Args: - source_path: Tuple of path components - - Returns: - DeltaTable instance or None if table doesn't exist - """ - source_key = self._get_source_key(source_path) - table_path = self._get_table_path(source_path) - - # Check cache first - if dt := self._delta_table_cache.get(source_key): - return dt - - try: - # Try to load existing table - delta_table = DeltaTable(str(table_path)) - self._delta_table_cache[source_key] = delta_table - logger.debug(f"Loaded existing Delta table for {source_key}") - return delta_table - except TableNotFoundError: - # Table doesn't exist - return None - except Exception as e: - logger.error(f"Error loading Delta table for {source_key}: {e}") - # Try to clear any corrupted cache and retry once - if source_key in self._delta_table_cache: - del self._delta_table_cache[source_key] - return None - - def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: - """Ensure the table has an __entry_id column.""" - if "__entry_id" not in arrow_data.column_names: - # Add entry_id column at the beginning - key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) - arrow_data = arrow_data.add_column(0, "__entry_id", key_array) - return arrow_data - - def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: - """Remove the __entry_id column if it exists.""" - if "__entry_id" in arrow_data.column_names: - column_names = arrow_data.column_names - indices_to_keep = [ - i for i, name in enumerate(column_names) if name != "__entry_id" - ] - arrow_data = arrow_data.select(indices_to_keep) - return arrow_data - - def _handle_entry_id_column( - self, arrow_data: pa.Table, add_entry_id_column: bool | str = False - ) -> pa.Table: - """ - Handle entry_id column based on add_entry_id_column parameter. 
- - Args: - arrow_data: Arrow table with __entry_id column - add_entry_id_column: Control entry ID column inclusion: - - False: Remove __entry_id column - - True: Keep __entry_id column as is - - str: Rename __entry_id column to custom name - """ - if add_entry_id_column is False: - # Remove the __entry_id column - return self._remove_entry_id_column(arrow_data) - elif isinstance(add_entry_id_column, str): - # Rename __entry_id to custom name - if "__entry_id" in arrow_data.column_names: - schema = arrow_data.schema - new_names = [ - add_entry_id_column if name == "__entry_id" else name - for name in schema.names - ] - return arrow_data.rename_columns(new_names) - # If add_entry_id_column is True, keep __entry_id as is - return arrow_data - - def _create_entry_id_filter(self, entry_id: str) -> list: - """ - Create a proper filter expression for Delta Lake. - - Args: - entry_id: The entry ID to filter by - - Returns: - List containing the filter expression for Delta Lake - """ - return [("__entry_id", "=", entry_id)] - - def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: - """ - Create a proper filter expression for multiple entry IDs. - - Args: - entry_ids: List of entry IDs to filter by - - Returns: - List containing the filter expression for Delta Lake - """ - return [("__entry_id", "in", entry_ids)] - - def _read_table_with_filter( - self, - delta_table: DeltaTable, - filters: list | None = None, - ) -> pa.Table: - """ - Read table using to_pyarrow_dataset with original schema preservation. - - Args: - delta_table: The Delta table to read from - filters: Optional filters to apply - - Returns: - Arrow table with preserved schema - """ - # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading - dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) - if filters: - # Apply filters at dataset level for better performance - import pyarrow.compute as pc - - filter_expr = None - for filt in filters: - if len(filt) == 3: - col, op, val = filt - if op == "=": - expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore - elif op == "in": - expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore - else: - logger.warning( - f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." - ) - # Fallback to table-level filtering - return dataset.to_table()(filters=filters) - - if filter_expr is None: - filter_expr = expr - else: - filter_expr = pc.and_(filter_expr, expr) # type: ignore - - if filter_expr is not None: - return dataset.to_table(filter=filter_expr) - - return dataset.to_table() - - def record_data( - self, - record_path: tuple[str, ...], - entry_id: str, - data: pa.Table, - force_flush: bool = False, - error_on_duplicate: bool | None = None, - ) -> pa.Table: - self._validate_source_path(record_path) - source_key = self._get_source_key(record_path) - - # Check for existing entry - if error_on_duplicate is None: - error_on_duplicate = self.duplicate_entry_behavior == "error" - if error_on_duplicate: - pending_table = self._pending_batches[source_key].get(entry_id, None) - if pending_table is not None: - raise ValueError( - f"Entry '{entry_id}' already exists in pending batch for {source_key}. " - f"Use duplicate_entry_behavior='overwrite' to allow updates." - ) - existing_record = self.get_recorded_data(record_path, entry_id, flush=False) - if existing_record is not None: - raise ValueError( - f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. 
" - f"Use duplicate_entry_behavior='overwrite' to allow updates." - ) - - # Add entry_id column to the data - data_with_entry_id = self._ensure_entry_id_column(data, entry_id) - - if force_flush: - # Write immediately - table_path = self._get_table_path(record_path) - table_path.mkdir(parents=True, exist_ok=True) - - delta_table = self._get_existing_delta_table(record_path) - - if delta_table is None: - # Create new table - save original schema first - write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") - logger.debug(f"Created new Delta table for {source_key}") - else: - if self.duplicate_entry_behavior == "overwrite": - try: - delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" - ) - logger.debug( - f"Deleted existing record {entry_id} from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing record to delete for {entry_id}: {e}" - ) - - write_deltalake( - table_path, - data_with_entry_id, - mode="append", - schema_mode="merge", - ) - - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - else: - # Add to the batch for later flushing - self._pending_batches[source_key][entry_id] = data_with_entry_id - batch_size = len(self._pending_batches[source_key]) - - # Check if we need to flush - if batch_size >= self.batch_size: - self.flush_batch(record_path) - - logger.debug(f"Added record {entry_id} to {source_key}") - return data - - def get_recorded_data( - self, - record_path: tuple[str, ...], - entry_id: str, - flush: bool = False, - ) -> pa.Table | None: - """ - Get a specific record by entry_id with schema preservation. - - Args: - source_path: Tuple of path components - entry_id: Unique identifier for the record - - Returns: - Arrow table for the record or None if not found - """ - - if flush: - self.flush_batch(record_path) - self._validate_source_path(record_path) - - # check if entry_id is found in pending batches - source_key = self._get_source_key(record_path) - if entry_id in self._pending_batches[source_key]: - # Return the pending record directly - return self._pending_batches[source_key][entry_id] - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is None: - return None - - try: - # Use schema-preserving read - filter_expr = self._create_entry_id_filter(entry_id) - result = self._read_table_with_filter(delta_table, filters=filter_expr) - - if len(result) == 0: - return None - - # Remove the __entry_id column before returning - return self._remove_entry_id_column(result) - - except Exception as e: - logger.error( - f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" - ) - raise e - - def get_all_records( - self, - record_path: tuple[str, ...], - add_entry_id_column: bool | str = False, - retrieve_pending: bool = True, - flush: bool = False, - ) -> pa.Table | None: - """ - Retrieve all records for a given source path as a single table with schema preservation. - - Args: - source_path: Tuple of path components - add_entry_id_column: Control entry ID column inclusion: - - False: Don't include entry ID column (default) - - True: Include entry ID column as "__entry_id" - - str: Include entry ID column with custom name - - Returns: - Arrow table containing all records with original schema, or None if no records found - """ - # TODO: this currently reads everything into memory and then return. 
Consider implementation that performs everything lazily - - if flush: - self.flush_batch(record_path) - self._validate_source_path(record_path) - - collected_tables = [] - if retrieve_pending: - # Check if there are pending records in the batch - for entry_id, arrow_table in self._pending_batches[ - self._get_source_key(record_path) - ].items(): - collected_tables.append( - self._ensure_entry_id_column(arrow_table, entry_id) - ) - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is not None: - try: - # Use filter-based read - result = self._read_table_with_filter(delta_table) - - if len(result) != 0: - collected_tables.append(result) - - except Exception as e: - logger.error( - f"Error getting all records from {'/'.join(record_path)}: {e}" - ) - if collected_tables: - total_table = pa.concat_tables(collected_tables) - - # Handle entry_id column based on parameter - return self._handle_entry_id_column(total_table, add_entry_id_column) - - return None - - # def get_all_records_as_polars( - # self, source_path: tuple[str, ...], flush: bool = True - # ) -> pl.LazyFrame | None: - # """ - # Retrieve all records for a given source path as a single Polars LazyFrame. - - # Args: - # source_path: Tuple of path components - - # Returns: - # Polars LazyFrame containing all records, or None if no records found - # """ - # all_records = self.get_all_records(source_path, flush=flush) - # if all_records is None: - # return None - # # TODO: take care of converting semantics to Python objects - # return pl.LazyFrame(all_records.as_table()) - - def get_records_by_ids( - self, - source_path: tuple[str, ...], - entry_ids: list[str] | pl.Series | pa.Array, - add_entry_id_column: bool | str = False, - preserve_input_order: bool = False, - flush: bool = False, - ) -> pa.Table | None: - """ - Retrieve records by entry IDs as a single table with schema preservation. 
- - Args: - source_path: Tuple of path components - entry_ids: Entry IDs to retrieve - add_entry_id_column: Control entry ID column inclusion - preserve_input_order: If True, return results in input order with nulls for missing - - Returns: - Arrow table containing all found records with original schema, or None if no records found - """ - - if flush: - self.flush_batch(source_path) - - self._validate_source_path(source_path) - - # Convert input to list of strings for consistency - if isinstance(entry_ids, list): - if not entry_ids: - return None - entry_ids_list = entry_ids - elif isinstance(entry_ids, pl.Series): - if len(entry_ids) == 0: - return None - entry_ids_list = entry_ids.to_list() - elif isinstance(entry_ids, pa.Array): - if len(entry_ids) == 0: - return None - entry_ids_list = entry_ids.to_pylist() - else: - raise TypeError( - f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" - ) - - delta_table = self._get_existing_delta_table(source_path) - if delta_table is None: - return None - - try: - # Use schema-preserving read with filters - filter_expr = self._create_entry_ids_filter(entry_ids_list) - result = self._read_table_with_filter(delta_table, filters=filter_expr) - - if len(result) == 0: - return None - - if preserve_input_order: - raise NotImplementedError("Preserve input order is not yet implemented") - # Need to reorder results and add nulls for missing entries - import pandas as pd - - df = result.to_pandas() - df = df.set_index("__entry_id") - - # Create a DataFrame with the desired order, filling missing with NaN - ordered_df = df.reindex(entry_ids_list) - - # Convert back to Arrow - result = pa.Table.from_pandas(ordered_df.reset_index()) - - # Handle entry_id column based on parameter - return self._handle_entry_id_column(result, add_entry_id_column) - - except Exception as e: - logger.error( - f"Error getting records by IDs from {'/'.join(source_path)}: {e}" - ) - return None - - # def get_records_by_ids_as_polars( - # self, - # source_path: tuple[str, ...], - # entry_ids: list[str] | pl.Series | pa.Array, - # add_entry_id_column: bool | str = False, - # preserve_input_order: bool = False, - # flush: bool = False, - # ) -> pl.LazyFrame | None: - # """ - # Retrieve records by entry IDs as a single Polars LazyFrame. - - # Args: - # source_path: Tuple of path components - # entry_ids: Entry IDs to retrieve - # add_entry_id_column: Control entry ID column inclusion - # preserve_input_order: If True, return results in input order with nulls for missing - - # Returns: - # Polars LazyFrame containing all found records, or None if no records found - # """ - # arrow_result = self.get_records_by_ids( - # source_path, - # entry_ids, - # add_entry_id_column, - # preserve_input_order, - # flush=flush, - # ) - - # if arrow_result is None: - # return None - - # # Convert to Polars LazyFrame - # return pl.LazyFrame(arrow_result) - - # Additional utility methods - def list_sources(self) -> list[tuple[str, ...]]: - """ - List all available source paths. 
- - Returns: - List of source path tuples - """ - sources = [] - - def _scan_directory(current_path: Path, path_components: tuple[str, ...]): - """Recursively scan for Delta tables.""" - for item in current_path.iterdir(): - if not item.is_dir(): - continue - - new_path_components = path_components + (item.name,) - - # Check if this directory contains a Delta table - try: - DeltaTable(str(item)) - sources.append(new_path_components) - except TableNotFoundError: - # Not a Delta table, continue scanning subdirectories - if len(new_path_components) < self.max_hierarchy_depth: - _scan_directory(item, new_path_components) - - _scan_directory(self.base_path, ()) - return sources - - def delete_source(self, source_path: tuple[str, ...]) -> bool: - """ - Delete an entire source (all records for a source path). - - Args: - source_path: Tuple of path components - - Returns: - True if source was deleted, False if it didn't exist - """ - self._validate_source_path(source_path) - - # Flush any pending batches first - self.flush_batch(source_path) - - table_path = self._get_table_path(source_path) - source_key = self._get_source_key(source_path) - - if not table_path.exists(): - return False - - try: - # Remove from caches - if source_key in self._delta_table_cache: - del self._delta_table_cache[source_key] - - # Remove directory - import shutil - - shutil.rmtree(table_path) - - logger.info(f"Deleted source {source_key}") - return True - - except Exception as e: - logger.error(f"Error deleting source {source_key}: {e}") - return False - - def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: - """ - Delete a specific record. - - Args: - source_path: Tuple of path components - entry_id: ID of the record to delete - - Returns: - True if record was deleted, False if it didn't exist - """ - self._validate_source_path(source_path) - - # Flush any pending batches first - self.flush_batch(source_path) - - delta_table = self._get_existing_delta_table(source_path) - if delta_table is None: - return False - - try: - # Check if record exists using proper filter - filter_expr = self._create_entry_id_filter(entry_id) - existing = self._read_table_with_filter(delta_table, filters=filter_expr) - if len(existing) == 0: - return False - - # Delete the record using SQL-style predicate (this is correct for delete operations) - delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" - ) - - # Update cache - source_key = self._get_source_key(source_path) - self._delta_table_cache[source_key] = delta_table - - logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") - return True - - except Exception as e: - logger.error( - f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" - ) - return False - - def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: - """ - Get metadata information about a Delta table. 
- - Args: - source_path: Tuple of path components - - Returns: - Dictionary with table metadata, or None if table doesn't exist - """ - self._validate_source_path(source_path) - - delta_table = self._get_existing_delta_table(source_path) - if delta_table is None: - return None - - try: - # Get basic info - schema = delta_table.schema() - history = delta_table.history() - source_key = self._get_source_key(source_path) - - # Add pending batch info - pending_info = self.get_pending_batch_info() - pending_count = pending_info.get(source_key, 0) - - return { - "path": str(self._get_table_path(source_path)), - "source_path": source_path, - "schema": schema, - "version": delta_table.version(), - "num_files": len(delta_table.files()), - "history_length": len(history), - "latest_commit": history[0] if history else None, - "pending_records": pending_count, - } - - except Exception as e: - logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") - return None diff --git a/src/orcapod/data/old_datagrams.py b/src/orcapod/data/old_datagrams.py deleted file mode 100644 index a0386c8..0000000 --- a/src/orcapod/data/old_datagrams.py +++ /dev/null @@ -1,2281 +0,0 @@ -""" -Data structures and utilities for working with datagrams in OrcaPod. - -This module provides classes and functions for handling packet-like data structures -that can represent data in various formats (Python dicts, Arrow tables, etc.) while -maintaining type information, source metadata, and semantic type conversion capability. - -Key classes: -- SemanticConverter: Converts between different data representations. Intended for internal use. -- DictDatagram: Immutable dict-based data structure -- PythonDictPacket: Python dict-based packet with source info -- ArrowPacket: Arrow table-based packet implementation -- PythonDictTag/ArrowTag: Tag implementations for data identification - -The module also provides utilities for schema validation, table operations, -and type conversions between semantic stores, Python stores, and Arrow tables. -""" - -from hmac import new -import logging -from abc import ABC, abstractmethod -from collections.abc import Collection, Iterator, Mapping -from types import new_class -from typing import Self, TypeAlias, cast - -from matplotlib.pyplot import arrow -import pyarrow as pa - -from orcapod.data.system_constants import orcapod_constants as constants -from orcapod.data.context import ( - DataContext, -) -from orcapod.protocols import data_protocols as dp -from orcapod.protocols import hashing_protocols as hp -from orcapod.types import TypeSpec, schemas, typespec_utils -from orcapod.types import typespec_utils as tsutils -from orcapod.types.core import DataValue -from orcapod.types.semantic_converter import SemanticConverter -from orcapod.utils import arrow_utils - -logger = logging.getLogger(__name__) - -# A conveniece packet-like type that defines a value that can be -# converted to a packet. It's broader than Packet and a simple mapping -# from string keys to DataValue (e.g., int, float, str) can be regarded -# as PacketLike, allowing for more flexible interfaces. -# Anything that requires Packet-like data but without the strict features -# of a Packet should accept PacketLike. -# One should be careful when using PacketLike as a return type as it does not -# enforce the typespec or source_info, which are important for packet integrity. 
-PacketLike: TypeAlias = Mapping[str, DataValue] - -PythonStore: TypeAlias = Mapping[str, DataValue] - - -class ImmutableDict(Mapping[str, DataValue]): - """ - An immutable dictionary-like container for DataValues. - - Provides a read-only view of a dictionary mapping strings to DataValues, - implementing the Mapping protocol for compatibility with dict-like operations. - - Initialize with data from a mapping. - Args: - data: Source mapping to copy data from - """ - - def __init__(self, data: Mapping[str, DataValue]): - self._data = dict(data) - - def __getitem__(self, key: str) -> DataValue: - return self._data[key] - - def __iter__(self): - return iter(self._data) - - def __len__(self) -> int: - return len(self._data) - - def __repr__(self) -> str: - return self._data.__repr__() - - def __str__(self) -> str: - return self._data.__str__() - - def __or__(self, other: Mapping[str, DataValue]) -> Self: - """ - Create a new ImmutableDict by merging with another mapping. - - Args: - other: Another mapping to merge with - - Returns: - A new ImmutableDict containing the combined data - """ - return self.__class__(self._data | dict(other)) - - -def contains_prefix_from(column: str, prefixes: Collection[str]) -> bool: - """ - Check if a column name matches any of the given prefixes. - - Args: - column: Column name to check - prefixes: Collection of prefixes to match against - - Returns: - True if the column starts with any of the prefixes, False otherwise - """ - for prefix in prefixes: - if column.startswith(prefix): - return True - return False - - -class BaseDatagram(ABC): - """ - Abstract base class for immutable datagram implementations. - - Provides shared functionality and enforces consistent interface across - different storage backends (dict, Arrow table, etc.). Concrete subclasses - must implement the abstract methods to handle their specific storage format. - - The base class only manages the data context key string - how that key - is interpreted and used is left to concrete implementations. - """ - - def __init__(self, data_context: DataContext | str | None = None) -> None: - """ - Initialize base datagram with data context. - - Args: - data_context: Context for semantic interpretation. Can be a string key - or a DataContext object, or None for default. - """ - self._data_context = DataContext.resolve_data_context(data_context) - - # 1. Core Properties (Identity & Structure) - @property - def data_context_key(self) -> str: - """Return the data context key.""" - return self._data_context.context_key - - @property - @abstractmethod - def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names.""" - ... - - # 2. Dict-like Interface (Data Access) - @abstractmethod - def __getitem__(self, key: str) -> DataValue: - """Get data column value by key.""" - ... - - @abstractmethod - def __contains__(self, key: str) -> bool: - """Check if data column exists.""" - ... - - @abstractmethod - def __iter__(self) -> Iterator[str]: - """Iterate over data column names.""" - ... - - @abstractmethod - def get(self, key: str, default: DataValue = None) -> DataValue: - """Get data column value with default.""" - ... - - # 3. Structural Information - @abstractmethod - def keys( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> tuple[str, ...]: - """Return tuple of column names.""" - ... 
- - @abstractmethod - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> TypeSpec: - """Return type specification for the datagram.""" - ... - - @abstractmethod - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Schema: - """Return the PyArrow schema for this datagram.""" - ... - - @abstractmethod - def content_hash(self) -> str: - """Calculate and return content hash of the datagram.""" - ... - - # 4. Format Conversions (Export) - @abstractmethod - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, DataValue]: - """Return dictionary representation of the datagram.""" - ... - - @abstractmethod - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Table: - """Convert the datagram to an Arrow table.""" - ... - - # 5. Meta Column Operations - @abstractmethod - def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: - """Get a meta column value.""" - ... - - @abstractmethod - def with_meta_columns(self, **updates: DataValue) -> Self: - """Create new datagram with updated meta columns.""" - ... - - @abstractmethod - def drop_meta_columns(self, *keys: str) -> Self: - """Create new datagram with specified meta columns removed.""" - ... - - # 6. Data Column Operations - @abstractmethod - def select(self, *column_names: str) -> Self: - """Create new datagram with only specified data columns.""" - ... - - @abstractmethod - def drop(self, *column_names: str) -> Self: - """Create new datagram with specified data columns removed.""" - ... - - @abstractmethod - def rename(self, column_mapping: Mapping[str, str]) -> Self: - """Create new datagram with data columns renamed.""" - ... - - @abstractmethod - def update(self, **updates: DataValue) -> Self: - """Create new datagram with existing column values updated.""" - ... - - @abstractmethod - def with_columns( - self, - column_types: Mapping[str, type] | None = None, - **updates: DataValue, - ) -> Self: - """Create new datagram with additional data columns.""" - ... - - # 7. Context Operations - @abstractmethod - def with_context_key(self, new_context_key: str) -> Self: - """Create new datagram with different data context.""" - ... - - # 8. Utility Operations - @abstractmethod - def copy(self) -> Self: - """Create a shallow copy of the datagram.""" - ... - - -class DictDatagram(BaseDatagram): - """ - Immutable datagram implementation using dictionary as storage backend. - - This implementation uses composition (not inheritance from Mapping) to maintain - control over the interface while leveraging dictionary efficiency for data access. - Provides clean separation between data, meta, and context components. - - The underlying data is split into separate components: - - Data dict: Primary business data columns - - Meta dict: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes - - Context: Data context information with {orcapod.CONTEXT_KEY} - - Future Packet subclass will also handle: - - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes - - When exposing to external tools, semantic types are encoded as - `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). - - All operations return new instances, preserving immutability. - - Example: - >>> data = {{ - ... "user_id": 123, - ... "name": "Alice", - ... 
"__pipeline_version": "v2.1.0", - ... "{orcapod.CONTEXT_KEY}": "financial_v1" - ... }} - >>> datagram = DictDatagram(data) - >>> updated = datagram.update(name="Alice Smith") - """ - - def __init__( - self, - data: Mapping[str, DataValue], - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - """ - Initialize DictDatagram from dictionary data. - - Args: - data: Source data mapping containing all column data. - typespec: Optional type specification for fields. - semantic_converter: Optional converter for semantic type handling. - If None, will be created based on data context and inferred types. - data_context: Data context for semantic type resolution. - If None and data contains context column, will extract from data. - - Note: - The input data is automatically split into data, meta, and context - components based on column naming conventions. - """ - # Parse through data and extract different column types - data_columns = {} - meta_columns = {} - extracted_context = None - - for k, v in data.items(): - if k == constants.CONTEXT_KEY: - # Extract data context but keep it separate from meta data - if data_context is None: - extracted_context = v - # Don't store context in meta_data - it's managed separately - elif k.startswith(constants.META_PREFIX): - # Double underscore = meta metadata - meta_columns[k] = v - else: - # Everything else = user data (including _source_ and semantic types) - data_columns[k] = v - - # Initialize base class with data context - final_context = data_context or cast(str, extracted_context) - super().__init__(final_context) - - # Store data and meta components separately (immutable) - self._data = dict(data_columns) - self._meta_data = dict(meta_columns) - - # Combine provided typespec info with inferred typespec from content - # If the column value is None and no type spec is provided, defaults to str. - self._data_python_schema = schemas.PythonSchema( - tsutils.get_typespec_from_dict( - self._data, - typespec, - ) - ) - - # Create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - self._data_python_schema.to_semantic_schema( - semantic_type_registry=self._data_context.semantic_type_registry - ), - ) - self.semantic_converter = semantic_converter - - # Create schema for meta data - self._meta_python_schema = schemas.PythonSchema( - tsutils.get_typespec_from_dict( - self._meta_data, - typespec=typespec, - ) - ) - - # Initialize caches - self._cached_data_table: pa.Table | None = None - self._cached_meta_table: pa.Table | None = None - self._cached_content_hash: str | None = None - self._cached_data_arrow_schema: pa.Schema | None = None - self._cached_meta_arrow_schema: pa.Schema | None = None - - # 1. Core Properties (Identity & Structure) - @property - def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names.""" - return tuple(self._meta_data.keys()) - - # 2. 
Dict-like Interface (Data Access) - def __getitem__(self, key: str) -> DataValue: - """Get data column value by key.""" - if key not in self._data: - raise KeyError(f"Data column '{key}' not found") - return self._data[key] - - def __contains__(self, key: str) -> bool: - """Check if data column exists.""" - return key in self._data - - def __iter__(self) -> Iterator[str]: - """Iterate over data column names.""" - return iter(self._data) - - def get(self, key: str, default: DataValue = None) -> DataValue: - """Get data column value with default.""" - return self._data.get(key, default) - - # 3. Structural Information - def keys( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> tuple[str, ...]: - """Return tuple of column names.""" - # Start with data columns - result_keys = list(self._data.keys()) - - # Add context if requested - if include_context: - result_keys.append(constants.CONTEXT_KEY) - - # Add meta columns if requested - if include_meta_columns: - if include_meta_columns is True: - result_keys.extend(self.meta_columns) - elif isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - filtered_meta_cols = [ - col - for col in self.meta_columns - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - result_keys.extend(filtered_meta_cols) - - return tuple(result_keys) - - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> schemas.PythonSchema: - """ - Return Python schema for the datagram. - - Args: - include_meta_columns: Whether to include meta column types. - - True: include all meta column types - - Collection[str]: include meta column types matching these prefixes - - False: exclude meta column types - include_context: Whether to include context type - - Returns: - Python schema - """ - # Start with data schema - schema = dict(self._data_python_schema) - - # Add context if requested - if include_context: - schema[constants.CONTEXT_KEY] = str - - # Add meta schema if requested - if include_meta_columns and self._meta_data: - if include_meta_columns is True: - schema.update(self._meta_python_schema) - elif isinstance(include_meta_columns, Collection): - filtered_meta_schema = { - k: v - for k, v in self._meta_python_schema.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - schema.update(filtered_meta_schema) - - return schemas.PythonSchema(schema) - - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_meta_columns: Whether to include meta columns in the schema. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - # Build data schema (cached) - if self._cached_data_arrow_schema is None: - self._cached_data_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self._data_python_schema - ) - ) - - all_schemas = [self._cached_data_arrow_schema] - - # Add context schema if requested - if include_context: - context_schema = pa.schema([pa.field(constants.CONTEXT_KEY, pa.string())]) - all_schemas.append(context_schema) - - # Add meta schema if requested - if include_meta_columns and self._meta_data: - if self._cached_meta_arrow_schema is None: - self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self._meta_python_schema - ) - ) - - assert self._cached_meta_arrow_schema is not None, ( - "Meta Arrow schema should be initialized by now" - ) - - if include_meta_columns is True: - meta_schema = self._cached_meta_arrow_schema - elif isinstance(include_meta_columns, Collection): - # Filter meta schema by prefix matching - matched_fields = [ - field - for field in self._cached_meta_arrow_schema - if any( - field.name.startswith(prefix) for prefix in include_meta_columns - ) - ] - if matched_fields: - meta_schema = pa.schema(matched_fields) - else: - meta_schema = None - else: - meta_schema = None - - if meta_schema is not None: - all_schemas.append(meta_schema) - - return arrow_utils.join_arrow_schemas(*all_schemas) - - def content_hash(self) -> str: - """ - Calculate and return content hash of the datagram. - Only includes data columns, not meta columns or context. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_meta_columns=False, include_context=False), - prefix_hasher_id=True, - ) - return self._cached_content_hash - - # 4. Format Conversions (Export) - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, DataValue]: - """ - Return dictionary representation of the datagram. - - Args: - include_meta_columns: Whether to include meta columns. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context key - - Returns: - Dictionary representation - """ - result_dict = dict(self._data) # Start with user data - - # Add context if requested - if include_context: - result_dict[constants.CONTEXT_KEY] = self._data_context.context_key - - # Add meta columns if requested - if include_meta_columns and self._meta_data: - if include_meta_columns is True: - # Include all meta columns - result_dict.update(self._meta_data) - elif isinstance(include_meta_columns, Collection): - # Include only meta columns matching prefixes - filtered_meta_data = { - k: v - for k, v in self._meta_data.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - result_dict.update(filtered_meta_data) - - return result_dict - - def _get_meta_arrow_table(self) -> pa.Table: - if self._cached_meta_table is None: - arrow_schema = self._get_meta_arrow_schema() - self._cached_meta_table = pa.Table.from_pylist( - [self._meta_data], - schema=arrow_schema, - ) - assert self._cached_meta_table is not None, ( - "Meta Arrow table should be initialized by now" - ) - return self._cached_meta_table - - def _get_meta_arrow_schema(self) -> pa.Schema: - if self._cached_meta_arrow_schema is None: - self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self._meta_python_schema - ) - ) - assert self._cached_meta_arrow_schema is not None, ( - "Meta Arrow schema should be initialized by now" - ) - return self._cached_meta_arrow_schema - - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Table: - """ - Convert the datagram to an Arrow table. - - Args: - include_meta_columns: Whether to include meta columns. - - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include the context column - - Returns: - Arrow table representation - """ - # Build data table (cached) - if self._cached_data_table is None: - self._cached_data_table = self.semantic_converter.from_python_to_arrow( - self._data, - self._data_python_schema, - ) - assert self._cached_data_table is not None, ( - "Data Arrow table should be initialized by now" - ) - result_table = self._cached_data_table - - # Add context if requested - if include_context: - result_table = result_table.append_column( - constants.CONTEXT_KEY, - pa.array([self._data_context.context_key], type=pa.large_string()), - ) - - # Add meta columns if requested - meta_table = None - if include_meta_columns and self._meta_data: - meta_table = self._get_meta_arrow_table() - # Select appropriate meta columns - if isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - matched_cols = [ - col - for col in self._meta_data.keys() - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - if matched_cols: - meta_table = meta_table.select(matched_cols) - else: - meta_table = None - - # Combine tables if we have meta columns to add - if meta_table is not None: - result_table = arrow_utils.hstack_tables(result_table, meta_table) - - return result_table - - # 5. Meta Column Operations - def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: - """ - Get meta column value with optional default. 
- - Args: - key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). - default: Value to return if meta column doesn't exist. - - Returns: - Meta column value if exists, otherwise the default value. - """ - # Handle both prefixed and unprefixed keys - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - - return self._meta_data.get(key, default) - - def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": - """ - Create a new DictDatagram with updated meta columns. - Maintains immutability by returning a new instance. - - Args: - **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) - - Returns: - New DictDatagram instance - """ - # Prefix the keys and prepare updates - prefixed_updates = {} - for k, v in meta_updates.items(): - if not k.startswith(constants.META_PREFIX): - k = constants.META_PREFIX + k - prefixed_updates[k] = v - - # Start with existing meta data - new_meta_data = dict(self._meta_data) - new_meta_data.update(prefixed_updates) - - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(new_meta_data) # Meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def drop_meta_columns( - self, *keys: str, ignore_missing: bool = False - ) -> "DictDatagram": - """ - Create a new DictDatagram with specified meta columns dropped. - Maintains immutability by returning a new instance. - - Args: - *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) - ignore_missing: If True, ignore missing meta columns without raising an error. - - Raises: - KeyError: If any specified meta column to drop doesn't exist and ignore_missing=False. - - Returns: - New DictDatagram instance without specified meta columns - """ - # Normalize keys to have prefixes - prefixed_keys = set() - for key in keys: - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - prefixed_keys.add(key) - - missing_keys = prefixed_keys - set(self._meta_data.keys()) - if missing_keys and not ignore_missing: - raise KeyError( - f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" - ) - - # Filter out specified meta columns - new_meta_data = { - k: v for k, v in self._meta_data.items() if k not in prefixed_keys - } - - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(new_meta_data) # Filtered meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - # 6. Data Column Operations - def select(self, *column_names: str) -> "DictDatagram": - """ - Create a new DictDatagram with only specified data columns. - Maintains immutability by returning a new instance. 
- - Args: - *column_names: Data column names to keep - - Returns: - New DictDatagram instance with only specified data columns - """ - # Validate columns exist - missing_cols = set(column_names) - set(self._data.keys()) - if missing_cols: - raise KeyError(f"Columns not found: {missing_cols}") - - # Keep only specified data columns - new_data = {k: v for k, v in self._data.items() if k in column_names} - - # Reconstruct full data dict for new instance - full_data = new_data # Selected user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagram": - """ - Create a new DictDatagram with specified data columns dropped. - Maintains immutability by returning a new instance. - - Args: - *column_names: Data column names to drop - - Returns: - New DictDatagram instance without specified data columns - """ - # Filter out specified data columns - missing = set(column_names) - set(self._data.keys()) - if missing and not ignore_missing: - raise KeyError( - f"Following columns do not exist and cannot be dropped: {sorted(missing)}" - ) - - new_data = {k: v for k, v in self._data.items() if k not in column_names} - - if not new_data: - raise ValueError("Cannot drop all data columns") - - # Reconstruct full data dict for new instance - full_data = new_data # Filtered user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": - """ - Create a new DictDatagram with data columns renamed. - Maintains immutability by returning a new instance. - - Args: - column_mapping: Mapping from old column names to new column names - - Returns: - New DictDatagram instance with renamed data columns - """ - # Rename data columns according to mapping, preserving original types - new_data = {} - for old_name, value in self._data.items(): - new_name = column_mapping.get(old_name, old_name) - new_data[new_name] = value - - # Handle typespec updates for renamed columns - new_typespec = None - if self._data_python_schema: - existing_typespec = dict(self._data_python_schema) - - # Rename types according to column mapping - renamed_typespec = {} - for old_name, old_type in existing_typespec.items(): - new_name = column_mapping.get(old_name, old_name) - renamed_typespec[new_name] = old_type - - new_typespec = renamed_typespec - - # Reconstruct full data dict for new instance - full_data = new_data # Renamed user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - typespec=new_typespec, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def update(self, **updates: DataValue) -> "DictDatagram": - """ - Create a new DictDatagram with existing column values updated. - Maintains immutability by returning a new instance. 
- - Args: - **updates: Column names and their new values (columns must exist) - - Returns: - New DictDatagram instance with updated values - - Raises: - KeyError: If any column doesn't exist (use with_columns() to add new columns) - """ - if not updates: - return self - - # Error if any column doesn't exist - missing_columns = set(updates.keys()) - set(self._data.keys()) - if missing_columns: - raise KeyError( - f"Columns not found: {sorted(missing_columns)}. " - f"Use with_columns() to add new columns." - ) - - # Update existing columns - new_data = dict(self._data) - new_data.update(updates) - - # Reconstruct full data dict for new instance - full_data = new_data # Updated user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, # Keep existing converter - data_context=self._data_context, - ) - - def with_columns( - self, - column_types: Mapping[str, type] | None = None, - **updates: DataValue, - ) -> "DictDatagram": - """ - Create a new DictDatagram with new data columns added. - Maintains immutability by returning a new instance. - - Args: - column_updates: New data columns as a mapping - column_types: Optional type specifications for new columns - **kwargs: New data columns as keyword arguments - - Returns: - New DictDatagram instance with new data columns added - - Raises: - ValueError: If any column already exists (use update() instead) - """ - # Combine explicit updates with kwargs - - if not updates: - return self - - # Error if any column already exists - existing_overlaps = set(updates.keys()) & set(self._data.keys()) - if existing_overlaps: - raise ValueError( - f"Columns already exist: {sorted(existing_overlaps)}. " - f"Use update() to modify existing columns." - ) - - # Update user data with new columns - new_data = dict(self._data) - new_data.update(updates) - - # Create updated typespec - handle None values by defaulting to str - typespec = self.types() - if column_types is not None: - typespec.update(column_types) - - new_typespec = tsutils.get_typespec_from_dict( - new_data, - typespec=typespec, - ) - - # Reconstruct full data dict for new instance - full_data = new_data # Updated user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - typespec=new_typespec, - # semantic converter needs to be rebuilt for new columns - data_context=self._data_context, - ) - - # 7. Context Operations - def with_context_key(self, new_context_key: str) -> "DictDatagram": - """ - Create a new DictDatagram with a different data context key. - Maintains immutability by returning a new instance. - - Args: - new_context_key: New data context key string - - Returns: - New DictDatagram instance with new context - """ - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(self._meta_data) # Meta data - - return DictDatagram( - data=full_data, - data_context=new_context_key, # New context - # Note: semantic_converter will be rebuilt for new context - ) - - # 8. Utility Operations - def copy(self) -> Self: - """ - Create a shallow copy of the datagram. - - Returns a new datagram instance with the same data and cached values. - This is more efficient than reconstructing from scratch when you need - an identical datagram instance. - - Returns: - New DictDatagram instance with copied data and caches. 
- """ - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(self._meta_data) # Meta data - - new_datagram = self.__class__( - full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_data_table = self._cached_data_table - new_datagram._cached_meta_table = self._cached_meta_table - new_datagram._cached_content_hash = self._cached_content_hash - new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema - new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema - - return new_datagram - - # 9. String Representations - def __str__(self) -> str: - """ - Return user-friendly string representation. - - Shows the datagram as a simple dictionary for user-facing output, - messages, and logging. Only includes data columns for clean output. - - Returns: - Dictionary-style string representation of data columns only. - """ - return str(self._data) - - def __repr__(self) -> str: - """ - Return detailed string representation for debugging. - - Shows the datagram type and comprehensive information including - data columns, meta columns count, and context for debugging purposes. - - Returns: - Detailed representation with type and metadata information. - """ - meta_count = len(self.meta_columns) - context_key = self.data_context_key - - return ( - f"DictDatagram(" - f"data={self._data}, " - f"meta_columns={meta_count}, " - f"context='{context_key}'" - f")" - ) - - -class ArrowDatagram(BaseDatagram): - """ - Immutable datagram implementation using PyArrow Table as storage backend. - - This implementation provides high-performance columnar data operations while - maintaining the datagram interface. It efficiently handles type conversions, - semantic processing, and interoperability with Arrow-based tools. - - The underlying table is split into separate components: - - Data table: Primary business data columns - - Meta table: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes - - Context table: Data context information with {orcapod.CONTEXT_KEY} - - Future Packet subclass will also handle: - - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes - - When exposing to external tools, semantic types are encoded as - `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). - - All operations return new instances, preserving immutability. - - Example: - >>> table = pa.Table.from_pydict({ - ... "user_id": [123], - ... "name": ["Alice"], - ... "__pipeline_version": ["v2.1.0"], - ... "{orcapod.CONTEXT_KEY}": ["financial_v1"] - ... }) - >>> datagram = ArrowDatagram(table) - >>> updated = datagram.update(name="Alice Smith") - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - """ - Initialize ArrowDatagram from PyArrow Table. - - Args: - table: PyArrow Table containing the data. Must have exactly one row. - semantic_converter: Optional converter for semantic type handling. - If None, will be created based on the data context and table schema. - data_context: Context key string or DataContext object. - If None and table contains context column, will extract from table. - - Raises: - ValueError: If table doesn't contain exactly one row. - - Note: - The input table is automatically split into data, meta, and context - components based on column naming conventions. 
- """ - # Validate table has exactly one row for datagram - if len(table) != 1: - raise ValueError( - "Table must contain exactly one row to be a valid datagram." - ) - - # Split table into data, meta, and context components - context_columns = [constants.CONTEXT_KEY] - meta_columns = [ - col for col in table.column_names if col.startswith(constants.META_PREFIX) - ] - - # Extract context table if present - if constants.CONTEXT_KEY in table.column_names and data_context is None: - context_table = table.select([constants.CONTEXT_KEY]) - data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] - - # Initialize base class with data context - super().__init__(data_context) - - # Split table into components - self._data_table = table.drop(context_columns + meta_columns) - self._meta_table = table.select(meta_columns) if meta_columns else None - if len(self._data_table.column_names) == 0: - raise ValueError("Data table must contain at least one data column.") - - # Create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema( - self._data_table.schema, - self._data_context.semantic_type_registry, - ) - ) - self._semantic_converter = semantic_converter - - # Create data context table - data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) - self._data_context_table = pa.Table.from_pylist( - [{constants.CONTEXT_KEY: self._data_context.context_key}], - schema=data_context_schema, - ) - - # Initialize caches - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_python_dict: dict[str, DataValue] | None = None - self._cached_meta_python_schema: schemas.PythonSchema | None = None - self._cached_content_hash: str | None = None - - # 1. Core Properties (Identity & Structure) - @property - def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names.""" - if self._meta_table is None: - return () - return tuple(self._meta_table.column_names) - - # 2. Dict-like Interface (Data Access) - def __getitem__(self, key: str) -> DataValue: - """Get data column value by key.""" - if key not in self._data_table.column_names: - raise KeyError(f"Data column '{key}' not found") - - return self._data_table[key].to_pylist()[0] - - def __contains__(self, key: str) -> bool: - """Check if data column exists.""" - return key in self._data_table.column_names - - def __iter__(self) -> Iterator[str]: - """Iterate over data column names.""" - return iter(self._data_table.column_names) - - def get(self, key: str, default: DataValue = None) -> DataValue: - """Get data column value with default.""" - if key in self._data_table.column_names: - return self.as_dict()[key] - return default - - # 3. 
Structural Information - def keys( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> tuple[str, ...]: - """Return tuple of column names.""" - # Start with data columns - result_keys = list(self._data_table.column_names) - - # Add context if requested - if include_context: - result_keys.append(constants.CONTEXT_KEY) - - # Add meta columns if requested - if include_meta_columns: - if include_meta_columns is True: - result_keys.extend(self.meta_columns) - elif isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - filtered_meta_cols = [ - col - for col in self.meta_columns - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - result_keys.extend(filtered_meta_cols) - - return tuple(result_keys) - - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> schemas.PythonSchema: - """ - Return Python schema for the datagram. - - Args: - include_meta_columns: Whether to include meta column types. - - True: include all meta column types - - Collection[str]: include meta column types matching these prefixes - - False: exclude meta column types - include_context: Whether to include context type - - Returns: - Python schema - """ - # Get data schema (cached) - if self._cached_python_schema is None: - self._cached_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema( - self._data_table.schema - ) - ) - - schema = dict(self._cached_python_schema) - - # Add context if requested - if include_context: - schema[constants.CONTEXT_KEY] = str - - # Add meta schema if requested - if include_meta_columns and self._meta_table is not None: - if self._cached_meta_python_schema is None: - self._cached_meta_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema( - self._meta_table.schema - ) - ) - meta_schema = dict(self._cached_meta_python_schema) - if include_meta_columns is True: - schema.update(meta_schema) - elif isinstance(include_meta_columns, Collection): - filtered_meta_schema = { - k: v - for k, v in meta_schema.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - schema.update(filtered_meta_schema) - - return schemas.PythonSchema(schema) - - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_meta_columns: Whether to include meta columns in the schema. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - all_schemas = [self._data_table.schema] - - # Add context schema if requested - if include_context: - # TODO: reassess the efficiency of this approach - all_schemas.append(self._data_context_table.schema) - - # Add meta schema if requested - if include_meta_columns and self._meta_table is not None: - if include_meta_columns is True: - meta_schema = self._meta_table.schema - elif isinstance(include_meta_columns, Collection): - # Filter meta schema by prefix matching - matched_fields = [ - field - for field in self._meta_table.schema - if any( - field.name.startswith(prefix) for prefix in include_meta_columns - ) - ] - if matched_fields: - meta_schema = pa.schema(matched_fields) - else: - meta_schema = None - else: - meta_schema = None - - if meta_schema is not None: - all_schemas.append(meta_schema) - - return arrow_utils.join_arrow_schemas(*all_schemas) - - def content_hash(self) -> str: - """ - Calculate and return content hash of the datagram. - Only includes data columns, not meta columns or context. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self._data_table, - prefix_hasher_id=True, - ) - return self._cached_content_hash - - # 4. Format Conversions (Export) - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, DataValue]: - """ - Return dictionary representation of the datagram. - - Args: - include_meta_columns: Whether to include meta columns. - - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context key - - Returns: - Dictionary representation - """ - # Get data dict (cached) - if self._cached_python_dict is None: - self._cached_python_dict = self._semantic_converter.from_arrow_to_python( - self._data_table - )[0] - - result_dict = dict(self._cached_python_dict) - - # Add context if requested - if include_context: - result_dict[constants.CONTEXT_KEY] = self._data_context.context_key - - # Add meta data if requested - if include_meta_columns and self._meta_table is not None: - if include_meta_columns is True: - meta_dict = self._meta_table.to_pylist()[0] - elif isinstance(include_meta_columns, Collection): - meta_dict = self._meta_table.to_pylist()[0] - # Include only meta columns matching prefixes - meta_dict = { - k: v - for k, v in meta_dict.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - if meta_dict is not None: - result_dict.update(meta_dict) - - return result_dict - - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Table: - """ - Convert the datagram to an Arrow table. - - Args: - include_meta_columns: Whether to include meta columns. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include the context column - - Returns: - Arrow table representation - """ - all_tables = [self._data_table] - - # Add context if requested - if include_context: - all_tables.append(self._data_context_table) - - # Add meta columns if requested - if include_meta_columns and self._meta_table is not None: - meta_table = None - if include_meta_columns is True: - meta_table = self._meta_table - elif isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - matched_cols = [ - col - for col in self._meta_table.column_names - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - if matched_cols: - meta_table = self._meta_table.select(matched_cols) - else: - meta_table = None - - if meta_table is not None: - all_tables.append(meta_table) - - return arrow_utils.hstack_tables(*all_tables) - - # 5. Meta Column Operations - def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: - """ - Get a meta column value. - - Args: - key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix) - default: Default value if not found - - Returns: - Meta column value - """ - if self._meta_table is None: - return default - - # Handle both prefixed and unprefixed keys - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - - if key not in self._meta_table.column_names: - return default - - return self._meta_table[key].to_pylist()[0] - - def with_meta_columns(self, **meta_updates: DataValue) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with updated meta columns. - Maintains immutability by returning a new instance. - - Args: - **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) - - Returns: - New ArrowDatagram instance - """ - # Prefix the keys and prepare updates - prefixed_updates = {} - for k, v in meta_updates.items(): - if not k.startswith(constants.META_PREFIX): - k = constants.META_PREFIX + k - prefixed_updates[k] = v - - # Start with existing meta data - meta_dict = {} - if self._meta_table is not None: - meta_dict = self._meta_table.to_pylist()[0] - - # Apply updates - meta_dict.update(prefixed_updates) - - # Create new meta table - new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None - - # Combine all tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def drop_meta_columns( - self, *keys: str, ignore_missing: bool = True - ) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with specified meta columns dropped. - Maintains immutability by returning a new instance. 
- - Args: - *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) - - Returns: - New ArrowDatagram instance without specified meta columns - """ - if self._meta_table is None: - return self # No meta columns to drop - - # Normalize keys to have prefixes - prefixed_keys = set() - for key in keys: - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - prefixed_keys.add(key) - - missing_keys = prefixed_keys - set(self._meta_table.column_names) - if missing_keys and not ignore_missing: - raise KeyError( - f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" - ) - - # Filter meta columns - remaining_cols = [ - col for col in self._meta_table.column_names if col not in prefixed_keys - ] - - # Create new meta table - new_meta_table = ( - self._meta_table.select(remaining_cols) if remaining_cols else None - ) - - # Combine tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - # 6. Data Column Operations - def select(self, *column_names: str) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with only specified data columns. - Maintains immutability by returning a new instance. - - Args: - *column_names: Data column names to keep - - Returns: - New ArrowDatagram instance with only specified data columns - """ - # Validate columns exist - missing_cols = set(column_names) - set(self._data_table.column_names) - if missing_cols: - raise ValueError(f"Columns not found: {missing_cols}") - - new_data_table = self._data_table.select(list(column_names)) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def drop(self, *column_names: str, ignore_missing: bool = False) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with specified data columns dropped. - Maintains immutability by returning a new instance. - - Args: - *column_names: Data column names to drop - - Returns: - New ArrowDatagram instance without specified data columns - """ - - # Filter out specified data columns - missing = set(column_names) - set(self._data_table.column_names) - if missing and not ignore_missing: - raise KeyError( - f"Following columns do not exist and cannot be dropped: {sorted(missing)}" - ) - - # Filter data columns - remaining_cols = [ - col for col in self._data_table.column_names if col not in column_names - ] - - if not remaining_cols: - raise ValueError("Cannot drop all data columns") - - new_data_table = self._data_table.select(remaining_cols) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def rename(self, column_mapping: Mapping[str, str]) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with data columns renamed. - Maintains immutability by returning a new instance. 
- - Args: - column_mapping: Mapping from old column names to new column names - - Returns: - New ArrowDatagram instance with renamed data columns - """ - # Create new schema with renamed fields, preserving original types - new_fields = [] - for field in self._data_table.schema: - old_name = field.name - new_name = column_mapping.get(old_name, old_name) - new_field = pa.field(new_name, field.type) - new_fields.append(new_field) - - # Create new data table with renamed columns - new_schema = pa.schema(new_fields) - new_data_table = self._data_table.rename_columns( - [column_mapping.get(name, name) for name in self._data_table.column_names] - ).cast(new_schema) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def update(self, **updates: DataValue) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with specific column values updated. - - Args: - **updates: Column names and their new values - - Returns: - New ArrowDatagram instance with updated values - - Raises: - KeyError: If any specified column doesn't exist - - Example: - # Convert relative path to absolute path - updated = datagram.update(file_path="/absolute/path/to/file.txt") - - # Update multiple values - updated = datagram.update(status="processed", file_path="/new/path") - """ - # Only update if there are columns to update - if not updates: - return self - - # Validate all columns exist - missing_cols = set(updates.keys()) - set(self._data_table.column_names) - if missing_cols: - raise KeyError( - f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}" - ) - - updates_typespec = schemas.PythonSchema( - {k: v for k, v in self.types().items() if k in updates} - ) - - update_table = self._semantic_converter.from_python_to_arrow( - updates, updates_typespec - ) - all_tables = [self._data_table.drop(list(updates.keys())), update_table] - - if self._meta_table is not None: - all_tables.append(self._meta_table) - - return ArrowDatagram( - table=arrow_utils.hstack_tables(*all_tables), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def with_columns( - self, - column_types: Mapping[str, type] | None = None, - **updates: DataValue, - ) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with new data columns added. - Maintains immutability by returning a new instance. - - Args: - column_updates: New data columns as a mapping - column_types: Optional type specifications for new columns - **kwargs: New data columns as keyword arguments - - Returns: - New ArrowDatagram instance with new data columns added - - Raises: - ValueError: If any column already exists (use update() instead) - """ - # Combine explicit updates with kwargs - - if not updates: - return self - - # Error if any column already exists - existing_overlaps = set(updates.keys()) & set(self._data_table.column_names) - if existing_overlaps: - raise ValueError( - f"Columns already exist: {sorted(existing_overlaps)}. " - f"Use update() to modify existing columns." 
- ) - - # TODO: consider simplifying this conversion logic - typespec = typespec_utils.get_typespec_from_dict(updates, column_types) - - updates_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_typespec( - typespec, self._data_context.semantic_type_registry - ) - ) - # TODO: cleanup the handling of typespec python schema and various conversion points - new_data_table = updates_converter.from_python_to_arrow(updates, typespec) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - # prepare the joined converter - total_converter = self._semantic_converter.join(updates_converter) - - return ArrowDatagram( - table=combined_table, - semantic_converter=total_converter, - data_context=self._data_context, - ) - - # 7. Context Operations - def with_context_key(self, new_context_key: str) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with a different data context key. - Maintains immutability by returning a new instance. - - Args: - new_context_key: New data context key string - - Returns: - New ArrowDatagram instance with new context - """ - # Combine all tables for reconstruction - combined_table = self._data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - data_context=new_context_key, - # Note: semantic_converter will be rebuilt for new context - ) - - # 8. Utility Operations - def copy(self) -> Self: - """Return a copy of the datagram.""" - # Combine all tables for reconstruction - combined_table = self._data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - new_datagram = self.__class__( - combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_python_schema = self._cached_python_schema - new_datagram._cached_python_dict = self._cached_python_dict - new_datagram._cached_content_hash = self._cached_content_hash - - return new_datagram - - # 9. String Representations - def __str__(self) -> str: - """ - Return user-friendly string representation. - - Shows the datagram as a simple dictionary for user-facing output, - messages, and logging. Only includes data columns for clean output. - - Returns: - Dictionary-style string representation of data columns only. - - Example: - >>> str(datagram) - "{'user_id': 123, 'name': 'Alice'}" - >>> print(datagram) - {'user_id': 123, 'name': 'Alice'} - """ - return str(self.as_dict()) - - def __repr__(self) -> str: - """ - Return detailed string representation for debugging. - - Shows the datagram type and comprehensive information including - data columns, meta columns count, and context for debugging purposes. - - Returns: - Detailed representation with type and metadata information. - - Example: - >>> repr(datagram) - "ArrowDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')" - """ - data_dict = self.as_dict() - meta_count = len(self.meta_columns) - context_key = self.data_context_key - - return ( - f"ArrowDatagram(" - f"data={data_dict}, " - f"meta_columns={meta_count}, " - f"context='{context_key}'" - f")" - ) - - -class DictTag(DictDatagram): - """ - A simple tag implementation using Python dictionary. 
- - Represents a tag (metadata) as a dictionary that can be converted - to different representations like Arrow tables. - """ - - -class DictPacket(DictDatagram): - """ - Enhanced packet implementation with source information support. - - Extends DictDatagram to include source information tracking and - enhanced table conversion capabilities that can include or exclude - source metadata. - - Initialize packet with data and optional source information. - - Args: - data: Primary data content - source_info: Optional mapping of field names to source information - typespec: Optional type specification - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types. Defaults to system default registry. - arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. - """ - - def __init__( - self, - data: Mapping[str, DataValue], - source_info: Mapping[str, str | None] | None = None, - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - # normalize the data content and remove any source info keys - data_only = { - k: v for k, v in data.items() if not k.startswith(constants.SOURCE_PREFIX) - } - contained_source_info = { - k.removeprefix(constants.SOURCE_PREFIX): v - for k, v in data.items() - if k.startswith(constants.SOURCE_PREFIX) - } - - super().__init__( - data_only, - typespec=typespec, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._source_info = {**contained_source_info, **(source_info or {})} - self._cached_source_info_table: pa.Table | None = None - self._cached_source_info_schema: pa.Schema | None = None - - @property - def _source_info_schema(self) -> pa.Schema: - if self._cached_source_info_schema is None: - self._cached_source_info_schema = pa.schema( - { - f"{constants.SOURCE_PREFIX}{k}": pa.large_string() - for k in self.keys() - } - ) - return self._cached_source_info_schema - - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - """Convert the packet to an Arrow table.""" - table = super().as_table( - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_source: - if self._cached_source_info_table is None: - source_info_data = { - f"{constants.SOURCE_PREFIX}{k}": v - for k, v in self.source_info().items() - } - self._cached_source_info_table = pa.Table.from_pylist( - [source_info_data], schema=self._source_info_schema - ) - assert self._cached_source_info_table is not None, ( - "Cached source info table should not be None" - ) - # subselect the corresponding _source_info as the columns present in the data table - source_info_table = self._cached_source_info_table.select( - [ - f"{constants.SOURCE_PREFIX}{k}" - for k in table.column_names - if k in self.keys() - ] - ) - table = arrow_utils.hstack_tables(table, source_info_table) - return table - - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> dict[str, DataValue]: - """ - Return dictionary representation. 
- - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - dict_copy = super().as_dict( - include_meta_columns=include_meta_columns, include_context=include_context - ) - if include_source: - for key, value in self.source_info().items(): - dict_copy[f"{constants.SOURCE_PREFIX}{key}"] = value - return dict_copy - - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types( - include_meta_columns=include_meta_columns, include_context=include_context - ) - if include_source: - for key in self.keys(): - schema[f"{constants.SOURCE_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema( - include_meta_columns=include_meta_columns, include_context=include_context - ) - if include_source: - return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) - return schema - - def as_datagram( - self, - include_meta_columns: bool | Collection[str] = False, - include_source: bool = False, - ) -> DictDatagram: - """ - Convert the packet to a DictDatagram. - - Args: - include_source: Whether to include source info fields - - Returns: - DictDatagram representation of the packet - """ - data = self.as_dict( - include_meta_columns=include_meta_columns, include_source=include_source - ) - typespec = self.types(include_source=include_source) - return DictDatagram( - data, - typespec=typespec, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. - - Returns: - Dictionary mapping field names to their source info - """ - return {key: self._source_info.get(key, None) for key in self.keys()} - - def copy(self) -> Self: - """Return a shallow copy of the packet.""" - instance = super().copy() - instance._source_info = self._source_info.copy() - instance._cached_source_info_table = self._cached_source_info_table - return instance - - -class ArrowTag(ArrowDatagram): - """ - A tag implementation using Arrow table backend. - - Represents a single-row Arrow table that can be converted to Python - dictionary representation while caching computed values for efficiency. - - Initialize with an Arrow table. - - Args: - table: Single-row Arrow table representing the tag - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(table) != 1: - raise ValueError( - "ArrowTag should only contain a single row, " - "as it represents a single tag." - ) - super().__init__( - table=table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - -class ArrowPacket(ArrowDatagram): - """ - Arrow table-based packet implementation with comprehensive features. 
- - A packet implementation that uses Arrow tables as the primary storage format, - providing efficient memory usage and columnar data operations while supporting - source information tracking and content hashing. - - - Initialize ArrowPacket with Arrow table and configuration. - - Args: - table: Single-row Arrow table representing the packet - source_info: Optional source information mapping - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - finger_print: Optional fingerprint for tracking - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - skip_source_info_extraction: Whether to skip source info processing - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - data: pa.Table, - source_info: dict[str, str | None] | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(data) != 1: - raise ValueError( - "ArrowPacket should only contain a single row, " - "as it represents a single packet." - ) - if source_info is None: - source_info = {} - - # normalize the table to ensure it has the expected source_info columns - data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( - data, - {constants.SOURCE_PREFIX: source_info}, - exclude_columns=[constants.CONTEXT_KEY], - ) - self._source_info_table = prefixed_tables[constants.SOURCE_INFO_PREFIX] - - super().__init__( - data_table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._cached_source_info: dict[str, str | None] | None = None - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_content_hash: str | None = None - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - table = super().as_table(include_data_context=include_data_context) - if include_source: - # add source_info only for existing data columns - table = arrow_utils.hstack_tables( - table, - self._source_info_table.select( - [ - f"{constants.SOURCE_INFO_PREFIX}{c}" - for c in table.column_names - if c in self.keys() - ] - ), - ) - return table - - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types(include_data_context=include_data_context) - if include_source: - for key in self.keys(): - schema[f"{constants.SOURCE_INFO_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema(include_data_context=include_data_context) - if include_source: - return arrow_utils.join_arrow_schemas( - schema, self._source_info_table.schema - ) - return schema - - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: - """ - Convert to dictionary representation. 
- - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - return_dict = super().as_dict(include_data_context=include_data_context) - if include_source: - return_dict.update( - { - f"{constants.SOURCE_INFO_PREFIX}{k}": v - for k, v in self.source_info().items() - } - ) - return return_dict - - def as_datagram(self, include_source: bool = False) -> ArrowDatagram: - table = self.as_table(include_source=include_source) - return ArrowDatagram( - table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. - - Returns: - Copy of the dictionary mapping field names to their source info - """ - if self._cached_source_info is None: - self._cached_source_info = { - k.removeprefix(constants.SOURCE_INFO_PREFIX): v - for k, v in self._source_info_table.to_pylist()[0].items() - } - return self._cached_source_info.copy() - - def copy(self) -> Self: - # TODO: restructure copy to allow for better inheritance and expansion - new_packet = self.__class__( - self.as_table(), - self.source_info(), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_packet._cached_source_info = self._cached_source_info - new_packet._cached_python_dict = self._cached_python_dict - new_packet._cached_python_schema = self._cached_python_schema - new_packet._cached_content_hash = self._cached_content_hash - - return new_packet - - -# a batch is a tuple of a tag and a list of packets -Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] -"""Type alias for a batch: a tuple containing a tag and collection of packets.""" diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index b1b3d1b..f10bb2e 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -5,19 +5,14 @@ from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs from abc import abstractmethod from typing import Any - - -class InputValidationError(Exception): - """ - Exception raised when the inputs are not valid. - This is used to indicate that the inputs do not meet the requirements of the operator. - """ +from orcapod.errors import InputValidationError class Operator(TrackedKernelBase): """ Base class for all operators. Operators are a special type of kernel that can be used to perform operations on streams. + They are defined as a callable that takes a (possibly empty) collection of streams as the input and returns a new stream as output (note that output stream is always singular). """ diff --git a/src/orcapod/errors.py b/src/orcapod/errors.py new file mode 100644 index 0000000..b1566cd --- /dev/null +++ b/src/orcapod/errors.py @@ -0,0 +1,5 @@ +class InputValidationError(Exception): + """ + Exception raised when the inputs are not valid. + This is used to indicate that the inputs do not meet the requirements of the operator. 
+ """ diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 0cd0722..91b7931 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -2,7 +2,6 @@ from .arrow_hashers import SemanticArrowHasher from orcapod.utils.object_spec import parse_objectspec from orcapod.protocols.hashing_protocols import ObjectHasher -from typing import Any CURRENT_VERSION = "v0.1" diff --git a/src/orcapod/pipeline/__init__.py b/src/orcapod/pipeline/__init__.py index 9a99f89..9d3e0f5 100644 --- a/src/orcapod/pipeline/__init__.py +++ b/src/orcapod/pipeline/__init__.py @@ -1,5 +1,5 @@ -from .pipeline import Pipeline +# from .legacy_pipeline import Pipeline -__all__ = [ - "Pipeline", -] +# __all__ = [ +# "Pipeline", +# ] diff --git a/src/orcapod/pipeline/legacy_pipeline.py b/src/orcapod/pipeline/legacy_pipeline.py index 1fb5236..8c931f7 100644 --- a/src/orcapod/pipeline/legacy_pipeline.py +++ b/src/orcapod/pipeline/legacy_pipeline.py @@ -10,7 +10,7 @@ from orcapod.core import Invocation, Kernel, SyncStream from orcapod.core.pod import FunctionPod -from orcapod.pipeline.nodes import KernelNode, FunctionPodNode, Node +from orcapod.pipeline.legacy_nodes import KernelNode, FunctionPodNode, Node from orcapod.core.tracker import GraphTracker from orcapod.stores import ArrowDataStore diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index d51ead8..0356270 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -41,3 +41,7 @@ def get_records_by_ids( record_ids: Collection[str], record_id_column: str | None = None, ) -> pa.Table: ... + + def flush(self) -> None: + """Flush any buffered writes to the underlying storage.""" + ... diff --git a/src/orcapod/protocols/types.py b/src/orcapod/protocols/types.py deleted file mode 100644 index 73e67f1..0000000 --- a/src/orcapod/protocols/types.py +++ /dev/null @@ -1,51 +0,0 @@ -# from typing import TypeAlias -# from collections.abc import Collection, Mapping -# from pathlib import Path -# import logging -# import os - -# logger = logging.getLogger(__name__) - - -# # class TypeSpec(dict[str, DataType]): -# # def __init__(self, *args, **kwargs): -# # """ -# # TypeSpec is a mapping of parameter names to their types. -# # It can be used to define the expected types of parameters in a function or a pod. 
-# # """ -# # super().__init__(*args, **kwargs) - - -# # Convenience alias for anything pathlike -# PathLike: TypeAlias = str | os.PathLike - -# # an (optional) string or a collection of (optional) string values -# # Note that TagValue can be nested, allowing for an arbitrary depth of nested lists -# TagValue: TypeAlias = int | str | None | Collection["TagValue"] - -# # the top level tag is a mapping from string keys to values that can be a string or -# # an arbitrary depth of nested list of strings or None -# Tag: TypeAlias = Mapping[str, TagValue] - -# # a pathset is a path or an arbitrary depth of nested list of paths -# PathSet: TypeAlias = PathLike | Collection[PathLike | None] - -# # Simple data types that we support (with clear Polars correspondence) -# SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes - -# ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike - -# TypeSpec = dict[str, type] # Mapping of parameter names to their types - -# # Extended data values that can be stored in packets -# # Either the original PathSet or one of our supported simple data types -# DataValue: TypeAlias = ( -# PathSet -# | SupportedNativePythonData -# | None -# | Collection["DataValue"] -# | Mapping[str, "DataValue"] -# ) - - -# PacketLike = Mapping[str, DataValue] From c871bfb1afe047c45f8046089cd538f1dadc5c44 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:00:39 +0000 Subject: [PATCH 081/224] feat: add lazyloading system --- src/orcapod/hashing/arrow_utils.py | 17 ++-- src/orcapod/utils/lazy_module.py | 155 +++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 src/orcapod/utils/lazy_module.py diff --git a/src/orcapod/hashing/arrow_utils.py b/src/orcapod/hashing/arrow_utils.py index 0d46cd7..7dc565e 100644 --- a/src/orcapod/hashing/arrow_utils.py +++ b/src/orcapod/hashing/arrow_utils.py @@ -1,12 +1,17 @@ -import pyarrow as pa import json import hashlib -from typing import Dict, Any +from typing import Any, TYPE_CHECKING from decimal import Decimal import base64 +from orcapod.utils.lazy_module import LazyModule +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") -def serialize_pyarrow_table_schema(table: pa.Table) -> str: + +def serialize_pyarrow_table_schema(table: "pa.Table") -> str: """ Serialize PyArrow table schema to JSON with Python type names and filtered metadata. @@ -29,7 +34,7 @@ def serialize_pyarrow_table_schema(table: pa.Table) -> str: return json.dumps(schema_info, separators=(",", ":"), sort_keys=True) -def serialize_pyarrow_table(table: pa.Table) -> str: +def serialize_pyarrow_table(table: "pa.Table") -> str: """ Serialize a PyArrow table to a stable JSON string with both schema and data. @@ -74,7 +79,7 @@ def serialize_pyarrow_table(table: pa.Table) -> str: ) -def get_pyarrow_table_hash(table: pa.Table) -> str: +def get_pyarrow_table_hash(table: "pa.Table") -> str: """ Get a stable SHA-256 hash of the table content. @@ -88,7 +93,7 @@ def get_pyarrow_table_hash(table: pa.Table) -> str: return hashlib.sha256(serialized.encode("utf-8")).hexdigest() -def deserialize_to_pyarrow_table(serialized_str: str) -> pa.Table: +def deserialize_to_pyarrow_table(serialized_str: str) -> "pa.Table": """ Deserialize JSON string back to a PyArrow table. 
diff --git a/src/orcapod/utils/lazy_module.py b/src/orcapod/utils/lazy_module.py
new file mode 100644
index 0000000..75cf057
--- /dev/null
+++ b/src/orcapod/utils/lazy_module.py
@@ -0,0 +1,155 @@
+import importlib
+from types import ModuleType
+from typing import Any, Optional
+
+
+class LazyModule:
+    """
+    A wrapper that lazily loads a module only when its attributes are first accessed.
+
+    Example:
+        # Instead of: import expensive_module
+        expensive_module = LazyModule('expensive_module')
+
+        # Module is only loaded when you access something:
+        result = expensive_module.some_function()  # Now it imports
+    """
+
+    def __init__(self, module_name: str, package: str | None = None):
+        """
+        Initialize lazy module loader.
+
+        Args:
+            module_name: Name of the module to import
+            package: Package for relative imports (same as importlib.import_module)
+        """
+        self._module_name = module_name
+        self._package = package
+        self._module: ModuleType | None = None
+        self._loaded = False
+
+    def _load_module(self) -> ModuleType:
+        """Load the module if not already loaded."""
+        if not self._loaded:
+            self._module = importlib.import_module(self._module_name, self._package)
+            self._loaded = True
+        assert self._module is not None, (
+            f"Module '{self._module_name}' could not be loaded. "
+            "This should not happen if the module exists."
+        )
+        return self._module
+
+    def __getattr__(self, name: str) -> Any:
+        """Get attribute from the wrapped module, loading it if necessary."""
+        if name.startswith("_"):
+            # Avoid infinite recursion for internal attributes
+            raise AttributeError(
+                f"'{self.__class__.__name__}' object has no attribute '{name}'"
+            )
+
+        module = self._load_module()
+        return getattr(module, name)
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        """Set attribute on the wrapped module or on this instance."""
+        if name.startswith("_") or not self._loaded:
+            # Set on this instance for internal attributes or before loading
+            super().__setattr__(name, value)
+        else:
+            # Set on the wrapped module
+            setattr(self._load_module(), name, value)
+
+    def __delattr__(self, name: str) -> None:
+        """Delete attribute from the wrapped module."""
+        if name.startswith("_"):
+            super().__delattr__(name)
+        else:
+            delattr(self._load_module(), name)
+
+    def __dir__(self) -> list[str]:
+        """Return directory of the wrapped module."""
+        if self._loaded:
+            return dir(self._module)
+        else:
+            # Return empty list or basic attributes before loading
+            return []
+
+    def __repr__(self) -> str:
+        """String representation."""
+        if self._loaded:
+            return f"<LazyModule '{self._module_name}' (loaded)>"
+        else:
+            return f"<LazyModule '{self._module_name}' (not loaded)>"
+
+    def __str__(self) -> str:
+        """String representation."""
+        return self.__repr__()
+
+    # Support for callable modules (modules with __call__)
+    def __call__(self, *args, **kwargs):
+        """Call the module if it's callable."""
+        module = self._load_module()
+        return module(*args, **kwargs)  # type: ignore
+
+    # Support for iteration if the module is iterable
+    def __iter__(self):
+        """Iterate over the module if it's iterable."""
+        module = self._load_module()
+        return iter(module)  # type: ignore
+
+    def __len__(self):
+        """Get length of the module if it supports len()."""
+        module = self._load_module()
+        return len(module)  # type: ignore
+
+    def __getitem__(self, key):
+        """Get item from the module if it supports indexing."""
+        module = self._load_module()
+        return module[key]  # type: ignore
+
+    def __setitem__(self, key, value):
+        """Set item on the module if it supports item assignment."""
+        module = self._load_module()
+        module[key] = value  # type: 
ignore + + def __contains__(self, item): + """Check if item is in the module if it supports 'in' operator.""" + module = self._load_module() + return item in module + + @property + def is_loaded(self) -> bool: + """Check if the module has been loaded.""" + return self._loaded + + @property + def module_name(self) -> str: + """Get the module name.""" + return self._module_name + + def force_load(self) -> ModuleType: + """Force load the module and return it.""" + return self._load_module() + + +# Convenience function for creating lazy modules +def lazy_import(module_name: str, package: Optional[str] = None) -> LazyModule: + """ + Create a lazy module loader. + + Args: + module_name: Name of the module to import + package: Package for relative imports + + Returns: + LazyModule instance that will load the module on first access + + Example: + np = lazy_import('numpy') + pd = lazy_import('pandas') + + # Modules are only imported when you use them: + array = np.array([1, 2, 3]) # numpy imported here + df = pd.DataFrame({'a': [1, 2]}) # pandas imported here + """ + return LazyModule(module_name, package) From a416c2016c577914f1854843395339604a104059 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:01:11 +0000 Subject: [PATCH 082/224] refactor: refine kernel and pod setup --- src/orcapod/data/kernels.py | 76 ++++++++++++++++--- src/orcapod/data/pods.py | 43 ++++++----- src/orcapod/pipeline/graph.py | 134 ++++++++++++++++++++++++++++++++++ src/orcapod/pipeline/nodes.py | 32 ++++++++ 4 files changed, 253 insertions(+), 32 deletions(-) create mode 100644 src/orcapod/pipeline/graph.py create mode 100644 src/orcapod/pipeline/nodes.py diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 58a920f..468b3f1 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -11,9 +11,6 @@ logger = logging.getLogger(__name__) -def get_tracker_manager() -> dp.TrackerManager: ... - - class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): """ Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. @@ -68,7 +65,7 @@ def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ... if self.fixed_input_streams is not None: if len(streams) != 0: raise ValueError( - f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted." + f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted at this point." ) return self.fixed_input_streams return streams @@ -86,13 +83,13 @@ def prepare_output_stream( """ return KernelStream(source=self, upstreams=streams, label=label) - def track_invocation(self, *streams: dp.Stream) -> None: + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: """ Track the invocation of the kernel with the provided streams. This is a convenience method that calls record_kernel_invocation. 
""" if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_kernel_invocation(self, streams) + self._tracker_manager.record_kernel_invocation(self, streams, label=label) def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs @@ -100,7 +97,7 @@ def __call__( processed_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*processed_streams) output_stream = self.prepare_output_stream(*processed_streams, label=label) - self.track_invocation(*processed_streams) + self.track_invocation(*processed_streams, label=label) return output_stream @abstractmethod @@ -111,8 +108,13 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: Subclasses should override this method to provide the kernel with its unique behavior """ + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + processed_streams = self.pre_process_input_streams(*streams) + self.validate_inputs(*processed_streams) + return self.kernel_output_types(*processed_streams) + @abstractmethod - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... def __repr__(self): return self.__class__.__name__ @@ -122,6 +124,9 @@ def __str__(self): return f"{self.__class__.__name__}({self._label})" return self.__class__.__name__ + @abstractmethod + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: ... + def identity_structure(self, *streams: dp.Stream) -> Any: # Default implementation of identity_structure for the kernel only # concerns the kernel class and the streams if present. Subclasses of @@ -137,7 +142,54 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the # equivalence of the two by returning the same identity structure for both invocations. # This can be achieved, for example, by returning a set over the streams instead of a tuple. - logger.warning( - f"Identity structure not implemented for {self.__class__.__name__}" - ) - return (self.__class__.__name__,) + streams + if len(streams) > 0: + streams = self.pre_process_input_streams(*streams) + self.validate_inputs(*streams) + return self.kernel_identity_structure(*streams) + + +class WrappedKernel(TrackedKernelBase): + """ + A wrapper for a kernel that allows it to be used as a stream source. + This is useful for cases where you want to use a kernel as a source of data + in a pipeline or other data processing context. + """ + + def __init__(self, kernel: dp.Kernel, **kwargs) -> None: + # TODO: handle fixed input stream already set on the kernel + super().__init__(**kwargs) + self.kernel = kernel + + @property + def kernel_id(self) -> tuple[str, ...]: + return self.kernel.kernel_id + + def computed_label(self) -> str | None: + """ + Compute a label for this kernel based on its content. + If label is not explicitly set for this kernel and computed_label returns a valid value, + it will be used as label of this kernel. 
+ """ + return self.kernel.label + + def forward(self, *streams: dp.Stream) -> dp.Stream: + return self.kernel.forward(*streams) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + return self.kernel.output_types(*streams) + + def validate_inputs(self, *streams: dp.Stream) -> None: + pass + + def __repr__(self): + return f"WrappedKernel({self.kernel!r})" + + def __str__(self): + return f"WrappedKernel:{self.kernel!s}" + + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: + return self.kernel.identity_structure(*streams) + + +class CachedKernel(WrappedKernel): + pass diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 8662903..10e088e 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -2,7 +2,7 @@ import sys from abc import abstractmethod from collections.abc import Callable, Collection, Iterable, Sequence -from typing import Any, Literal, cast +from typing import Any, Literal, cast, TYPE_CHECKING from orcapod.data.datagrams import ( DictPacket, @@ -22,7 +22,12 @@ from orcapod.types import typespec_utils as tsutils from orcapod.utils import arrow_utils from orcapod.data.system_constants import orcapod_constants as constants -import pyarrow as pa +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") logger = logging.getLogger(__name__) @@ -65,14 +70,12 @@ def __init__( self._active = True self.error_handling = error_handling - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ Return the input and output typespecs for the pod. This is used to validate the input and output streams. """ - input_streams = self.pre_process_input_streams(*streams) - self.validate_inputs(*input_streams) - tag_typespec, _ = input_streams[0].types() + tag_typespec, _ = streams[0].types() return tag_typespec, self.output_packet_types() def is_active(self) -> bool: @@ -124,10 +127,15 @@ def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ... """ # if multiple streams are provided, join them # otherwise, return as is - if self.fixed_input_streams is not None and len(streams) > 0: - output_stream = self._join_streams(*self.fixed_input_streams) - if len(streams) > 0: + if self.fixed_input_streams is not None: + if len(streams) == 0: + output_stream = self._join_streams(*self.fixed_input_streams) + else: restrict_stream = self._join_streams(*streams) + raise NotImplementedError( + f"{self.__class__.__name__} does not support semijoining fixed input streams with additional streams yet. " + "Please implement this functionality in the subclass." 
+ ) # output_stream = SemiJoin()(output_stream, restrict_stream) else: if len(streams) == 0: @@ -144,9 +152,9 @@ def prepare_output_stream( output_stream.label = label return output_stream - def track_invocation(self, *streams: dp.Stream) -> None: + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_pod_invocation(self, streams) + self._tracker_manager.record_pod_invocation(self, streams, label=label) def forward(self, *streams: dp.Stream) -> PodStream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" @@ -289,8 +297,6 @@ def __str__(self) -> str: return f"FunctionPod:{func_sig}" def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | None]: - v: dp.Packet = DictPacket({}) - print(v) if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -330,7 +336,7 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) return tag, output_packet - def identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: # construct identity structure for the function # if function_info_extractor is available, use that but substitute the function_name @@ -355,11 +361,8 @@ def identity_structure(self, *streams: dp.Stream) -> Any: ) # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure - if len(streams) > 0: - # TODO: extract the common handling of input streams - processed_streams = self.pre_process_input_streams(*streams) - self.validate_inputs(*processed_streams) - id_struct += (processed_streams[0],) + if len(streams) != 0: + id_struct += (streams[0],) return id_struct @@ -416,7 +419,7 @@ def output_packet_types(self) -> TypeSpec: def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: return self.pod.call(tag, packet) - def identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: return self.pod.identity_structure(*streams) def __repr__(self) -> str: diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py new file mode 100644 index 0000000..3266e3b --- /dev/null +++ b/src/orcapod/pipeline/graph.py @@ -0,0 +1,134 @@ +from orcapod.data.trackers import GraphTracker, Invocation +from orcapod.pipeline.nodes import KernelNode, PodNode +from orcapod.data.context import DataContext +from orcapod.protocols import data_protocols as dp +from orcapod.protocols import store_protocols as sp +from typing import Any +from collections.abc import Collection +from orcapod.data.streams import WrappedStream +import logging + + +logger = logging.getLogger(__name__) + + +class Pipeline(GraphTracker): + """ + Represents a pipeline in the system. + This class extends GraphTracker to manage the execution of kernels and pods in a pipeline. 
+ """ + + def __init__( + self, + name: str | tuple[str, ...], + pipeline_store: sp.ArrowDataStore, + results_store: sp.ArrowDataStore | None = None, + tracker_manager: dp.TrackerManager | None = None, + data_context: str | DataContext | None = None, + auto_compile: bool = True, + ): + super().__init__(tracker_manager=tracker_manager, data_context=data_context) + if not isinstance(name, tuple): + name = (name,) + self.name = name + self.pipeline_store_path_prefix = self.name + self.results_store_path_prefix = () + if results_store is None: + if pipeline_store is None: + raise ValueError( + "Either pipeline_store or results_store must be provided" + ) + results_store = pipeline_store + self.results_store_path_prefix = self.name + ("_results",) + self.pipeline_store = pipeline_store + self.results_store = results_store + self.nodes = {} + self.auto_compile = auto_compile + self._dirty = False + self._ordered_nodes = [] # Track order of invocations + + def __exit__(self, exc_type=None, exc_value=None, traceback=None): + """ + Exit the pipeline context, ensuring all nodes are properly closed. + """ + super().__exit__(exc_type, exc_value, traceback) + if self.auto_compile: + self.compile() + + def flush(self) -> None: + self.pipeline_store.flush() + self.results_store.flush() + + def record_kernel_invocation( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + super().record_kernel_invocation(kernel, upstreams, label) + self._dirty = True + + def record_pod_invocation( + self, + pod: dp.Pod, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + super().record_pod_invocation(pod, upstreams, label) + self._dirty = True + + def compile(self) -> None: + import networkx as nx + + invocation_to_stream_lut = {} + G = self.generate_graph() + for invocation in nx.topological_sort(G): + input_streams = [ + invocation_to_stream_lut[parent] for parent in invocation.parents() + ] + node = self.wrap_invocation(invocation, new_input_streams=input_streams) + invocation_to_stream_lut[invocation] = node() + self.nodes[node.label] = node + + def wrap_invocation( + self, + invocation: Invocation, + new_input_streams: Collection[dp.Stream], + ) -> dp.Kernel: + if invocation in self.invocation_to_pod_lut: + pod = self.invocation_to_pod_lut[invocation] + node = PodNode( + pod=pod, fixed_input_streams=new_input_streams, label=invocation.label + ) + else: + node = KernelNode( + kernel=invocation.kernel, + fixed_input_streams=new_input_streams, + label=invocation.label, + ) + return node + + def __getattr__(self, item: str) -> Any: + """Allow direct access to pipeline attributes.""" + if item in self.nodes: + return self.nodes[item] + raise AttributeError(f"Pipeline has no attribute '{item}'") + + def __dir__(self) -> list[str]: + """Return a list of attributes and methods of the pipeline.""" + return list(super().__dir__()) + list(self.nodes.keys()) + + def rename(self, old_name: str, new_name: str) -> None: + """ + Rename a node in the pipeline. + This will update the label and the internal mapping. 
+ """ + if old_name not in self.nodes: + raise KeyError(f"Node '{old_name}' does not exist in the pipeline.") + if new_name in self.nodes: + raise KeyError(f"Node '{new_name}' already exists in the pipeline.") + node = self.nodes[old_name] + del self.nodes[old_name] + node.label = new_name + self.nodes[new_name] = node + logger.info(f"Node '{old_name}' renamed to '{new_name}'") diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py new file mode 100644 index 0000000..13347f6 --- /dev/null +++ b/src/orcapod/pipeline/nodes.py @@ -0,0 +1,32 @@ +from orcapod.data.kernels import WrappedKernel +from orcapod.data.pods import WrappedPod +from orcapod.protocols import data_protocols as dp + + +class KernelNode(WrappedKernel): + """ + A node in the pipeline that represents a kernel. + This node can be used to execute the kernel and process data streams. + """ + + def __init__(self, kernel: dp.Kernel, **kwargs) -> None: + super().__init__(kernel=kernel, **kwargs) + self.kernel = kernel + + def __repr__(self): + return f"KernelNode(kernel={self.kernel!r})" + + def __str__(self): + return f"KernelNode:{self.kernel!s}" + + +class PodNode(WrappedPod): + def __init__(self, pod: dp.Pod, **kwargs) -> None: + super().__init__(pod=pod, **kwargs) + self.pod = pod + + def __repr__(self): + return f"PodNode(pod={self.pod!r})" + + def __str__(self): + return f"PodNode:{self.pod!s}" From af75ab7236984b07fc149bbe48b230971cf6960c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:01:28 +0000 Subject: [PATCH 083/224] refactor: refine tracker system --- src/orcapod/data/trackers.py | 114 ++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 0f6ef94..3cf42a9 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,11 +1,16 @@ +from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.data.context import DataContext from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict from collections.abc import Generator from abc import ABC, abstractmethod -from typing import Any +from typing import Any, TYPE_CHECKING from contextlib import contextmanager +if TYPE_CHECKING: + import networkx as nx + class BasicTrackerManager: def __init__(self) -> None: @@ -115,35 +120,74 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) -class Invocation: +class StubKernel: + def __init__(self, stream: dp.Stream, label: str | None = None) -> None: + """ + A placeholder kernel that does nothing. + This is used to represent a kernel that has no computation. + """ + self.label = label or stream.label + self.stream = stream + + def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: + """ + Forward the stream through the stub kernel. + This is a no-op and simply returns the stream. + """ + return self.stream + + def __call__(self, *args: Any, **kwargs: Any) -> dp.Stream: + return self.forward(*args, **kwargs) + + def identity_structure(self, *streams: dp.Stream) -> Any: + # FIXME: using label as a stop-gap for identity structure + return self.label + + def __hash__(self) -> int: + # TODO: resolve the logic around identity structure on a stream / stub kernel + """ + Hash the StubKernel based on its label and stream. + This is used to uniquely identify the StubKernel in the tracker. 
+ """ + identity_structure = self.identity_structure() + if identity_structure is None: + return hash(self.stream) + return identity_structure + + +class Invocation(LabeledContentIdentifiableBase): def __init__( self, kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...], + upstreams: tuple[dp.Stream, ...] = (), label: str | None = None, ) -> None: """ Represents an invocation of a kernel with its upstream streams. This is used to track the computational graph and the invocations of kernels. """ + super().__init__(label=label) self.kernel = kernel self.upstreams = upstreams - self._label = label def parents(self) -> tuple["Invocation", ...]: parent_invoctions = [] for stream in self.upstreams: if stream.source is not None: parent_invoctions.append(Invocation(stream.source, stream.upstreams)) + else: + source = StubKernel(stream) + parent_invoctions.append(Invocation(source)) + return tuple(parent_invoctions) - @property - def label(self) -> str | None: + def computed_label(self) -> str | None: """ - Return the label of the kernel invocation. - This is used to identify the invocation in the tracker. + Compute a label for this invocation based on its kernel and upstreams. + If label is not explicitly set for this invocation and computed_label returns a valid value, + it will be used as label of this invocation. """ - return self._label or self.kernel.label or self.kernel.__class__.__name__ + return self.kernel.label def identity_structure(self) -> Any: """ @@ -152,6 +196,9 @@ def identity_structure(self) -> Any: """ return self.kernel.identity_structure(*self.upstreams) + def __repr__(self) -> str: + return f"Invocation(kernel={self.kernel}, upstreams={self.upstreams}, label={self.label})" + class GraphTracker(AutoRegisteringContextBasedTracker): """ @@ -164,41 +211,28 @@ class GraphTracker(AutoRegisteringContextBasedTracker): def __init__( self, tracker_manager: dp.TrackerManager | None = None, - object_hasher: hp.ObjectHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: super().__init__(tracker_manager=tracker_manager) - if object_hasher is None: - object_hasher = get_default_object_hasher() - self.object_hasher = object_hasher + self._data_context = DataContext.resolve_data_context(data_context) + # Dictionary to map kernels to the streams they have invoked # This is used to track the computational graph and the invocations of kernels + self.kernel_invocations: set[Invocation] = set() + self.invocation_to_pod_lut: dict[Invocation, dp.Pod] = {} self.id_to_invocation_lut: dict[str, Invocation] = {} self.id_to_label_lut: dict[str, list[str]] = defaultdict(list) self.id_to_pod_lut: dict[str, dp.Pod] = {} - def record(self, stream: dp.Stream) -> None: - assert stream.source is not None, ( - "Stream must have a source kernel when recording." 
- ) - stream_list = self.kernel_to_invoked_stream_lut[stream.source] - if stream not in stream_list: - stream_list.append(stream) - - def _record_kernel_and_get_id( + def _record_kernel_and_get_invocation( self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...], label: str | None = None, - ) -> str: + ) -> Invocation: invocation = Invocation(kernel, upstreams, label=label) - invocation_id = self.object_hasher.hash_to_hex(invocation) - if invocation_id not in self.id_to_invocation_lut: - self.id_to_invocation_lut[invocation_id] = invocation - label = label or kernel.label or kernel.__class__.__name__ - existing_labels = self.id_to_label_lut[invocation_id] - if label not in existing_labels: - existing_labels.append(label) - return invocation_id + self.kernel_invocations.add(invocation) + return invocation def record_kernel_invocation( self, @@ -210,7 +244,7 @@ def record_kernel_invocation( Record the output stream of a kernel invocation in the tracker. This is used to track the computational graph and the invocations of kernels. """ - self._record_kernel_and_get_id(kernel, upstreams, label) + self._record_kernel_and_get_invocation(kernel, upstreams, label) def record_pod_invocation( self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None @@ -218,8 +252,8 @@ def record_pod_invocation( """ Record the output stream of a pod invocation in the tracker. """ - invocation_id = self._record_kernel_and_get_id(pod, upstreams, label) - self.id_to_pod_lut[invocation_id] = pod + invocation = self._record_kernel_and_get_invocation(pod, upstreams, label) + self.invocation_to_pod_lut[invocation] = pod def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: """ @@ -229,18 +263,16 @@ def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: self.kernel_to_invoked_stream_lut = defaultdict(list) return recorded_streams - def generate_graph(self): + def generate_graph(self) -> "nx.DiGraph": import networkx as nx G = nx.DiGraph() # Add edges for each invocation - for _, streams in self.kernel_to_invoked_stream_lut.items(): - for stream in streams: - if stream not in G: - G.add_node(stream) - for upstream in stream.upstreams: - G.add_edge(upstream, stream) + for invocation in self.kernel_invocations: + G.add_node(invocation) + for upstream_invocation in invocation.parents(): + G.add_edge(upstream_invocation, invocation) return G # def generate_namemap(self) -> dict[Invocation, str]: From 28540684eacd5e70072b5c311fc78135a0d1b18f Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:01:38 +0000 Subject: [PATCH 084/224] feat: add wrapped stream --- src/orcapod/data/streams.py | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index b8ce85d..d0ecce3 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -655,3 +655,56 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: else: for i in range(len(self._cached_output_packets)): yield self._cached_output_packets[i] + + +class WrappedStream(StreamBase): + def __init__( + self, + stream: dp.Stream, + source: dp.Kernel, + input_streams: tuple[dp.Stream, ...], + label: str | None = None, + **kwargs, + ) -> None: + super().__init__(source=source, upstreams=input_streams, label=label, **kwargs) + self._stream = stream + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. 
+ """ + return self._stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + return self._stream.types() + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + """ + Returns the underlying table representation of the stream. + This is useful for converting the stream to a table format. + """ + return self._stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """ + Iterates over the packets in the stream. + Each packet is represented as a tuple of (Tag, Packet). + """ + return self._stream.iter_packets() + + def identity_structure(self) -> Any: + return self._stream.identity_structure() From 15bfc4c6f6f76680388de77ff56f75d9cab70b74 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:02:09 +0000 Subject: [PATCH 085/224] refactor: use hasher id consistently --- src/orcapod/hashing/arrow_hashers.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 695ffe8..264caad 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -49,11 +49,11 @@ class SemanticArrowHasher: def __init__( self, - hasher_id: str, hash_algorithm: str = "sha256", + semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, chunk_size: int = 8192, + hasher_id: str | None = None, handle_missing: str = "error", - semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, serialization_method: str = "logical", # TODO: consider passing options for serialization method ): @@ -64,6 +64,8 @@ def __init__( chunk_size: Size of chunks to read files in bytes handle_missing: How to handle missing files ('error', 'skip', 'null_hash') """ + if hasher_id is None: + hasher_id = f"semantic_arrow_hasher:{hash_algorithm}:{serialization_method}" self._hasher_id = hasher_id self.chunk_size = chunk_size self.handle_missing = handle_missing @@ -90,7 +92,8 @@ def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: else: raise KeyError(f"No hasher registered for semantic type '{semantic_type}'") - def get_hasher_id(self) -> str: + @property + def hasher_id(self) -> str: return self._hasher_id def register_semantic_hasher(self, semantic_type: str, hasher: SemanticTypeHasher): @@ -113,9 +116,9 @@ def _get_semantic_type(self, field: pa.Field) -> str | None: def _create_hash_column( self, original_column: pa.Array, - hash_algorithm: str, hash_bytes: bytes, original_field: pa.Field, + hash_algorithm: str | None = None, ) -> tuple[pa.Array, pa.Field]: """Create a new column containing the hash bytes.""" # Create array of hash bytes (one hash value repeated for each row) @@ -128,7 +131,7 @@ def _create_hash_column( "semantic_type", "unknown" ) new_metadata["semantic_type"] = "hash" - new_metadata["hash_algorithm"] = hash_algorithm_id + new_metadata["hash_algorithm"] = hash_algorithm or self.hasher_id new_field = pa.field( original_field.name, @@ -156,7 +159,7 @@ def _process_table_columns(self, table: pa.Table) -> pa.Table: # Replace column with hash hash_column, hash_field = self._create_hash_column( - column, hasher.hasher_id, 
hash_bytes, field + column, hash_bytes, field ) new_columns.append(hash_column) new_fields.append(hash_field) @@ -226,7 +229,7 @@ def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: hash_str = hasher.hexdigest() if prefix_hasher_id: - hash_str = f"{self.get_hasher_id()}@{hash_str}" + hash_str = f"{self.hasher_id}@{hash_str}" return hash_str From 534e8106097e72ad211fdc56163b3f2b7c8c96ac Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 23 Jul 2025 06:07:24 +0000 Subject: [PATCH 086/224] refactor: remove fixed stream from kernel and clean up cached pod --- src/orcapod/data/kernels.py | 12 --- src/orcapod/data/pods.py | 173 +++++++++--------------------------- 2 files changed, 43 insertions(+), 142 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 468b3f1..5392cb4 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -25,7 +25,6 @@ class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): def __init__( self, - fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, data_context: str | DataContext | None = None, skip_tracking: bool = False, @@ -39,7 +38,6 @@ def __init__( self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER - self.fixed_input_streams = fixed_input_streams @property def data_context(self) -> DataContext: @@ -62,12 +60,6 @@ def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ... pre-processing step will be tracked separately from the main computation in forward. By default, it returns the input streams unchanged. """ - if self.fixed_input_streams is not None: - if len(streams) != 0: - raise ValueError( - f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted at this point." - ) - return self.fixed_input_streams return streams @abstractmethod @@ -189,7 +181,3 @@ def __str__(self): def kernel_identity_structure(self, *streams: dp.Stream) -> Any: return self.kernel.identity_structure(*streams) - - -class CachedKernel(WrappedKernel): - pass diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 10e088e..bae7c9b 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -1,3 +1,4 @@ +from datetime import datetime, timezone import logging import sys from abc import abstractmethod @@ -61,12 +62,11 @@ def call( def __init__( self, - fixed_input_streams: tuple[dp.Stream, ...] 
| None = None, error_handling: error_handling_options = "raise", label: str | None = None, **kwargs, ) -> None: - super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) + super().__init__(label=label, **kwargs) self._active = True self.error_handling = error_handling @@ -90,21 +90,6 @@ def set_active(self, active: bool) -> None: """ self._active = active - def validate_inputs(self, *streams: dp.Stream) -> None: - if len(streams) != 1: - raise ValueError( - f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" - ) - input_stream = streams[0] - _, incoming_packet_types = input_stream.types() - if not tsutils.check_typespec_compatibility( - incoming_packet_types, self.input_packet_types() - ): - # TODO: use custom exception type for better error handling - raise ValueError( - f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" - ) - @staticmethod def _join_streams(*streams: dp.Stream) -> dp.Stream: if not streams: @@ -120,31 +105,33 @@ def _join_streams(*streams: dp.Stream) -> dp.Stream: def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Prepare the incoming streams for execution in the pod. If fixed_input_streams are present, - they will be used as the input streams and the newly provided streams would be used to - restrict (semijoin) the fixed streams. - Otherwise, the join of the provided streams will be returned. + Prepare the incoming streams for execution in the pod. At least one stream must be present. + If more than one stream is present, the join of the provided streams will be returned. """ # if multiple streams are provided, join them # otherwise, return as is - if self.fixed_input_streams is not None: - if len(streams) == 0: - output_stream = self._join_streams(*self.fixed_input_streams) - else: - restrict_stream = self._join_streams(*streams) - raise NotImplementedError( - f"{self.__class__.__name__} does not support semijoining fixed input streams with additional streams yet. " - "Please implement this functionality in the subclass." 
- ) - # output_stream = SemiJoin()(output_stream, restrict_stream) - else: - if len(streams) == 0: - raise ValueError( - f"{self.__class__.__name__} expects at least one input stream" - ) - output_stream = self._join_streams(*streams) + if len(streams) == 0: + raise ValueError( + f"{self.__class__.__name__} expects at least one input stream" + ) + output_stream = self._join_streams(*streams) return (output_stream,) + def validate_inputs(self, *streams: dp.Stream) -> None: + if len(streams) != 1: + raise ValueError( + f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" + ) + input_stream = streams[0] + _, incoming_packet_types = input_stream.types() + if not tsutils.check_typespec_compatibility( + incoming_packet_types, self.input_packet_types() + ): + # TODO: use custom exception type for better error handling + raise ValueError( + f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" + ) + def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None ) -> dp.LiveStream: @@ -152,10 +139,6 @@ def prepare_output_stream( output_stream.label = label return output_stream - def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: - if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_pod_invocation(self, streams, label=label) - def forward(self, *streams: dp.Stream) -> PodStream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" input_stream = streams[0] @@ -166,6 +149,10 @@ def forward(self, *streams: dp.Stream) -> PodStream: error_handling=cast(error_handling_options, self.error_handling), ) + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_pod_invocation(self, streams, label=label) + def function_pod( output_keys: str | Collection[str] | None = None, @@ -376,7 +363,6 @@ class WrappedPod(ActivatablePodBase): def __init__( self, pod: dp.Pod, - fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, data_context: str | DataContext | None = None, **kwargs, @@ -384,7 +370,6 @@ def __init__( if data_context is None: data_context = pod.data_context_key super().__init__( - fixed_input_streams=fixed_input_streams, label=label, data_context=data_context, **kwargs, @@ -437,19 +422,18 @@ class CachedPod(WrappedPod): # name of the column in the tag store that contains the packet hash PACKET_HASH_COLUMN = f"{constants.META_PREFIX}packet_hash" + DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" def __init__( self, pod: dp.Pod, result_store: ArrowDataStore, - pipeline_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] 
= (), **kwargs, ): super().__init__(pod, **kwargs) self.record_path_prefix = record_path_prefix self.result_store = result_store - self.pipeline_store = pipeline_store # unset data_context native to the object self.pod_hash = self.data_context.object_hasher.hash_to_hex( @@ -468,66 +452,27 @@ def call( self, tag: dp.Tag, packet: dp.Packet, + skip_record_check: bool = False, skip_recording: bool = False, + overwrite_existing: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: - output_packet = self.get_recorded_output_packet(packet) + output_packet = None + if not skip_record_check: + output_packet = self.get_recorded_output_packet(packet) if output_packet is None: tag, output_packet = self.pod.call(tag, packet) if output_packet is not None and not skip_recording: - self.record_packet(packet, output_packet) + self.record_packet( + packet, output_packet, overwrite_existing=overwrite_existing + ) - if output_packet is not None: - self.add_pipeline_record(tag, input_packet=packet) return tag, output_packet - def add_pipeline_record(self, tag: dp.Tag, input_packet: dp.Packet) -> None: - if self.pipeline_store is None: - # no pipeline store configured, skip recording - return - # combine dp.Tag with packet content hash to compute entry hash - tag_with_hash = tag.as_table().append_column( - self.PACKET_HASH_COLUMN, - pa.array([input_packet.content_hash()], type=pa.large_string()), - ) - entry_id = self.data_context.arrow_hasher.hash_table( - tag_with_hash, prefix_hasher_id=True - ) - - existing_record = self.pipeline_store.get_record_by_id( - self.record_path, - entry_id, - ) - - if existing_record is not None: - # if the record already exists, return it - return - - # no record matching, so construct the full record - - input_packet_info = ( - input_packet.as_table( - include_source=True, - ) - .append_column( - f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", - pa.array([input_packet.data_context_key], type=pa.large_string()), - ) - .drop(input_packet.keys()) - ) - - combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) - - self.pipeline_store.add_record( - self.record_path, - entry_id, - combined_record, - ignore_duplicates=False, - ) - def record_packet( self, input_packet: dp.Packet, output_packet: dp.Packet, + overwrite_existing: bool = False, ignore_duplicates: bool = False, ) -> dp.Packet: """ @@ -539,6 +484,7 @@ def record_packet( self.record_path, input_packet.content_hash(), data_table, + overwrite_existing=overwrite_existing, ignore_duplicates=ignore_duplicates, ) if result_flag is None: @@ -560,41 +506,8 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non if result_table is None: return None - return ArrowPacket(result_table) - - def _get_all_records(self) -> "pa.Table | None": - results = self.result_store.get_all_records( - self.record_path, record_id_column=self.PACKET_HASH_COLUMN - ) - - if self.pipeline_store is None: - raise ValueError( - "Pipeline store is not configured, cannot retrieve tag info" - ) - taginfo = self.pipeline_store.get_all_records( - self.record_path, - ) - - if results is None or taginfo is None: - return None - - tag_columns = [ - c - for c in taginfo.column_names - if not c.startswith(constants.META_PREFIX) - and not c.startswith(constants.SOURCE_PREFIX) - ] - - packet_columns = [ - c for c in results.column_names if c != self.PACKET_HASH_COLUMN - ] - - # TODO: do not hardcode the join keys - joined_info = taginfo.join( - results, - self.PACKET_HASH_COLUMN, - join_type="inner", + # note 
that data context will be loaded from the result store + return ArrowPacket( + result_table, + meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, ) - - joined_info = joined_info.select([*tag_columns, *packet_columns]) - return joined_info From c443a32412d532933f0275fc90d63a79bbc8a4e5 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 23 Jul 2025 06:08:04 +0000 Subject: [PATCH 087/224] refactor: consistent copy logic and ability to specify meta info in constructor --- src/orcapod/data/datagrams/arrow_datagram.py | 39 +++++++++++- .../data/datagrams/arrow_tag_packet.py | 29 +++++---- src/orcapod/data/datagrams/base.py | 4 +- src/orcapod/data/datagrams/dict_datagram.py | 59 ++++++++++--------- src/orcapod/data/datagrams/dict_tag_packet.py | 12 ++-- 5 files changed, 91 insertions(+), 52 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 5eb158c..5ceb3cb 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -52,6 +52,7 @@ class ArrowDatagram(BaseDatagram): def __init__( self, table: pa.Table, + meta_info: Mapping[str, DataValue] | None = None, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -84,9 +85,6 @@ def __init__( if constants.CONTEXT_KEY in table.column_names else [] ) - meta_columns = [ - col for col in table.column_names if col.startswith(constants.META_PREFIX) - ] # Extract context table if present if constants.CONTEXT_KEY in table.column_names and data_context is None: @@ -96,9 +94,13 @@ def __init__( # Initialize base class with data context super().__init__(data_context) + meta_columns = [ + col for col in table.column_names if col.startswith(constants.META_PREFIX) + ] # Split table into components self._data_table = table.drop_columns(context_columns + meta_columns) self._meta_table = table.select(meta_columns) if meta_columns else None + if len(self._data_table.column_names) == 0: raise ValueError("Data table must contain at least one data column.") @@ -112,6 +114,35 @@ def __init__( ) self._semantic_converter = semantic_converter + # process supplemented meta info if provided + if meta_info is not None: + # make sure it has the expected prefixes + meta_info = { + ( + f"{constants.META_PREFIX}{k}" + if not k.startswith(constants.META_PREFIX) + else k + ): v + for k, v in meta_info.items() + } + # Note that meta information cannot contain semantic types + typespec = typespec_utils.get_typespec_from_dict(meta_info) + new_meta_table = self._semantic_converter.from_python_to_arrow( + meta_info, typespec + ) + if self._meta_table is None: + self._meta_table = new_meta_table + else: + # drop any column that will be overwritten by the new meta table + keep_meta_columns = [ + c + for c in self._meta_table.column_names + if c not in new_meta_table.column_names + ] + self._meta_table = arrow_utils.hstack_tables( + self._meta_table.select(keep_meta_columns), new_meta_table + ) + # Create data context table data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) self._data_context_table = pa.Table.from_pylist( @@ -477,6 +508,8 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: # Apply updates meta_dict.update(prefixed_updates) + # TODO: properly handle case where meta data is None (it'll get inferred as NoneType) + # Create new meta table new_datagram._meta_table = ( pa.Table.from_pylist([meta_dict]) if meta_dict else None diff --git 
a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index f776365..503b83e 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -1,5 +1,6 @@ +from hmac import new import logging -from collections.abc import Collection +from collections.abc import Collection, Mapping from typing import Self @@ -81,7 +82,8 @@ class ArrowPacket(ArrowDatagram): def __init__( self, table: pa.Table, - source_info: dict[str, str | None] | None = None, + meta_info: Mapping[str, DataValue] | None = None, + source_info: Mapping[str, str | None] | None = None, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -94,19 +96,21 @@ def __init__( source_info = {} # normalize the table to ensure it has the expected source_info columns + # TODO: use simpler function to ensure source_info columns data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( table, {constants.SOURCE_PREFIX: source_info}, exclude_columns=[constants.CONTEXT_KEY], exclude_prefixes=[constants.META_PREFIX], ) - self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] super().__init__( data_table, + meta_info=meta_info, semantic_converter=semantic_converter, data_context=data_context, ) + self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] self._cached_source_info: dict[str, str | None] | None = None self._cached_python_schema: schemas.PythonSchema | None = None @@ -252,17 +256,12 @@ def source_info(self) -> dict[str, str | None]: } return self._cached_source_info.copy() - def copy(self) -> Self: - # TODO: restructure copy to allow for better inheritance and expansion - new_packet = self.__class__( - self.as_table(), - self.source_info(), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_packet._cached_source_info = self._cached_source_info - new_packet._cached_python_dict = self._cached_python_dict - new_packet._cached_python_schema = self._cached_python_schema - new_packet._cached_content_hash = self._cached_content_hash + # 8. Utility Operations + def copy(self, include_cache: bool = True) -> Self: + """Return a copy of the datagram.""" + new_packet = super().copy(include_cache=include_cache) + + if include_cache: + new_packet._cached_source_info = self._cached_source_info return new_packet diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index f253995..9f6d4a8 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -271,4 +271,6 @@ def with_context_key(self, new_context_key: str) -> Self: # 8. 
Utility Operations def copy(self) -> Self: """Create a shallow copy of the datagram.""" - return object.__new__(self.__class__) + new_datagram = object.__new__(self.__class__) + new_datagram._data_context = self._data_context + return new_datagram diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 5ebd926..9f7088f 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -1,3 +1,4 @@ +from curses import meta import logging from collections.abc import Collection, Iterator, Mapping from typing import Self, cast @@ -54,6 +55,7 @@ def __init__( self, data: Mapping[str, DataValue], typespec: TypeSpec | None = None, + meta_info: Mapping[str, DataValue] | None = None, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -96,7 +98,9 @@ def __init__( # Store data and meta components separately (immutable) self._data = dict(data_columns) - self._meta_data = dict(meta_columns) + if meta_info is not None: + meta_columns.update(meta_info) + self._meta_data = meta_columns # Combine provided typespec info with inferred typespec from content # If the column value is None and no type spec is provided, defaults to str. @@ -114,7 +118,7 @@ def __init__( semantic_type_registry=self._data_context.semantic_type_registry ), ) - self.semantic_converter = semantic_converter + self._semantic_converter = semantic_converter # Create schema for meta data self._meta_python_schema = schemas.PythonSchema( @@ -256,7 +260,7 @@ def arrow_schema( # Build data schema (cached) if self._cached_data_arrow_schema is None: self._cached_data_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( + self._semantic_converter.from_python_to_arrow_schema( self._data_python_schema ) ) @@ -272,7 +276,7 @@ def arrow_schema( if include_meta_columns and self._meta_data: if self._cached_meta_arrow_schema is None: self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( + self._semantic_converter.from_python_to_arrow_schema( self._meta_python_schema ) ) @@ -379,7 +383,7 @@ def _get_meta_arrow_table(self) -> pa.Table: def _get_meta_arrow_schema(self) -> pa.Schema: if self._cached_meta_arrow_schema is None: self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( + self._semantic_converter.from_python_to_arrow_schema( self._meta_python_schema ) ) @@ -412,7 +416,7 @@ def as_table( # Build data table (cached) if self._cached_data_table is None: - self._cached_data_table = self.semantic_converter.from_python_to_arrow( + self._cached_data_table = self._semantic_converter.from_python_to_arrow( self._data, self._data_python_schema, ) @@ -497,7 +501,7 @@ def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -542,7 +546,7 @@ def drop_meta_columns( return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -572,7 +576,7 @@ def select(self, *column_names: str) -> "DictDatagram": return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -605,7 +609,7 @@ def drop(self, *column_names: str, ignore_missing: bool 
= False) -> "DictDatagra return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -646,7 +650,7 @@ def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": return DictDatagram( data=full_data, typespec=new_typespec, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -685,7 +689,7 @@ def update(self, **updates: DataValue) -> "DictDatagram": return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, # Keep existing converter + semantic_converter=self._semantic_converter, # Keep existing converter data_context=self._data_context, ) @@ -770,7 +774,7 @@ def with_context_key(self, new_context_key: str) -> "DictDatagram": ) # 8. Utility Operations - def copy(self) -> Self: + def copy(self, include_cache: bool = True) -> Self: """ Create a shallow copy of the datagram. @@ -781,22 +785,19 @@ def copy(self) -> Self: Returns: New DictDatagram instance with copied data and caches. """ - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(self._meta_data) # Meta data - - new_datagram = self.__class__( - full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_data_table = self._cached_data_table - new_datagram._cached_meta_table = self._cached_meta_table - new_datagram._cached_content_hash = self._cached_content_hash - new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema - new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + new_datagram = super().copy() + new_datagram._data = self._data.copy() + new_datagram._meta_data = self._meta_data.copy() + new_datagram._data_python_schema = self._data_python_schema.copy() + new_datagram._semantic_converter = self._semantic_converter + new_datagram._meta_python_schema = self._meta_python_schema.copy() + + if include_cache: + new_datagram._cached_data_table = self._cached_data_table + new_datagram._cached_meta_table = self._cached_meta_table + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema + new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema return new_datagram diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index 92bf6aa..ea9b7fa 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -1,6 +1,7 @@ import logging from collections.abc import Collection, Mapping from typing import Self +from xml.etree.ElementInclude import include import pyarrow as pa @@ -46,6 +47,7 @@ class DictPacket(DictDatagram): def __init__( self, data: Mapping[str, DataValue], + meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, typespec: TypeSpec | None = None, semantic_converter: SemanticConverter | None = None, @@ -64,6 +66,7 @@ def __init__( super().__init__( data_only, typespec=typespec, + meta_info=meta_info, semantic_converter=semantic_converter, data_context=data_context, ) @@ -235,7 +238,7 @@ def as_datagram( return DictDatagram( data, typespec=typespec, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -248,9 +251,10 @@ def 
source_info(self) -> dict[str, str | None]: """ return {key: self._source_info.get(key, None) for key in self.keys()} - def copy(self) -> Self: + def copy(self, include_cache: bool = True) -> Self: """Return a shallow copy of the packet.""" - instance = super().copy() + instance = super().copy(include_cache=include_cache) instance._source_info = self._source_info.copy() - instance._cached_source_info_table = self._cached_source_info_table + if include_cache: + instance._cached_source_info_table = self._cached_source_info_table return instance From 08fa0ef75e3cafae27441097946884eca51437a1 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 23 Jul 2025 06:09:29 +0000 Subject: [PATCH 088/224] feat: clean implementation of pipeline nodes --- src/orcapod/pipeline/graph.py | 8 +- src/orcapod/pipeline/nodes.py | 203 ++++++++++++++++++++++- src/orcapod/protocols/store_protocols.py | 2 + src/orcapod/stores/delta_lake_stores.py | 2 + 4 files changed, 207 insertions(+), 8 deletions(-) diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 3266e3b..0a371f5 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -98,12 +98,16 @@ def wrap_invocation( if invocation in self.invocation_to_pod_lut: pod = self.invocation_to_pod_lut[invocation] node = PodNode( - pod=pod, fixed_input_streams=new_input_streams, label=invocation.label + pod=pod, + input_streams=new_input_streams, + pipeline_store=self.pipeline_store, + label=invocation.label, ) else: node = KernelNode( kernel=invocation.kernel, - fixed_input_streams=new_input_streams, + input_streams=new_input_streams, + pipeline_store=self.pipeline_store, label=invocation.label, ) return node diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 13347f6..d9e34da 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,6 +1,18 @@ +from ast import Not +from collections.abc import Collection from orcapod.data.kernels import WrappedKernel -from orcapod.data.pods import WrappedPod +from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp +from orcapod.data.streams import PodStream +from orcapod.utils.lazy_module import LazyModule +from typing import TYPE_CHECKING +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.utils import arrow_utils + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") class KernelNode(WrappedKernel): @@ -9,9 +21,36 @@ class KernelNode(WrappedKernel): This node can be used to execute the kernel and process data streams. """ - def __init__(self, kernel: dp.Kernel, **kwargs) -> None: + def __init__( + self, + kernel: dp.Kernel, + input_streams: Collection[dp.Stream], + pipeline_store: ArrowDataStore, + **kwargs, + ) -> None: super().__init__(kernel=kernel, **kwargs) self.kernel = kernel + self.input_streams = tuple(input_streams) + self.pipeline_store = pipeline_store + self._cached_stream: dp.Stream | None = None + + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + if len(streams) > 0: + raise NotImplementedError( + "At this moment, PodNode does not yet support handling additional input streams." + ) + return super().pre_process_input_streams(*self.input_streams) + + def forward(self, *args, **kwargs) -> dp.Stream: + """ + Forward the data through the kernel and return a PodStream. + This method can be overridden to customize the forwarding behavior. 
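        A usage sketch (illustrative only; `my_kernel`, `upstream`, and `store` are
        hypothetical stand-ins for a real kernel, an input stream, and an ArrowDataStore):

            node = KernelNode(kernel=my_kernel, input_streams=(upstream,), pipeline_store=store)
            out = node()             # runs the wrapped kernel against the fixed input_streams
            table = out.as_table()   # results can then be materialized, e.g. as a pyarrow Table
            out_again = node()       # the kernel.forward(...) result is cached on the node, so it is not recomputed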
+ """ + if self._cached_stream is None: + # TODO: reconsider this logic -- if we were to allow semijoin with inputs in the future + # this caching needs to be done more carefully + self._cached_stream = self.kernel.forward(*args, **kwargs) + return self._cached_stream def __repr__(self): return f"KernelNode(kernel={self.kernel!r})" @@ -20,13 +59,165 @@ def __str__(self): return f"KernelNode:{self.kernel!s}" -class PodNode(WrappedPod): - def __init__(self, pod: dp.Pod, **kwargs) -> None: - super().__init__(pod=pod, **kwargs) - self.pod = pod +class PodNode(CachedPod): + PIPELINE_RESULT_PATH = ("_results",) + + def __init__( + self, + pod: dp.Pod, + input_streams: Collection[dp.Stream], + pipeline_store: ArrowDataStore, + result_store: ArrowDataStore | None = None, + record_path_prefix: tuple[str, ...] = (), + **kwargs, + ) -> None: + self.pipeline_path_prefix = record_path_prefix + if result_store is None: + record_path_prefix += self.PIPELINE_RESULT_PATH + result_store = pipeline_store + super().__init__( + pod=pod, + result_store=result_store, + record_path_prefix=record_path_prefix, + **kwargs, + ) + self.pipeline_store = pipeline_store + self.input_streams = tuple(input_streams) + self._cached_stream: dp.LiveStream | None = None + + @property + def pipeline_path(self) -> tuple[str, ...]: + """ + Return the path to the pipeline run records. + This is used to store the run-associated tag info. + """ + return self.pipeline_path_prefix + self.kernel_id def __repr__(self): return f"PodNode(pod={self.pod!r})" def __str__(self): return f"PodNode:{self.pod!s}" + + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + if len(streams) > 0: + raise NotImplementedError( + "At this moment, PodNode does not yet support handling additional input streams." + ) + return super().pre_process_input_streams(*self.input_streams) + + def __call__(self, *args, **kwargs) -> dp.LiveStream: + """ + Forward the data through the pod and return a PodStream. + This method can be overridden to customize the forwarding behavior. 
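        A construction sketch (illustrative only; `my_pod`, `upstream`, `results`, and
        `pipeline` are hypothetical stand-ins):

            node = PodNode(
                pod=my_pod,
                input_streams=(upstream,),
                pipeline_store=pipeline,   # run/tag records are written under node.pipeline_path
                result_store=results,      # per-packet outputs; falls back to pipeline_store if None
            )
            live = node()  # cached LiveStream; each computed packet also adds a pipeline record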
+ """ + if self._cached_stream is None: + self._cached_stream = super().__call__(*args, **kwargs) + return self._cached_stream + + def call( + self, + tag: dp.Tag, + packet: dp.Packet, + skip_record_check: bool = False, + skip_recording: bool = False, + overwrite_existing: bool = False, + ) -> tuple[dp.Tag, dp.Packet | None]: + tag, output_packet = super().call( + tag, + packet, + skip_record_check=skip_record_check, + skip_recording=skip_recording, + overwrite_existing=overwrite_existing, + ) + if output_packet is not None: + retrieved = ( + output_packet.get_meta_value(self.DATA_RETRIEVED_FLAG) is not None + ) + # add pipeline record if the output packet is not None + self.add_pipeline_record(tag, packet, retrieved=retrieved) + return tag, output_packet + + def add_pipeline_record( + self, tag: dp.Tag, input_packet: dp.Packet, retrieved: bool | None = None + ) -> None: + # combine dp.Tag with packet content hash to compute entry hash + tag_with_hash = tag.as_table().append_column( + self.PACKET_HASH_COLUMN, + pa.array([input_packet.content_hash()], type=pa.large_string()), + ) + entry_id = self.data_context.arrow_hasher.hash_table( + tag_with_hash, prefix_hasher_id=True + ) + + existing_record = self.pipeline_store.get_record_by_id( + self.pipeline_path, + entry_id, + ) + + if existing_record is not None: + # if the record already exists, return it + return + + # no record matching, so construct the full record + + input_packet_info = ( + input_packet.as_table( + include_source=True, + ) + .append_column( + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + .append_column( + self.DATA_RETRIEVED_FLAG, + pa.array([retrieved], type=pa.bool_()), + ) + .drop(input_packet.keys()) + ) + + combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) + + self.pipeline_store.add_record( + self.pipeline_path, + entry_id, + combined_record, + ignore_duplicates=False, + ) + + def _get_all_records(self) -> "pa.Table | None": + results = self.result_store.get_all_records( + self.record_path, record_id_column=self.PACKET_HASH_COLUMN + ) + + if self.pipeline_store is None: + raise ValueError( + "Pipeline store is not configured, cannot retrieve tag info" + ) + taginfo = self.pipeline_store.get_all_records( + self.record_path, + ) + + if results is None or taginfo is None: + return None + + tag_columns = [ + c + for c in taginfo.column_names + if not c.startswith(constants.META_PREFIX) + and not c.startswith(constants.SOURCE_PREFIX) + ] + + packet_columns = [ + c for c in results.column_names if c != self.PACKET_HASH_COLUMN + ] + + # TODO: do not hardcode the join keys + joined_info = taginfo.join( + results, + self.PACKET_HASH_COLUMN, + join_type="inner", + ) + + joined_info = joined_info.select([*tag_columns, *packet_columns]) + return joined_info diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index 0356270..4940033 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -10,6 +10,7 @@ def add_record( record_id: str, data: pa.Table, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, ) -> str | None: ... def add_records( @@ -18,6 +19,7 @@ def add_records( records: pa.Table, record_id_column: str | None = None, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, ) -> list[str]: ... 
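    # A call sketch for the new `overwrite_existing` flag (illustrative only; the store
    # instance, record path, and tables below are hypothetical):
    #
    #     store: ArrowDataStore = ...          # any implementation of this protocol
    #     path = ("my_pod", "v1")
    #     store.add_record(path, "abc123", table)                               # first write
    #     store.add_record(path, "abc123", new_table, overwrite_existing=True)  # replace the existing record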
def get_record_by_id( diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 218c0e0..8dc6a1d 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -395,6 +395,7 @@ def add_record( record_id: str, data: pa.Table, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, force_flush: bool = False, ) -> pa.Table: self._validate_record_path(record_path) @@ -472,6 +473,7 @@ def add_records( records: pa.Table, record_id_column: str | None = None, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, ) -> list[str]: raise NotImplementedError( "add_records is not implemented in BasicDeltaTableArrowStore yet. " From 38b155b72a6f2d5c2bd146a26efef098d7036616 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 24 Jul 2025 08:59:01 +0000 Subject: [PATCH 089/224] refactor: rename pre-kernel step to be more explicit --- src/orcapod/data/kernels.py | 30 ++++++++++-------- src/orcapod/data/pods.py | 62 +++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 5392cb4..1cda423 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections.abc import Collection from typing import Any from orcapod.protocols import data_protocols as dp import logging @@ -52,13 +53,13 @@ def data_context_key(self) -> str: @abstractmethod def kernel_id(self) -> tuple[str, ...]: ... - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing on the input streams before the main computation. This is useful if you need to modify the input streams or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the - pre-processing step will be tracked separately from the main computation in forward. - By default, it returns the input streams unchanged. + pre-processing step will be tracked outside of the computation in the kernel. + Default implementation is a no-op, returning the input streams unchanged. """ return streams @@ -86,7 +87,7 @@ def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> Non def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - processed_streams = self.pre_process_input_streams(*streams) + processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) output_stream = self.prepare_output_stream(*processed_streams, label=label) self.track_invocation(*processed_streams, label=label) @@ -101,7 +102,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: """ def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - processed_streams = self.pre_process_input_streams(*streams) + processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) return self.kernel_output_types(*processed_streams) @@ -117,9 +118,11 @@ def __str__(self): return self.__class__.__name__ @abstractmethod - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: ... + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: ... 
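    # A subclass sketch for the new keyword-style signature (illustrative only; `MyKernel`
    # and its `config` attribute are hypothetical, and the module's existing imports of
    # Collection, Any, and dp are assumed):
    #
    #     class MyKernel(TrackedKernelBase):
    #         def kernel_identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any:
    #             structure: tuple[Any, ...] = (self.__class__.__name__, self.config)
    #             if streams is not None:
    #                 # include the (already pre-processed) input streams when they are given
    #                 structure += tuple(streams)
    #             return structure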
- def identity_structure(self, *streams: dp.Stream) -> Any: + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: # Default implementation of identity_structure for the kernel only # concerns the kernel class and the streams if present. Subclasses of # Kernels should override this method to provide a more meaningful @@ -134,10 +137,9 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the # equivalence of the two by returning the same identity structure for both invocations. # This can be achieved, for example, by returning a set over the streams instead of a tuple. - if len(streams) > 0: - streams = self.pre_process_input_streams(*streams) - self.validate_inputs(*streams) - return self.kernel_identity_structure(*streams) + if streams is not None: + streams = self.pre_kernel_processing(*streams) + return self.kernel_identity_structure(streams) class WrappedKernel(TrackedKernelBase): @@ -179,5 +181,7 @@ def __repr__(self): def __str__(self): return f"WrappedKernel:{self.kernel!s}" - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: - return self.kernel.identity_structure(*streams) + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return self.kernel.identity_structure(streams) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index bae7c9b..a66cfc6 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -10,9 +10,9 @@ ArrowPacket, ) from orcapod.data.context import DataContext -from orcapod.data.kernels import TrackedKernelBase +from orcapod.data.kernels import KernelStream, TrackedKernelBase from orcapod.data.operators import Join -from orcapod.data.streams import PodStream +from orcapod.data.streams import LazyPodResultStream, PodStream from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp @@ -55,11 +55,6 @@ def output_packet_types(self) -> TypeSpec: """ ... - @abstractmethod - def call( - self, tag: dp.Tag, packet: dp.Packet - ) -> tuple[dp.Tag, dp.Packet | None]: ... - def __init__( self, error_handling: error_handling_options = "raise", @@ -103,17 +98,16 @@ def _join_streams(*streams: dp.Stream) -> dp.Stream: joined_stream = Join()(joined_stream, next_stream) return joined_stream - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Prepare the incoming streams for execution in the pod. At least one stream must be present. If more than one stream is present, the join of the provided streams will be returned. 
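        Behaviour sketch (the stream objects are hypothetical):

            pod.pre_kernel_processing(s1)          # -> (s1,): a single stream passes through unchanged
            pod.pre_kernel_processing(s1, s2, s3)  # -> (Join()(Join()(s1, s2), s3),): joined pairwise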
""" # if multiple streams are provided, join them # otherwise, return as is - if len(streams) == 0: - raise ValueError( - f"{self.__class__.__name__} expects at least one input stream" - ) + if len(streams) <= 1: + return streams + output_stream = self._join_streams(*streams) return (output_stream,) @@ -134,20 +128,17 @@ def validate_inputs(self, *streams: dp.Stream) -> None: def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None - ) -> dp.LiveStream: - output_stream = self.forward(*streams) - output_stream.label = label - return output_stream + ) -> KernelStream: + return KernelStream(source=self, upstreams=streams, label=label) - def forward(self, *streams: dp.Stream) -> PodStream: + def forward(self, *streams: dp.Stream) -> dp.Stream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" - input_stream = streams[0] + return LazyPodResultStream(pod=self, prepared_stream=streams[0]) - return PodStream( - self, - input_stream, - error_handling=cast(error_handling_options, self.error_handling), - ) + @abstractmethod + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[dp.Tag, dp.Packet | None]: ... def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: if not self._skip_tracking and self._tracker_manager is not None: @@ -252,10 +243,7 @@ def __init__( @property def kernel_id(self) -> tuple[str, ...]: - return ( - self.function_name, - self.data_context.object_hasher.hash_to_hex(self), - ) + return (self.function_name,) def input_packet_types(self) -> PythonSchema: """ @@ -323,7 +311,9 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) return tag, output_packet - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: # construct identity structure for the function # if function_info_extractor is available, use that but substitute the function_name @@ -348,8 +338,8 @@ def kernel_identity_structure(self, *streams: dp.Stream) -> Any: ) # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure - if len(streams) != 0: - id_struct += (streams[0],) + if streams is not None and len(streams) != 0: + id_struct += tuple(streams) return id_struct @@ -358,6 +348,7 @@ class WrappedPod(ActivatablePodBase): """ A wrapper for an existing pod, allowing for additional functionality or modifications without changing the original pod. This class is meant to serve as a base class for other pods that need to wrap existing pods. + Note that only the call logic is pass through to the wrapped pod, but the forward logic is not. """ def __init__( @@ -401,11 +392,16 @@ def output_packet_types(self) -> TypeSpec: """ return self.pod.output_packet_types() + def validate_inputs(self, *streams: dp.Stream) -> None: + self.pod.validate_inputs(*streams) + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: return self.pod.call(tag, packet) - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: - return self.pod.identity_structure(*streams) + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return self.pod.identity_structure(streams) def __repr__(self) -> str: return f"WrappedPod({self.pod!r})" @@ -446,7 +442,7 @@ def record_path(self) -> tuple[str, ...]: Return the path to the record in the result store. 
This is used to store the results of the pod. """ - return self.record_path_prefix + self.kernel_id + return self.record_path_prefix + self.kernel_id + (self.pod_hash,) def call( self, From 3351cf953d76df35cfd3e92122af424d09bba7ef Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 24 Jul 2025 09:40:47 +0000 Subject: [PATCH 090/224] refactor: extract node base class --- src/orcapod/data/trackers.py | 6 +- src/orcapod/pipeline/nodes.py | 189 ++++++++++++++++++++++++---------- 2 files changed, 138 insertions(+), 57 deletions(-) diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 3cf42a9..799334e 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -3,7 +3,7 @@ from orcapod.data.context import DataContext from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict -from collections.abc import Generator +from collections.abc import Generator, Collection from abc import ABC, abstractmethod from typing import Any, TYPE_CHECKING from contextlib import contextmanager @@ -139,7 +139,7 @@ def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: def __call__(self, *args: Any, **kwargs: Any) -> dp.Stream: return self.forward(*args, **kwargs) - def identity_structure(self, *streams: dp.Stream) -> Any: + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: # FIXME: using label as a stop-gap for identity structure return self.label @@ -194,7 +194,7 @@ def identity_structure(self) -> Any: Return a structure that represents the identity of this invocation. This is used to uniquely identify the invocation in the tracker. """ - return self.kernel.identity_structure(*self.upstreams) + return self.kernel.identity_structure(self.upstreams) def __repr__(self) -> str: return f"Invocation(kernel={self.kernel}, upstreams={self.upstreams}, label={self.label})" diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index d9e34da..7372175 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,11 +1,13 @@ from ast import Not -from collections.abc import Collection -from orcapod.data.kernels import WrappedKernel +from collections.abc import Collection, Iterator +from datetime import datetime +from orcapod.data.kernels import WrappedKernel, TrackedKernelBase from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp from orcapod.data.streams import PodStream +from orcapod.types import TypeSpec from orcapod.utils.lazy_module import LazyModule -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from orcapod.data.system_constants import orcapod_constants as constants from orcapod.utils import arrow_utils @@ -15,53 +17,144 @@ pa = LazyModule("pyarrow") -class KernelNode(WrappedKernel): +class Node( + TrackedKernelBase, +): """ - A node in the pipeline that represents a kernel. - This node can be used to execute the kernel and process data streams. + Mixin class for pipeline nodes """ def __init__( self, - kernel: dp.Kernel, input_streams: Collection[dp.Stream], pipeline_store: ArrowDataStore, + pipeline_path_prefix: tuple[str, ...] 
= (), **kwargs, - ) -> None: - super().__init__(kernel=kernel, **kwargs) - self.kernel = kernel + ): + super().__init__(**kwargs) + self._cached_stream: dp.LiveStream | None = None self.input_streams = tuple(input_streams) self.pipeline_store = pipeline_store - self._cached_stream: dp.Stream | None = None + self.pipeline_path_prefix = pipeline_path_prefix + # compute invocation hash - note that empty () is passed into identity_structure to signify + # identity structure of invocation with no input streams + self.invocation_hash = self.data_context.object_hasher.hash_to_hex( + self.identity_structure(()), prefix_hasher_id=True + ) + + @property + def pipeline_path(self) -> tuple[str, ...]: + """ + Return the path to the pipeline run records. + This is used to store the run-associated tag info. + """ + return self.pipeline_path_prefix + self.kernel_id + (self.invocation_hash,) - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def validate_inputs(self, *processed_streams: dp.Stream) -> None: + pass + + def forward(self, *streams: dp.Stream) -> dp.Stream: if len(streams) > 0: raise NotImplementedError( - "At this moment, PodNode does not yet support handling additional input streams." + "At this moment, Node does not yet support handling additional input streams." ) - return super().pre_process_input_streams(*self.input_streams) + # TODO: re-evaluate the use here + # super().validate_inputs(*self.input_streams) + return super().forward(*self.input_streams) - def forward(self, *args, **kwargs) -> dp.Stream: - """ - Forward the data through the kernel and return a PodStream. - This method can be overridden to customize the forwarding behavior. - """ + def __call__(self, *args, **kwargs) -> dp.LiveStream: if self._cached_stream is None: - # TODO: reconsider this logic -- if we were to allow semijoin with inputs in the future - # this caching needs to be done more carefully - self._cached_stream = self.kernel.forward(*args, **kwargs) + self._cached_stream = super().__call__(*args, **kwargs) return self._cached_stream + # properties and methods to act as a dp.Stream + @property + def source(self) -> dp.Kernel | None: + return self + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + return () + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + return self().keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + return self().types() + + @property + def last_modified(self) -> datetime | None: + return self().last_modified + + @property + def is_current(self) -> bool: + return self().is_current + + def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self().__iter__() + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self().iter_packets() + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> "pa.Table": + return self().as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + return self().flow() + + +class KernelNode(Node, WrappedKernel): + """ + A node in the pipeline that represents a kernel. + This node can be used to execute the kernel and process data streams. + """ + + def __init__( + self, + kernel: dp.Kernel, + input_streams: Collection[dp.Stream], + pipeline_store: ArrowDataStore, + pipeline_path_prefix: tuple[str, ...] 
= (), + **kwargs, + ) -> None: + super().__init__( + kernel=kernel, + input_streams=input_streams, + pipeline_store=pipeline_store, + pipeline_path_prefix=pipeline_path_prefix, + **kwargs, + ) + def __repr__(self): return f"KernelNode(kernel={self.kernel!r})" def __str__(self): return f"KernelNode:{self.kernel!s}" + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + """ + Return the identity structure of the node. + This is used to compute the invocation hash. + """ + # construct identity structure from the node's information and the + # contained kernel + if streams is not None and len(streams) > 0: + raise NotImplementedError( + "At this moment, Node does not yet support handling additional input streams." + ) + return self.kernel.identity_structure(self.input_streams) -class PodNode(CachedPod): - PIPELINE_RESULT_PATH = ("_results",) +class PodNode(Node, CachedPod): def __init__( self, pod: dp.Pod, @@ -69,29 +162,20 @@ def __init__( pipeline_store: ArrowDataStore, result_store: ArrowDataStore | None = None, record_path_prefix: tuple[str, ...] = (), + pipeline_path_prefix: tuple[str, ...] = (), **kwargs, ) -> None: - self.pipeline_path_prefix = record_path_prefix - if result_store is None: - record_path_prefix += self.PIPELINE_RESULT_PATH - result_store = pipeline_store super().__init__( pod=pod, result_store=result_store, record_path_prefix=record_path_prefix, + input_streams=input_streams, + pipeline_store=pipeline_store, + pipeline_path_prefix=pipeline_path_prefix, **kwargs, ) self.pipeline_store = pipeline_store - self.input_streams = tuple(input_streams) - self._cached_stream: dp.LiveStream | None = None - - @property - def pipeline_path(self) -> tuple[str, ...]: - """ - Return the path to the pipeline run records. - This is used to store the run-associated tag info. - """ - return self.pipeline_path_prefix + self.kernel_id + # self.input_streams = tuple(input_streams) def __repr__(self): return f"PodNode(pod={self.pod!r})" @@ -99,22 +183,6 @@ def __repr__(self): def __str__(self): return f"PodNode:{self.pod!s}" - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: - if len(streams) > 0: - raise NotImplementedError( - "At this moment, PodNode does not yet support handling additional input streams." - ) - return super().pre_process_input_streams(*self.input_streams) - - def __call__(self, *args, **kwargs) -> dp.LiveStream: - """ - Forward the data through the pod and return a PodStream. - This method can be overridden to customize the forwarding behavior. - """ - if self._cached_stream is None: - self._cached_stream = super().__call__(*args, **kwargs) - return self._cached_stream - def call( self, tag: dp.Tag, @@ -221,3 +289,16 @@ def _get_all_records(self) -> "pa.Table | None": joined_info = joined_info.select([*tag_columns, *packet_columns]) return joined_info + + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + """ + Return the identity structure of the node. + This is used to compute the invocation hash. + """ + # construct identity structure from the node's information and the + # contained kernel + if streams is not None and len(streams) > 0: + raise NotImplementedError( + "At this moment, Node does not yet support handling additional input streams." + ) + return self.pod.identity_structure(self.input_streams) From 7ff5a5105a6591ad9a4668968b5faac6271f4449 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 24 Jul 2025 09:42:05 +0000 Subject: [PATCH 091/224] refactor: import cleanup and additional todos --- src/orcapod/data/datagrams/arrow_datagram.py | 25 + src/orcapod/data/streams.py | 569 ++++++++++++++----- src/orcapod/pipeline/graph.py | 4 + src/orcapod/protocols/data_protocols.py | 79 ++- src/orcapod/stores/delta_lake_stores.py | 2 +- src/orcapod/types/semantic_converter.py | 17 + src/orcapod/types/semantic_types.py | 16 + 7 files changed, 565 insertions(+), 147 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 5ceb3cb..c29cf58 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -456,6 +456,31 @@ def as_table( return arrow_utils.hstack_tables(*all_tables) + def as_arrow_compatible_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation compatible with Arrow. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation compatible with Arrow + """ + return self.as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ).to_pylist()[0] + # 5. Meta Column Operations def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: """ diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index d0ecce3..f0178d5 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,14 +1,12 @@ import logging +from pathlib import Path import warnings from abc import ABC, abstractmethod from collections.abc import Collection, Iterator from datetime import datetime, timezone from itertools import repeat -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal -import pyarrow as pa - -from orcapod.data.system_constants import orcapod_constants as constants from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.context import DataContext from orcapod.data.datagrams import ( @@ -16,10 +14,17 @@ ArrowTag, DictTag, ) +from orcapod.data.system_constants import orcapod_constants as constants from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec, schemas from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") # TODO: consider using this instead of making copy of dicts # from types import MappingProxyType @@ -48,6 +53,8 @@ def __init__( self._upstreams = upstreams self._last_modified: datetime | None = None self._set_modified_time() + # note that this is not necessary for Stream protocol, but is provided + # for convenience to resolve semantic types and other context-specific information self._data_context = DataContext.resolve_data_context(data_context) @property @@ -168,143 +175,10 @@ def identity_structure(self) -> Any: """ if self.source is not None: # if the stream is generated by an operation, use the identity structure from the invocation - return self.source.identity_structure(*self.upstreams) + return 
self.source.identity_structure(self.upstreams) return super().identity_structure() -class KernelStream(StreamBase): - """ - Recomputable stream that wraps a streams produced by a kernel to provide - an abstraction over the stream, taking the stream's source and upstreams as the basis of - recomputing the stream. - - This stream is used to represent the output of a kernel invocation. - """ - - def __init__( - self, - output_stream: dp.Stream | None = None, - source: dp.Kernel | None = None, - upstreams: tuple[ - dp.Stream, ... - ] = (), # if provided, this will override the upstreams of the output_stream - **kwargs, - ) -> None: - if (output_stream is None or output_stream.source is None) and source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - if source is None: - if output_stream is None or output_stream.source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - source = output_stream.source - upstreams = upstreams or output_stream.upstreams - - super().__init__(source=source, upstreams=upstreams, **kwargs) - self._cached_stream = output_stream - - def clear_cache(self) -> None: - """ - Clears the cached stream. - This is useful for re-processing the stream with the same kernel. - """ - self._cached_stream = None - self._set_modified_time(invalidate=True) - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.keys() - - def types(self) -> tuple[TypeSpec, TypeSpec]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.types() - - @property - def is_current(self) -> bool: - if self._cached_stream is None or not super().is_current: - status = self.refresh() - if not status: # if it failed to update for whatever reason - return False - return True - - def refresh(self, force: bool = False) -> bool: - updated = False - if force or (self._cached_stream is not None and not super().is_current): - self.clear_cache() - - if self._cached_stream is None: - assert self.source is not None, ( - "Stream source must be set to recompute the stream." - ) - self._cached_stream = self.source.forward(*self.upstreams) - self._set_modified_time() - updated = True - - if self._cached_stream is None: - # TODO: use beter error type - raise ValueError( - "Stream could not be updated. Ensure that the source is valid and upstreams are correct." - ) - - return updated - - def invalidate(self) -> None: - """ - Invalidate the stream, marking it as needing recomputation. - This will clear the cached stream and set the last modified time to None. 
- """ - self.clear_cache() - self._set_modified_time(invalidate=True) - - @property - def last_modified(self) -> datetime | None: - if self._cached_stream is None: - return None - return self._cached_stream.last_modified - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_content_hash: bool | str = False, - ) -> pa.Table: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - return self._cached_stream.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_content_hash=include_content_hash, - ) - - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - yield from self._cached_stream.iter_packets() - - def __repr__(self) -> str: - return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" - - class ImmutableTableStream(StreamBase): """ An immutable stream based on a PyArrow Table. @@ -486,23 +360,287 @@ def __repr__(self) -> str: ) +class KernelStream(StreamBase): + """ + Recomputable stream that wraps a stream produced by a kernel to provide + an abstraction over the stream, taking the stream's source and upstreams as the basis of + recomputing the stream. + + This stream is used to represent the output of a kernel invocation. + """ + + def __init__( + self, + output_stream: dp.Stream | None = None, + source: dp.Kernel | None = None, + upstreams: tuple[ + dp.Stream, ... + ] = (), # if provided, this will override the upstreams of the output_stream + **kwargs, + ) -> None: + if (output_stream is None or output_stream.source is None) and source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + if source is None: + if output_stream is None or output_stream.source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + source = output_stream.source + upstreams = upstreams or output_stream.upstreams + + super().__init__(source=source, upstreams=upstreams, **kwargs) + self._cached_stream = output_stream + + def clear_cache(self) -> None: + """ + Clears the cached stream. + This is useful for re-processing the stream with the same kernel. + """ + self._cached_stream = None + self._set_modified_time(invalidate=True) + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." 
+ ) + return self._cached_stream.types() + + @property + def is_current(self) -> bool: + if self._cached_stream is None or not super().is_current: + status = self.refresh() + if not status: # if it failed to update for whatever reason + return False + return True + + def refresh(self, force: bool = False) -> bool: + updated = False + if force or (self._cached_stream is not None and not super().is_current): + self.clear_cache() + + if self._cached_stream is None: + assert self.source is not None, ( + "Stream source must be set to recompute the stream." + ) + self._cached_stream = self.source.forward(*self.upstreams) + self._set_modified_time() + updated = True + + if self._cached_stream is None: + # TODO: use beter error type + raise ValueError( + "Stream could not be updated. Ensure that the source is valid and upstreams are correct." + ) + + return updated + + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) + + @property + def last_modified(self) -> datetime | None: + if self._cached_stream is None: + return None + return self._cached_stream.last_modified + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.iter_packets() + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" + + +class LazyPodResultStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. + This is what Pod.process() returns - it's static/fixed but efficient. 
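    A behaviour sketch (illustrative only; `pod` and `prepared` are a hypothetical Pod and
    its already-prepared input stream):

        lazy = LazyPodResultStream(pod=pod, prepared_stream=prepared)
        first = list(lazy.iter_packets())   # packets are computed via pod.call() and cached by position
        second = list(lazy.iter_packets())  # replayed from the per-packet cache, nothing is recomputed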
+ """ + + def __init__(self, pod: dp.Pod, prepared_stream: dp.Stream, **kwargs): + super().__init__(source=pod, upstreams=(prepared_stream,), **kwargs) + self.pod = pod + self.prepared_stream = prepared_stream + self._set_modified_time() # set modified time to when we obtain the iterator + # capture the immutable iterator from the prepared stream + self._prepared_stream_iterator = prepared_stream.iter_packets() + + # Packet-level caching (from your PodStream) + self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet | None]] = {} + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + if self._prepared_stream_iterator is not None: + for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + if i in self._cached_output_packets: + # Use cached result + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + else: + # Process packet + processed = self.pod.call(tag, packet) + if processed is not None: + # Update shared cache for future iterators (optimization) + self._cached_output_packets[i] = processed + tag, packet = processed + if packet is not None: + yield tag, packet + + # Mark completion by releasing the iterator + self._prepared_stream_iterator = None + else: + # Yield from snapshot of complete cache + for i in range(len(self._cached_output_packets)): + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + + tag_keys, _ = self.prepared_stream.keys() + packet_keys = tuple(self.pod.output_packet_types().keys()) + return tag_keys, packet_keys + + def types(self) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, _ = self.prepared_stream.types() + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_packet_types()) + return tag_typespec, packet_typespec + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets(): + if tag_schema is None: + tag_schema = tag.arrow_schema() + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_context=True, + include_source=True, + ) + all_tags.append(tag.as_dict()) + # FIXME: using in the pinch conversion to str from path + # replace with an appropriate semantic converter-based approach! + dict_patcket = packet.as_dict(include_context=True, include_source=True) + for k, v in dict_patcket.items(): + if isinstance(v, Path): + dict_patcket[k] = str(v) + all_packets.append(dict_patcket) + + # FIXME: this skips the semantic version conversion and thus is not + # fully correct! + all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema) + all_packets: pa.Table = pa.Table.from_pylist( + all_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables(all_tags, all_packets) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." 
+ ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + + output_table = self._cached_output_table.drop(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + return output_table + + class PodStream(StreamBase): def __init__( self, pod: dp.Pod, - input_stream: dp.Stream, + input_streams: tuple[dp.Stream, ...], error_handling: Literal["raise", "ignore", "warn"] = "raise", **kwargs, ) -> None: - super().__init__(upstreams=(input_stream,), **kwargs) + super().__init__(upstreams=input_streams, **kwargs) self.pod = pod - self.input_stream = input_stream + self.input_streams = input_streams self.error_handling = error_handling self._source = pod # Cache for processed packets # This is a dictionary mapping the index of the packet in the input stream to a tuple of (Tag, Packet) # This allows us to efficiently access the processed packets without re-processing them + self._cached_forward_stream: dp.Stream | None = None self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet]] = {} self._computation_complete: bool = False self._cached_output_table: pa.Table | None = None @@ -516,18 +654,27 @@ def source(self) -> dp.Pod | None: """ return self._source + def forward_stream(self) -> dp.Stream: + if self._cached_forward_stream is None: + self._cached_forward_stream = self.pod.forward(*self.input_streams) + return self._cached_forward_stream + + @property + def is_current(self) -> bool: + return self.forward_stream().is_current + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - tag_keys, _ = self.input_stream.keys() + tag_keys, _ = self.forward_stream().keys() packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys def types(self) -> tuple[TypeSpec, TypeSpec]: - tag_typespec, _ = self.input_stream.types() + tag_typespec, _ = self.forward_stream().types() # TODO: check if copying can be avoided packet_typespec = dict(self.pod.output_packet_types()) return tag_typespec, packet_typespec @@ -537,6 +684,7 @@ def clear_cache(self) -> None: Clears the cached results of the processed stream. This is useful for re-processing the stream with the same processor. 
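+        Note that this also drops the cached forward stream, so the wrapped
+        pod's forward() is invoked again on the input streams the next time
+        the stream is used.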
""" + self._cached_forward_stream = None self._cached_output_packets = {} self._computation_complete = False self._cached_output_table = None @@ -624,7 +772,7 @@ def as_table( def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() if not self._computation_complete or self._cached_output_packets is None: - for i, (tag, packet) in enumerate(self.input_stream.iter_packets()): + for i, (tag, packet) in enumerate(self.forward_stream().iter_packets()): if i not in self._cached_output_packets: try: processed_tag, processed_packet = self.pod.call(tag, packet) @@ -708,3 +856,136 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: def identity_structure(self) -> Any: return self._stream.identity_structure() + + +class InvokedPodStream(StreamBase): + """ + Recomputable stream that wraps a streams produced by a kernel to provide + an abstraction over the stream, taking the stream's source and upstreams as the basis of + recomputing the stream. + + This stream is used to represent the output of a kernel invocation. + """ + + def __init__( + self, + pod_stream: PodStream | None = None, + source: dp.Pod | None = None, + upstreams: tuple[ + dp.Stream, ... + ] = (), # if provided, this will override the upstreams of the output_stream + **kwargs, + ) -> None: + if (pod_stream is None or output_stream.source is None) and source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + if source is None: + if output_stream is None or output_stream.source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + source = output_stream.source + upstreams = upstreams or output_stream.upstreams + + super().__init__(source=source, upstreams=upstreams, **kwargs) + self._cached_stream = output_stream + + def clear_cache(self) -> None: + """ + Clears the cached stream. + This is useful for re-processing the stream with the same kernel. + """ + self._cached_stream = None + self._set_modified_time(invalidate=True) + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.types() + + @property + def is_current(self) -> bool: + if self._cached_stream is None or not super().is_current: + status = self.refresh() + if not status: # if it failed to update for whatever reason + return False + return True + + def refresh(self, force: bool = False) -> bool: + updated = False + if force or (self._cached_stream is not None and not super().is_current): + self.clear_cache() + + if self._cached_stream is None: + assert self.source is not None, ( + "Stream source must be set to recompute the stream." 
+ ) + self._cached_stream = self.source.forward(*self.upstreams) + self._set_modified_time() + updated = True + + if self._cached_stream is None: + # TODO: use beter error type + raise ValueError( + "Stream could not be updated. Ensure that the source is valid and upstreams are correct." + ) + + return updated + + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) + + @property + def last_modified(self) -> datetime | None: + if self._cached_stream is None: + return None + return self._cached_stream.last_modified + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + yield from self._cached_stream.iter_packets() + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 0a371f5..0ba9bf8 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -100,7 +100,10 @@ def wrap_invocation( node = PodNode( pod=pod, input_streams=new_input_streams, + result_store=self.results_store, + record_path_prefix=self.results_store_path_prefix, pipeline_store=self.pipeline_store, + pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) else: @@ -108,6 +111,7 @@ def wrap_invocation( kernel=invocation.kernel, input_streams=new_input_streams, pipeline_store=self.pipeline_store, + pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) return node diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 968d70e..cd21645 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -320,6 +320,46 @@ def as_table( """ ... + # TODO: add this back + # def as_arrow_compatible_dict( + # self, + # include_all_info: bool = False, + # include_meta_columns: bool | Collection[str] = False, + # include_context: bool = False, + # ) -> dict[str, Any]: + # """ + # Return dictionary with values optimized for Arrow table conversion. + + # This method returns a dictionary where values are in a form that can be + # efficiently converted to Arrow format using pa.Table.from_pylist(). + + # The key insight is that this avoids the expensive as_table() → concat pattern + # by providing values that are "Arrow-ready" while remaining in dict format + # for efficient batching. + + # Implementation note: This may involve format conversions (e.g., Path objects + # to strings, datetime objects to ISO strings, etc.) to ensure compatibility + # with Arrow's expected input formats. + + # Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied + # with arrow_schema(...) with the same argument options to ensure that the schema matches the table. 
+ + # Args: + # include_all_info: Include all available information + # include_meta_columns: Controls meta column inclusion + # include_context: Whether to include context key + + # Returns: + # Dictionary with values optimized for Arrow conversion + + # Example: + # # Efficient batch conversion pattern + # arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams] + # schema = datagrams[0].arrow_schema() + # table = pa.Table.from_pylist(arrow_dicts, schema=schema) + # """ + # ... + # 5. Meta Column Operations def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: """ @@ -789,6 +829,17 @@ def as_table( """ ... + # TODO: add this back + # def as_arrow_compatible_dict( + # self, + # include_all_info: bool = False, + # include_meta_columns: bool | Collection[str] = False, + # include_context: bool = False, + # include_source: bool = False, + # ) -> dict[str, Any]: + # """Extended version with source info support.""" + # ... + def as_datagram( self, include_all_info: bool = False, @@ -1034,6 +1085,15 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: Provides a more explicit method name when the intent is to iterate over packets specifically, improving code readability. + This method must return an immutable iterator -- that is, the returned iterator + should not change and must consistently return identical tag,packet pairs across + multiple iterations of the iterator. + + Note that this is NOT to mean that multiple invocation of `iter_packets` must always + return an identical iterator. The iterator returned by `iter_packets` may change + between invocations, but the iterator itself must not change. Consequently, it should be understood + that the returned iterators may be a burden on memory if the stream is large or infinite. + Yields: tuple[Tag, Packet]: Sequential (tag, packet) pairs """ @@ -1061,6 +1121,19 @@ def as_table( """ ... + def flow(self) -> Collection[tuple[Tag, Packet]]: + """ + Return the entire stream as a collection of (tag, packet) pairs. + + This method materializes the stream content into a list or similar + collection type. It is useful for small streams or when you need + to process all data at once. + + Returns: + Collection[tuple[Tag, Packet]]: All (tag, packet) pairs in the stream + """ + ... + class LiveStream(Stream, Protocol): """ @@ -1293,7 +1366,7 @@ def validate_inputs(self, *streams: Stream) -> None: """ ... - def identity_structure(self, *streams: Stream) -> Any: + def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: """ Generate a unique identity structure for this kernel and/or kernel invocation. When invoked without streams, it should return a structure @@ -1307,7 +1380,9 @@ def identity_structure(self, *streams: Stream) -> Any: - Tracking kernel invocations in computational graphs Args: - *streams: Optional input streams for this invocation + streams: Optional input streams for this invocation. If None, identity_structure is + based solely on the kernel. If streams are provided, they are included in the identity + to differentiate between different invocations of the same kernel. 
Returns: Any: Unique identity structure (e.g., tuple of class name and stream identities) diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 8dc6a1d..213ea3e 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -220,7 +220,7 @@ def _validate_record_path(self, record_path: tuple[str, ...]) -> None: unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] if any(char in component for char in unsafe_chars): raise ValueError( - f"Source path component contains invalid characters: {repr(component)}" + f"Source path {record_path} component {component} contains invalid characters: {repr(component)}" ) def _get_source_key(self, record_path: tuple[str, ...]) -> str: diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py index 889d8a2..047ad2c 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -63,6 +63,23 @@ def from_python_to_arrow( arrow_data[field] = [value] return pa.Table.from_pydict(arrow_data, schema=arrow_schema) + def from_arrow_to_arrow_compat_dict( + self, arrow_data: pa.Table + ) -> list[dict[str, Any]]: + """Convert Arrow data to a dictionary of Python values""" + return arrow_data.to_pylist() + + def from_python_to_arrow_compat_dict( + self, python_data: Mapping[str, Any] + ) -> dict[str, Any]: + arrow_compat_dict = dict(python_data) + for field, converter in self._converter_lut.items(): + if field in python_data: + arrow_compat_dict[field] = converter.from_python_to_arrow( + python_data[field] + ) + return arrow_compat_dict + def from_arrow_to_python(self, arrow_data: pa.Table) -> list[dict[str, Any]]: """Convert a dictionary of Arrow arrays to Python values""" diff --git a/src/orcapod/types/semantic_types.py b/src/orcapod/types/semantic_types.py index 258617a..c0eaef2 100644 --- a/src/orcapod/types/semantic_types.py +++ b/src/orcapod/types/semantic_types.py @@ -59,6 +59,16 @@ def to_canonical(self, value: pa.Array) -> list[T]: """Convert from Arrow representation to canonical form""" pass + # @abstractmethod + # def from_canonical_to_arrow_compatible(self, value: T) -> Any: + # """Convert from canonical to Arrow-compatible representation""" + # pass + + # @abstractmethod + # def from_arrow_compatible_to_canonical(self, value: Any) -> T: + # """Convert from Arrow-compatible representation to canonical form""" + # pass + @abstractmethod def from_canonical(self, value: T | Collection[T]) -> pa.Array: """Convert from canonical to Arrow representation""" @@ -145,6 +155,12 @@ def from_canonical( value = [value] return pa.array([v.path_str for v in value], type=pa.large_string()) + def from_canonical_to_arrow_compatible(self, value: CanonicalPath) -> str: + return value.path_str + + def from_arrow_compatible_to_canonical(self, value: str) -> CanonicalPath: + return CanonicalPath(path_str=value, is_absolute=Path(value).is_absolute()) + def can_handle(self, arrow_type: pa.DataType) -> bool: return arrow_type == pa.large_string() From 971aed064e86c41ec0f97499b77d5d099ffc06b5 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 25 Jul 2025 01:51:55 +0000 Subject: [PATCH 092/224] feat: add ability to change source info --- .../data/datagrams/arrow_tag_packet.py | 27 +++++++++++++++++ src/orcapod/data/datagrams/dict_datagram.py | 6 ++++ src/orcapod/data/datagrams/dict_tag_packet.py | 29 +++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index 503b83e..976a392 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -256,6 +256,31 @@ def source_info(self) -> dict[str, str | None]: } return self._cached_source_info.copy() + def with_source_info(self, **source_info: str | None) -> Self: + """ + Create a copy of the packet with updated source information. + + Args: + source_info: New source information mapping + + Returns: + New ArrowPacket instance with updated source info + """ + new_packet = self.copy(include_cache=False) + + existing_source_info_with_prefix = self._source_info_table.to_pylist()[0] + for key, value in source_info.items(): + if not key.startswith(constants.SOURCE_PREFIX): + # Ensure the key is prefixed correctly + key = f"{constants.SOURCE_PREFIX}{key}" + if key in existing_source_info_with_prefix: + existing_source_info_with_prefix[key] = value + + new_packet._source_info_table = pa.Table.from_pylist( + [existing_source_info_with_prefix] + ) + return new_packet + # 8. Utility Operations def copy(self, include_cache: bool = True) -> Self: """Return a copy of the datagram.""" @@ -263,5 +288,7 @@ def copy(self, include_cache: bool = True) -> Self: if include_cache: new_packet._cached_source_info = self._cached_source_info + else: + new_packet._cached_source_info = None return new_packet diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 9f7088f..6cacb0c 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -798,6 +798,12 @@ def copy(self, include_cache: bool = True) -> Self: new_datagram._cached_content_hash = self._cached_content_hash new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + else: + new_datagram._cached_data_table = None + new_datagram._cached_meta_table = None + new_datagram._cached_content_hash = None + new_datagram._cached_data_arrow_schema = None + new_datagram._cached_meta_arrow_schema = None return new_datagram diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index ea9b7fa..a45a22c 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -251,10 +251,39 @@ def source_info(self) -> dict[str, str | None]: """ return {key: self._source_info.get(key, None) for key in self.keys()} + def with_source_info(self, **source_info: str | None) -> Self: + """ + Create a new packet with updated source information. 
+ + Args: + **kwargs: Key-value pairs to update source information + + Returns: + New DictPacket instance with updated source info + """ + current_source_info = self._source_info.copy() + + for key, value in source_info.items(): + if not key.startswith(constants.SOURCE_PREFIX): + key = f"{constants.SOURCE_PREFIX}{key}" + if key in current_source_info: + current_source_info[key] = value + + new_packet = self.copy(include_cache=False) + new_packet._source_info = current_source_info + + return new_packet + def copy(self, include_cache: bool = True) -> Self: """Return a shallow copy of the packet.""" instance = super().copy(include_cache=include_cache) instance._source_info = self._source_info.copy() if include_cache: instance._cached_source_info_table = self._cached_source_info_table + instance._cached_source_info_schema = self._cached_source_info_schema + + else: + instance._cached_source_info_table = None + instance._cached_source_info_schema = None + return instance From 532da7d4f38e0046fcf71091d2d655aef1033747 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 25 Jul 2025 01:52:52 +0000 Subject: [PATCH 093/224] feat: add saving table with its own id column --- src/orcapod/protocols/data_protocols.py | 24 +++ src/orcapod/stores/delta_lake_stores.py | 225 ++++++++++++++++++++++-- 2 files changed, 230 insertions(+), 19 deletions(-) diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index cd21645..c262fb6 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -877,6 +877,30 @@ def source_info(self) -> dict[str, str | None]: """ ... + def with_source_info( + self, + **source_info: str | None, + ) -> Self: + """ + Create new packet with updated source information. + + Adds or updates source metadata for the packet. This is useful for + tracking data provenance and lineage through the computational graph. + + Args: + **source_info: Source metadata as keyword arguments. + + Returns: + New packet instance with updated source information. + + Example: + >>> updated_packet = packet.with_source_info( + ... file_path="/new/path/to/file.txt", + ... source_id="source_123" + ... ) + """ + ... 
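+
+    # Illustrative note (editorial, not part of the original change): the concrete
+    # implementations added earlier in this patch series (DictPacket, ArrowPacket)
+    # accept keys with or without the source prefix, only update entries that
+    # already exist, and return a copy, leaving the original packet intact:
+    #
+    #   updated = packet.with_source_info(file_path="/new/path/to/file.txt")
+    #   # `packet` itself is unchanged; `updated` carries the new source info.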
+ class PodFunction(Protocol): """ diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 213ea3e..8490713 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -1,13 +1,20 @@ -import pyarrow as pa -import polars as pl from pathlib import Path -from typing import Any +from typing import Any, TYPE_CHECKING import logging from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError from collections import defaultdict from orcapod.data import constants +from orcapod.utils.lazy_module import LazyModule +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc + import polars as pl +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + pc = LazyModule("pyarrow.compute") # Module-level logger logger = logging.getLogger(__name__) @@ -270,8 +277,8 @@ def _get_existing_delta_table( return None def _ensure_record_id_column( - self, arrow_data: pa.Table, record_id: str - ) -> pa.Table: + self, arrow_data: "pa.Table", record_id: str + ) -> "pa.Table": """Ensure the table has an record id column.""" if self.RECORD_ID_COLUMN not in arrow_data.column_names: # Add record_id column at the beginning @@ -279,7 +286,7 @@ def _ensure_record_id_column( arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) return arrow_data - def _remove_record_id_column(self, arrow_data: pa.Table) -> pa.Table: + def _remove_record_id_column(self, arrow_data: "pa.Table") -> "pa.Table": """Remove the record id column if it exists.""" if self.RECORD_ID_COLUMN in arrow_data.column_names: column_names = arrow_data.column_names @@ -292,8 +299,8 @@ def _remove_record_id_column(self, arrow_data: pa.Table) -> pa.Table: return arrow_data def _handle_record_id_column( - self, arrow_data: pa.Table, record_id_column: str | None = None - ) -> pa.Table: + self, arrow_data: "pa.Table", record_id_column: str | None = None + ) -> "pa.Table": """ Handle record_id column based on add_record_id_column parameter. @@ -347,7 +354,7 @@ def _read_table_with_filter( self, delta_table: DeltaTable, filters: list | None = None, - ) -> pa.Table: + ) -> "pa.Table": """ Read table using to_pyarrow_dataset with original schema preservation. @@ -393,11 +400,11 @@ def add_record( self, record_path: tuple[str, ...], record_id: str, - data: pa.Table, + data: "pa.Table", ignore_duplicates: bool | None = None, overwrite_existing: bool = False, force_flush: bool = False, - ) -> pa.Table: + ) -> "pa.Table": self._validate_record_path(record_path) source_key = self._get_source_key(record_path) @@ -470,23 +477,203 @@ def add_record( def add_records( self, record_path: tuple[str, ...], - records: pa.Table, + records: "pa.Table", record_id_column: str | None = None, ignore_duplicates: bool | None = None, overwrite_existing: bool = False, + force_flush: bool = False, ) -> list[str]: - raise NotImplementedError( - "add_records is not implemented in BasicDeltaTableArrowStore yet. " - "Use add_record for single record insertion." + """ + Add multiple records to the Delta table, using one column as record_id. 
+ + Args: + record_path: Path tuple identifying the table location + records: PyArrow table containing the records to add + record_id_column: Column name to use as record_id (defaults to first column) + ignore_duplicates: Whether to ignore duplicate entries + overwrite_existing: Whether to overwrite existing records with same ID + force_flush: Whether to write immediately instead of batching + + Returns: + List of record IDs that were added + """ + self._validate_record_path(record_path) + source_key = self._get_source_key(record_path) + + # Determine record_id column + if record_id_column is None: + record_id_column = records.column_names[0] + + # Validate that the record_id column exists + if record_id_column not in records.column_names: + raise ValueError( + f"Record ID column '{record_id_column}' not found in table. " + f"Available columns: {records.column_names}" + ) + + # Rename the record_id column to the standard name + column_mapping = {record_id_column: self.RECORD_ID_COLUMN} + records_renamed = records.rename_columns( + [column_mapping.get(col, col) for col in records.column_names] ) + # Get unique record IDs from the data + record_ids_array = records_renamed[self.RECORD_ID_COLUMN] + unique_record_ids = pc.unique(record_ids_array).to_pylist() + + # Set default behavior for duplicates + if ignore_duplicates is None: + ignore_duplicates = self.duplicate_entry_behavior != "error" + + added_record_ids = [] + + # Check for duplicates if needed + if not ignore_duplicates: + # Check pending batches + pending_duplicates = [] + for record_id in unique_record_ids: + if record_id in self._pending_batches[source_key]: + pending_duplicates.append(record_id) + + if pending_duplicates: + raise ValueError( + f"Records {pending_duplicates} already exist in pending batch for {source_key}. " + f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." + ) + + # Check existing table + existing_duplicates = [] + try: + for record_id in unique_record_ids: + existing_record = self.get_record_by_id( + record_path, str(record_id), flush=False + ) + if existing_record is not None: + existing_duplicates.append(record_id) + except Exception as e: + logger.debug(f"Error checking existing records: {e}") + + if existing_duplicates: + raise ValueError( + f"Records {existing_duplicates} already exist in {'/'.join(record_path)}. " + f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." 
+ ) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table + write_deltalake(str(table_path), records_renamed, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + added_record_ids = unique_record_ids + else: + # Handle existing table + if self.duplicate_entry_behavior == "overwrite" or overwrite_existing: + # Delete existing records with matching IDs + try: + # Create SQL condition for multiple record IDs + escaped_ids = [ + str(rid).replace("'", "''") for rid in unique_record_ids + ] + id_list = "', '".join(escaped_ids) + delete_condition = f"{self.RECORD_ID_COLUMN} IN ('{id_list}')" + + delta_table.delete(delete_condition) + logger.debug( + f"Deleted existing records {unique_record_ids} from {source_key}" + ) + except Exception as e: + logger.debug(f"No existing records to delete: {e}") + + # Filter out duplicates if not overwriting + if not ( + self.duplicate_entry_behavior == "overwrite" or overwrite_existing + ): + # Get existing record IDs + try: + existing_table = delta_table.to_pyarrow_table() + if len(existing_table) > 0: + existing_ids = pc.unique( + existing_table[self.RECORD_ID_COLUMN] + ) + + # Filter out records that already exist + mask = pc.invert( + pc.is_in( + records_renamed[self.RECORD_ID_COLUMN], existing_ids + ) + ) + records_renamed = pc.filter(records_renamed, mask) + + # Update the list of record IDs that will actually be added + if len(records_renamed) > 0: + added_record_ids = pc.unique( + records_renamed[self.RECORD_ID_COLUMN] + ).to_pylist() + else: + added_record_ids = [] + else: + added_record_ids = unique_record_ids + except Exception as e: + logger.debug(f"Error filtering duplicates: {e}") + added_record_ids = unique_record_ids + else: + added_record_ids = unique_record_ids + + # Append the (possibly filtered) records + if len(records_renamed) > 0: + write_deltalake( + table_path, + records_renamed, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + else: + # Add to batches for later flushing + # Group records by record_id for individual batch entries + for record_id in unique_record_ids: + # Filter records for this specific record_id + mask = pc.equal(records_renamed[self.RECORD_ID_COLUMN], record_id) + single_record = pc.filter(records_renamed, mask) + + # Add to pending batch (will overwrite if duplicate_entry_behavior allows) + if ( + self.duplicate_entry_behavior == "overwrite" + or overwrite_existing + or record_id not in self._pending_batches[source_key] + ): + self._pending_batches[source_key][str(record_id)] = single_record + added_record_ids.append(record_id) + elif ignore_duplicates: + logger.debug(f"Ignoring duplicate record {record_id}") + else: + # This should have been caught earlier, but just in case + logger.warning(f"Skipping duplicate record {record_id}") + + # Check if we need to flush + batch_size = len(self._pending_batches[source_key]) + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added {len(added_record_ids)} records to {source_key}") + return [str(rid) for rid in added_record_ids] + def get_record_by_id( self, record_path: tuple[str, ...], record_id: str, record_id_column: str | None = None, flush: bool = False, - ) -> pa.Table | None: + ) -> "pa.Table | None": """ Get a specific record by 
record_id with schema preservation. @@ -537,7 +724,7 @@ def get_all_records( record_id_column: str | None = None, retrieve_pending: bool = True, flush: bool = False, - ) -> pa.Table | None: + ) -> "pa.Table | None": """ Retrieve all records for a given source path as a single table with schema preservation. @@ -588,10 +775,10 @@ def get_all_records( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: list[str] | pl.Series | pa.Array, + record_ids: "list[str] | pl.Series | pa.Array", record_id_column: str | None = None, flush: bool = False, - ) -> pa.Table | None: + ) -> "pa.Table | None": """ Retrieve records by entry IDs as a single table with schema preservation. From 1af2cf439830d1905b9f4fed51c9f04997e5aad6 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 25 Jul 2025 01:54:28 +0000 Subject: [PATCH 094/224] feat: add refined kernel id logic --- src/orcapod/data/pods.py | 20 +++- src/orcapod/data/trackers.py | 4 + src/orcapod/pipeline/nodes.py | 206 +++++++++++++++++++++++++++++----- 3 files changed, 195 insertions(+), 35 deletions(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index a66cfc6..06aea7b 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -238,12 +238,16 @@ def __init__( semantic_type_registry=self.data_context.semantic_type_registry ) ) - self._function_info_extractor = function_info_extractor + # now compute hash for the self and store that info + self._pod_hash = self.data_context.object_hasher.hash_to_hex( + self, prefix_hasher_id=True + ) + @property def kernel_id(self) -> tuple[str, ...]: - return (self.function_name,) + return (self.function_name, self._pod_hash) def input_packet_types(self) -> PythonSchema: """ @@ -300,7 +304,10 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) output_data = {k: v for k, v in zip(self.output_keys, output_values)} - source_info = {k: ":".join(self.kernel_id + (k,)) for k in output_data} + source_info = { + k: ":".join(self.kernel_id + (packet.content_hash(), k)) + for k in output_data + } output_packet = DictPacket( {k: v for k, v in zip(self.output_keys, output_values)}, @@ -396,7 +403,8 @@ def validate_inputs(self, *streams: dp.Stream) -> None: self.pod.validate_inputs(*streams) def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: - return self.pod.call(tag, packet) + output_tag, output_packet = self.pod.call(tag, packet) + return output_tag, output_packet def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -442,7 +450,7 @@ def record_path(self) -> tuple[str, ...]: Return the path to the record in the result store. This is used to store the results of the pod. 
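+        The pod hash is now carried by kernel_id itself (updated earlier in this
+        diff), so, with illustrative values, a prefix of ("results",) and a
+        kernel_id of ("my_func", "<pod_hash>") yield
+        ("results", "my_func", "<pod_hash>") rather than appending the hash as a
+        separate trailing element.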
""" - return self.record_path_prefix + self.kernel_id + (self.pod_hash,) + return self.record_path_prefix + self.kernel_id def call( self, @@ -456,7 +464,7 @@ def call( if not skip_record_check: output_packet = self.get_recorded_output_packet(packet) if output_packet is None: - tag, output_packet = self.pod.call(tag, packet) + tag, output_packet = super().call(tag, packet) if output_packet is not None and not skip_recording: self.record_packet( packet, output_packet, overwrite_existing=overwrite_existing diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 799334e..70e27d9 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -129,6 +129,10 @@ def __init__(self, stream: dp.Stream, label: str | None = None) -> None: self.label = label or stream.label self.stream = stream + @property + def kernel_id(self) -> tuple[str, ...]: + return (self.stream.__class__.__name__,) + def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: """ Forward the stream through the stub kernel. diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 7372175..0a4cfd7 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,3 +1,4 @@ +from abc import abstractmethod from ast import Not from collections.abc import Collection, Iterator from datetime import datetime @@ -13,8 +14,12 @@ if TYPE_CHECKING: import pyarrow as pa + import polars as pl + import pandas as pd else: pa = LazyModule("pyarrow") + pl = LazyModule("polars") + pd = LazyModule("pandas") class Node( @@ -42,6 +47,31 @@ def __init__( self.identity_structure(()), prefix_hasher_id=True ) + @property + def contained_kernel(self) -> dp.Kernel: + raise NotImplementedError( + "This property should be implemented by subclasses to return the contained kernel." + ) + + @property + def tag_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + tag_keys, _ = self.keys() + return tag_keys + + @property + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + # TODO: consider caching this + _, packet_keys = self.keys() + return packet_keys + @property def pipeline_path(self) -> tuple[str, ...]: """ @@ -111,6 +141,62 @@ def as_table( def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: return self().flow() + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + """ + Return the identity structure of the node. + This is used to compute the invocation hash. + """ + # construct identity structure from the node's information and the + # contained kernel + if streams is not None and len(streams) > 0: + raise NotImplementedError( + "At this moment, Node does not yet support handling additional input streams." + ) + return self.contained_kernel.identity_structure(self.input_streams) + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all records associated with the node. + If include_system_columns is True, system columns will be included in the result. 
+ """ + raise NotImplementedError("This method should be implemented by subclasses.") + + @property + def lazy(self) -> "pl.LazyFrame | None": + records = self.get_all_records(include_system_columns=False) + if records is not None: + return pl.LazyFrame(records) + return None + + @property + def df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + lazy_df = self.lazy + if lazy_df is not None: + return lazy_df.collect() + return None + + @property + def polars_df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + return self.df + + @property + def pandas_df(self) -> "pd.DataFrame | None": + """ + Return the pandas DataFrame representation of the pod's records. + """ + records = self.get_all_records(include_system_columns=False) + if records is not None: + return records.to_pandas() + return None + class KernelNode(Node, WrappedKernel): """ @@ -134,6 +220,10 @@ def __init__( **kwargs, ) + @property + def contained_kernel(self) -> dp.Kernel: + return self.kernel + def __repr__(self): return f"KernelNode(kernel={self.kernel!r})" @@ -153,6 +243,74 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An ) return self.kernel.identity_structure(self.input_streams) + def forward(self, *streams: dp.Stream) -> dp.Stream: + output_stream = super().forward(*streams) + + self.record_pipeline_output(output_stream) + return output_stream + + def record_pipeline_output(self, output_stream: dp.Stream) -> None: + key_column_name = "_record_hash" + output_table = output_stream.as_table( + include_data_context=True, + include_source=True, + include_content_hash=key_column_name, + ) + self.pipeline_store.add_records( + self.pipeline_path, + output_table, + record_id_column=key_column_name, + ignore_duplicates=True, + ) + + +def add_pipeline_record( + self, tag: dp.Tag, input_packet: dp.Packet, retrieved: bool | None = None +) -> None: + # combine dp.Tag with packet content hash to compute entry hash + tag_with_hash = tag.as_table().append_column( + self.PACKET_HASH_COLUMN, + pa.array([input_packet.content_hash()], type=pa.large_string()), + ) + entry_id = self.data_context.arrow_hasher.hash_table( + tag_with_hash, prefix_hasher_id=True + ) + + existing_record = self.pipeline_store.get_record_by_id( + self.pipeline_path, + entry_id, + ) + + if existing_record is not None: + # if the record already exists, return it + return + + # no record matching, so construct the full record + + input_packet_info = ( + input_packet.as_table( + include_source=True, + ) + .append_column( + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + .append_column( + self.DATA_RETRIEVED_FLAG, + pa.array([retrieved], type=pa.bool_()), + ) + .drop(input_packet.keys()) + ) + + combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) + + self.pipeline_store.add_record( + self.pipeline_path, + entry_id, + combined_record, + ignore_duplicates=False, + ) + class PodNode(Node, CachedPod): def __init__( @@ -175,7 +333,10 @@ def __init__( **kwargs, ) self.pipeline_store = pipeline_store - # self.input_streams = tuple(input_streams) + + @property + def contained_kernel(self) -> dp.Kernel: + return self.pod def __repr__(self): return f"PodNode(pod={self.pod!r})" @@ -253,7 +414,9 @@ def add_pipeline_record( ignore_duplicates=False, ) - def _get_all_records(self) -> "pa.Table | None": + def get_all_records( + 
self, include_system_columns: bool = False + ) -> "pa.Table | None": results = self.result_store.get_all_records( self.record_path, record_id_column=self.PACKET_HASH_COLUMN ) @@ -263,42 +426,27 @@ def _get_all_records(self) -> "pa.Table | None": "Pipeline store is not configured, cannot retrieve tag info" ) taginfo = self.pipeline_store.get_all_records( - self.record_path, + self.pipeline_path, ) if results is None or taginfo is None: return None - tag_columns = [ - c - for c in taginfo.column_names - if not c.startswith(constants.META_PREFIX) - and not c.startswith(constants.SOURCE_PREFIX) - ] - - packet_columns = [ - c for c in results.column_names if c != self.PACKET_HASH_COLUMN - ] - # TODO: do not hardcode the join keys joined_info = taginfo.join( results, self.PACKET_HASH_COLUMN, join_type="inner", ) - - joined_info = joined_info.select([*tag_columns, *packet_columns]) + tag_keys, packet_keys = self.keys() + + if not include_system_columns: + system_columns = [ + c + for c in joined_info.column_names + if c.startswith(constants.META_PREFIX) + or c.startswith(constants.CONTEXT_KEY) + or c.startswith(constants.SOURCE_PREFIX) + ] + joined_info = joined_info.drop(system_columns) return joined_info - - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - """ - Return the identity structure of the node. - This is used to compute the invocation hash. - """ - # construct identity structure from the node's information and the - # contained kernel - if streams is not None and len(streams) > 0: - raise NotImplementedError( - "At this moment, Node does not yet support handling additional input streams." - ) - return self.pod.identity_structure(self.input_streams) From 61e170b579e4e1e882d0ee8af19285b1c471f7aa Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 5 Jul 2025 05:36:32 +0000 Subject: [PATCH 095/224] doc: reorganize tutorials --- .../01_orcapod_core_concepts copy.ipynb | 0 .../02_orcapod_basic_usage copy.ipynb | 0 .../02_orcapod_basic_usage.ipynb | 0 .../03_orcacapod_qol_features.ipynb | 0 .../04_orcapod_tracker.ipynb | 0 .../05_orcabridge_dj_integration.ipynb | 0 .../01_quick_dive_into_orcapod.ipynb | 351 ++++++++++-------- 7 files changed, 203 insertions(+), 148 deletions(-) rename notebooks/{ => old_tutorials}/01_orcapod_core_concepts copy.ipynb (100%) rename notebooks/{ => old_tutorials}/02_orcapod_basic_usage copy.ipynb (100%) rename notebooks/{ => old_tutorials}/02_orcapod_basic_usage.ipynb (100%) rename notebooks/{ => old_tutorials}/03_orcacapod_qol_features.ipynb (100%) rename notebooks/{ => old_tutorials}/04_orcapod_tracker.ipynb (100%) rename notebooks/{ => old_tutorials}/05_orcabridge_dj_integration.ipynb (100%) diff --git a/notebooks/01_orcapod_core_concepts copy.ipynb b/notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb similarity index 100% rename from notebooks/01_orcapod_core_concepts copy.ipynb rename to notebooks/old_tutorials/01_orcapod_core_concepts copy.ipynb diff --git a/notebooks/02_orcapod_basic_usage copy.ipynb b/notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb similarity index 100% rename from notebooks/02_orcapod_basic_usage copy.ipynb rename to notebooks/old_tutorials/02_orcapod_basic_usage copy.ipynb diff --git a/notebooks/02_orcapod_basic_usage.ipynb b/notebooks/old_tutorials/02_orcapod_basic_usage.ipynb similarity index 100% rename from notebooks/02_orcapod_basic_usage.ipynb rename to notebooks/old_tutorials/02_orcapod_basic_usage.ipynb diff --git a/notebooks/03_orcacapod_qol_features.ipynb b/notebooks/old_tutorials/03_orcacapod_qol_features.ipynb similarity index 100% rename from notebooks/03_orcacapod_qol_features.ipynb rename to notebooks/old_tutorials/03_orcacapod_qol_features.ipynb diff --git a/notebooks/04_orcapod_tracker.ipynb b/notebooks/old_tutorials/04_orcapod_tracker.ipynb similarity index 100% rename from notebooks/04_orcapod_tracker.ipynb rename to notebooks/old_tutorials/04_orcapod_tracker.ipynb diff --git a/notebooks/05_orcabridge_dj_integration.ipynb b/notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb similarity index 100% rename from notebooks/05_orcabridge_dj_integration.ipynb rename to notebooks/old_tutorials/05_orcabridge_dj_integration.ipynb diff --git a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb index 2f99783..b09f745 100644 --- a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb +++ b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "27cdd37d", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "e776b8dc", + "id": "9cd4692c", "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "78ab941b", "metadata": {}, "outputs": [ @@ -65,8 +65,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "c32596f5", + "execution_count": 3, + "id": "ef13511e", "metadata": {}, "outputs": [ { @@ -112,8 +112,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "8f5d5dbc", + "execution_count": 4, + "id": "f8781072", "metadata": {}, "outputs": [], "source": [ @@ -147,8 +147,8 @@ }, { "cell_type": "code", - "execution_count": 6, - 
"id": "c0a191b2", + "execution_count": 5, + "id": "7b8f8056", "metadata": {}, "outputs": [ { @@ -158,7 +158,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# this won't work, because it's expecting a stream as input\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# this won't work, because it's expecting a stream as input\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:60\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 58\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m streams:\n\u001b[32m 59\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, SyncStream):\n\u001b[32m---> \u001b[39m\u001b[32m60\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 61\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mExpected SyncStream, got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(stream).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m for stream \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstream\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 62\u001b[39m )\n\u001b[32m 63\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source):\n\u001b[32m 64\u001b[39m \u001b[38;5;66;03m# if the stream is a Source, instantiate it\u001b[39;00m\n\u001b[32m 65\u001b[39m stream = stream()\n", "\u001b[31mTypeError\u001b[39m: Expected SyncStream, got int for stream 5" ] @@ -172,7 +172,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88a9b698", + "id": "fba23537", "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ "11" ] }, - "execution_count": 7, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -193,8 +193,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "c8ad097f", + "execution_count": null, + "id": "e56ffa7d", "metadata": {}, "outputs": [], "source": [ @@ -213,24 +213,19 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "93c3f1a7", + "execution_count": 6, + "id": "4c9017c9", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'id': 0} {'total': 1}\n", - "{'id': 1} {'total': 3}\n", - "{'id': 2} {'total': 5}\n", - "{'id': 3} {'total': 7}\n", - "{'id': 4} {'total': 9}\n", - "{'id': 5} {'total': 11}\n", - "{'id': 6} {'total': 13}\n", - "{'id': 7} {'total': 15}\n", - "{'id': 8} {'total': 17}\n", - "{'id': 9} {'total': 19}\n" + "ename": "NameError", + "evalue": "name 'total_stream' is not defined", + "output_type": 
"error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtotal_stream\u001b[49m:\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(tag, packet)\n", + "\u001b[31mNameError\u001b[39m: name 'total_stream' is not defined" ] } ], @@ -241,28 +236,20 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "cfadfb8f", + "execution_count": 7, + "id": "59104716", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[({'id': 0}, {'total': 1}),\n", - " ({'id': 1}, {'total': 3}),\n", - " ({'id': 2}, {'total': 5}),\n", - " ({'id': 3}, {'total': 7}),\n", - " ({'id': 4}, {'total': 9}),\n", - " ({'id': 5}, {'total': 11}),\n", - " ({'id': 6}, {'total': 13}),\n", - " ({'id': 7}, {'total': 15}),\n", - " ({'id': 8}, {'total': 17}),\n", - " ({'id': 9}, {'total': 19})]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'total_stream' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mtotal_stream\u001b[49m.flow()\n", + "\u001b[31mNameError\u001b[39m: name 'total_stream' is not defined" + ] } ], "source": [ @@ -279,8 +266,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "2805282e", + "execution_count": 8, + "id": "77547b4d", "metadata": {}, "outputs": [ { @@ -297,7 +284,7 @@ "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m total_stream = \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_stream\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m total_stream = \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_stream\u001b[49m\u001b[43m)\u001b[49m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:75\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 69\u001b[39m normalized_streams = [\n\u001b[32m 70\u001b[39m stream() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source) \u001b[38;5;28;01melse\u001b[39;00m stream\n\u001b[32m 71\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m verified_streams\n\u001b[32m 72\u001b[39m ]\n\u001b[32m 74\u001b[39m pre_processed_streams = \u001b[38;5;28mself\u001b[39m.pre_forward_hook(*normalized_streams, **kwargs)\n\u001b[32m---> \u001b[39m\u001b[32m75\u001b[39m output_stream = 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43mpre_processed_streams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 76\u001b[39m post_processed_stream = \u001b[38;5;28mself\u001b[39m.post_forward_hook(output_stream, **kwargs)\n\u001b[32m 77\u001b[39m \u001b[38;5;66;03m# create an invocation instance\u001b[39;00m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:236\u001b[39m, in \u001b[36mFunctionPod.forward\u001b[39m\u001b[34m(self, *streams, **kwargs)\u001b[39m\n\u001b[32m 232\u001b[39m _, packet_typespec = stream.types(trigger_run=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 233\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m packet_typespec \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m check_typespec_compatibility(\n\u001b[32m 234\u001b[39m packet_typespec, \u001b[38;5;28mself\u001b[39m.function_input_typespec\n\u001b[32m 235\u001b[39m ):\n\u001b[32m--> \u001b[39m\u001b[32m236\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 237\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInput packet types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m is not compatible with the function\u001b[39m\u001b[33m'\u001b[39m\u001b[33ms expected input types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.function_input_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 238\u001b[39m )\n\u001b[32m 239\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m().forward(*streams, **kwargs)\n", "\u001b[31mTypeError\u001b[39m: Input packet types {'word1': , 'word2': } is not compatible with the function's expected input types {'x': , 'y': }" @@ -310,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "4c9c030a", "metadata": {}, "outputs": [ @@ -320,7 +307,29 @@ "({'id': int}, {'x': int, 'y': int})" ] }, - "execution_count": 11, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can check the tag and packet types of the stream\n", + "stream.types()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "34338baf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "({'id': int}, {'x': int, 'y': int})" + ] + }, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -348,18 +357,20 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "8083f54a", "metadata": {}, "outputs": [], "source": [ "# Use simple data store, saving data to Parquet files\n", - "pipeline_store = op.stores.SimpleParquetDataStore(\"./example_data_store\")" + "from orcapod.stores.delta_table_arrow_data_store import DeltaTableArrowDataStore\n", + "\n", + "pipeline_store = DeltaTableArrowDataStore(\"./delta_store\", batch_size=100)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "a475308c", "metadata": {}, "outputs": [], @@ -377,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "f923ecf1", "metadata": {}, "outputs": [], @@ -408,17 
+419,77 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, + "id": "64746ada", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "Memoizing single packet return 2 packets!", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[14]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/pipeline.py:217\u001b[39m, in \u001b[36mPipeline.run\u001b[39m\u001b[34m(self, full_sync)\u001b[39m\n\u001b[32m 215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m full_sync:\n\u001b[32m 216\u001b[39m node.reset_cache()\n\u001b[32m--> \u001b[39m\u001b[32m217\u001b[39m \u001b[43mnode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mflow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 219\u001b[39m \u001b[38;5;28mself\u001b[39m.flush()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:336\u001b[39m, in \u001b[36mStream.flow\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 331\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mflow\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Collection[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 332\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 333\u001b[39m \u001b[33;03m Flow everything through the stream, returning the entire collection of\u001b[39;00m\n\u001b[32m 334\u001b[39m \u001b[33;03m (Tag, Packet) as a collection. 
This will tigger any upstream computation of the stream.\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:590\u001b[39m, in \u001b[36mSource.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 586\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 587\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 588\u001b[39m \u001b[33;03m Simple iter method that allows for Source object to act as a stream.\u001b[39;00m\n\u001b[32m 589\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m590\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/streams.py:99\u001b[39m, in \u001b[36mSyncStreamFromGenerator.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.check_consistency:\n\u001b[32m---> \u001b[39m\u001b[32m99\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.generator_factory()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:107\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 105\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 108\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mwarn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 109\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:94\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m stream:\n\u001b[32m 93\u001b[39m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m tag, output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpacket\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 96\u001b[39m logger.debug(\n\u001b[32m 97\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCall returned None as output for tag \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtag\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. Skipping...\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 98\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:629\u001b[39m, in \u001b[36mCachedFunctionPodWrapper.call\u001b[39m\u001b[34m(self, tag, packet)\u001b[39m\n\u001b[32m 627\u001b[39m output_packet = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 628\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.skip_memoization_lookup:\n\u001b[32m--> \u001b[39m\u001b[32m629\u001b[39m output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_retrieve_memoized_with_packet_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacket_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 631\u001b[39m logger.debug(\n\u001b[32m 632\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoized output for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_key\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found, skipping computation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 633\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:573\u001b[39m, in \u001b[36mCachedFunctionPodWrapper._retrieve_memoized_with_packet_key\u001b[39m\u001b[34m(self, packet_key)\u001b[39m\n\u001b[32m 571\u001b[39m packets = \u001b[38;5;28mself\u001b[39m.output_converter.from_arrow_table_to_python_packets(arrow_table)\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# since memoizing single packet, it should only contain one packet\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m573\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(packets) == \u001b[32m1\u001b[39m, (\n\u001b[32m 574\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoizing single packet return \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(packets)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m packets!\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 575\u001b[39m )\n\u001b[32m 576\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m packets[\u001b[32m0\u001b[39m]\n", + "\u001b[31mAssertionError\u001b[39m: Memoizing single packet return 2 packets!" 
+ ] + } + ], + "source": [ + "pipeline.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "id": "66230603", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "FunctionPodNode>" + "FunctionPodNode>" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -429,17 +500,17 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "6587f2f2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "FunctionPodNode>" + "FunctionPodNode>" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -458,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "bd0dfba2", "metadata": {}, "outputs": [ @@ -468,7 +539,7 @@ "KernelNode" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -487,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "e22758ab", "metadata": {}, "outputs": [ @@ -495,15 +566,15 @@ "data": { "text/plain": [ "{'MySource': KernelNode,\n", - " 'total': FunctionPodNode>,\n", - " 'delta': FunctionPodNode>,\n", + " 'total': FunctionPodNode>,\n", + " 'delta': FunctionPodNode>,\n", " 'MapPackets_0': KernelNode,\n", " 'MapPackets_1': KernelNode,\n", " 'Join': KernelNode,\n", - " 'mult': FunctionPodNode>}" + " 'mult': FunctionPodNode>}" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -522,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "id": "0d1a470e", "metadata": {}, "outputs": [], @@ -533,7 +604,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "id": "3a43984d", "metadata": {}, "outputs": [ @@ -541,15 +612,15 @@ "data": { "text/plain": [ "{'MySource': KernelNode,\n", - " 'total': FunctionPodNode>,\n", - " 'delta': FunctionPodNode>,\n", + " 'total': FunctionPodNode>,\n", + " 'delta': FunctionPodNode>,\n", " 'Join': KernelNode,\n", - " 'mult': FunctionPodNode>,\n", + " 'mult': FunctionPodNode>,\n", " 'total_map': KernelNode,\n", " 'mult_map': KernelNode}" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -584,45 +655,16 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "id": "96106e09", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "shape: (10, 2)
 (id, total) HTML table preview; markup garbled during extraction and omitted here, the same values appear in the text/plain rendering below
" - ], - "text/plain": [ - "shape: (10, 2)\n", - "┌─────┬───────┐\n", - "│ id ┆ total │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═══════╡\n", - "│ 0 ┆ 1 │\n", - "│ 1 ┆ 3 │\n", - "│ 2 ┆ 5 │\n", - "│ 3 ┆ 7 │\n", - "│ 4 ┆ 9 │\n", - "│ 5 ┆ 11 │\n", - "│ 6 ┆ 13 │\n", - "│ 7 ┆ 15 │\n", - "│ 8 ┆ 17 │\n", - "│ 9 ┆ 19 │\n", - "└─────┴───────┘" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n" + ] } ], "source": [ @@ -639,45 +681,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "id": "33b449b6", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "shape: (10, 3)
 (id, x, y) HTML table preview; markup garbled during extraction and omitted here, the same values appear in the text/plain rendering below
" - ], - "text/plain": [ - "shape: (10, 3)\n", - "┌─────┬─────┬─────┐\n", - "│ id ┆ x ┆ y │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ i64 ┆ i64 │\n", - "╞═════╪═════╪═════╡\n", - "│ 0 ┆ 0 ┆ 1 │\n", - "│ 1 ┆ 1 ┆ 2 │\n", - "│ 2 ┆ 2 ┆ 3 │\n", - "│ 3 ┆ 3 ┆ 4 │\n", - "│ 4 ┆ 4 ┆ 5 │\n", - "│ 5 ┆ 5 ┆ 6 │\n", - "│ 6 ┆ 6 ┆ 7 │\n", - "│ 7 ┆ 7 ┆ 8 │\n", - "│ 8 ┆ 8 ┆ 9 │\n", - "│ 9 ┆ 9 ┆ 10 │\n", - "└─────┴─────┴─────┘" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n" + ] } ], "source": [ @@ -694,10 +707,52 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 18, "id": "189f943f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n", + "Flushing triggered!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "Memoizing single packet return 2 packets!", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/pipeline.py:217\u001b[39m, in \u001b[36mPipeline.run\u001b[39m\u001b[34m(self, full_sync)\u001b[39m\n\u001b[32m 215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m full_sync:\n\u001b[32m 216\u001b[39m node.reset_cache()\n\u001b[32m--> \u001b[39m\u001b[32m217\u001b[39m \u001b[43mnode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mflow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 219\u001b[39m \u001b[38;5;28mself\u001b[39m.flush()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:336\u001b[39m, in \u001b[36mStream.flow\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 331\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mflow\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Collection[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 332\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 333\u001b[39m \u001b[33;03m Flow everything through the stream, returning the entire collection of\u001b[39;00m\n\u001b[32m 334\u001b[39m \u001b[33;03m (Tag, Packet) as a collection. 
This will tigger any upstream computation of the stream.\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:590\u001b[39m, in \u001b[36mSource.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 586\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 587\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 588\u001b[39m \u001b[33;03m Simple iter method that allows for Source object to act as a stream.\u001b[39;00m\n\u001b[32m 589\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m590\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/streams.py:99\u001b[39m, in \u001b[36mSyncStreamFromGenerator.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.check_consistency:\n\u001b[32m---> \u001b[39m\u001b[32m99\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.generator_factory()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:107\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 105\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 108\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mwarn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 109\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:94\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m stream:\n\u001b[32m 93\u001b[39m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m tag, output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpacket\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 96\u001b[39m logger.debug(\n\u001b[32m 97\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCall returned None as output for tag \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtag\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. Skipping...\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 98\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:629\u001b[39m, in \u001b[36mCachedFunctionPodWrapper.call\u001b[39m\u001b[34m(self, tag, packet)\u001b[39m\n\u001b[32m 627\u001b[39m output_packet = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 628\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.skip_memoization_lookup:\n\u001b[32m--> \u001b[39m\u001b[32m629\u001b[39m output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_retrieve_memoized_with_packet_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacket_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 631\u001b[39m logger.debug(\n\u001b[32m 632\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoized output for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_key\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found, skipping computation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 633\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:573\u001b[39m, in \u001b[36mCachedFunctionPodWrapper._retrieve_memoized_with_packet_key\u001b[39m\u001b[34m(self, packet_key)\u001b[39m\n\u001b[32m 571\u001b[39m packets = \u001b[38;5;28mself\u001b[39m.output_converter.from_arrow_table_to_python_packets(arrow_table)\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# since memoizing single packet, it should only contain one packet\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m573\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(packets) == \u001b[32m1\u001b[39m, (\n\u001b[32m 574\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoizing single packet return \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(packets)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m packets!\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 575\u001b[39m )\n\u001b[32m 576\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m packets[\u001b[32m0\u001b[39m]\n", + "\u001b[31mAssertionError\u001b[39m: Memoizing single packet return 2 packets!" + ] + } + ], "source": [ "pipeline.run()" ] From 5958594dcf51edb3f841ea450a0d6275758c2a84 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 5 Jul 2025 05:37:58 +0000 Subject: [PATCH 096/224] feat: cleaned up delta store --- src/orcapod/pipeline/nodes.py | 22 +- .../stores/delta_table_arrow_data_store.py | 662 ++++++------------ 2 files changed, 239 insertions(+), 445 deletions(-) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 07d9eb4..405714f 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -196,13 +196,15 @@ def update_cached_values(self): def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: if self._cache_computed: logger.info(f"Returning cached outputs for {self}") - if self.df is not None: + if (lazy_df := self.get_all_records_as_polars(flush=False)) is not None: if self.tag_keys is None: raise ValueError( "CachedKernelWrapper has no tag keys defined, cannot return PolarsStream" ) return PolarsStream( - self.df, tag_keys=self.tag_keys, packet_keys=self.packet_keys + lazy_df.collect(), + tag_keys=self.tag_keys, + packet_keys=self.packet_keys, ) else: return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.packet_keys) @@ -235,7 +237,7 @@ def post_call(self, tag: Tag, packet: Packet) -> None: ) # TODO: revisit this logic output_id = self.arrow_hasher.hash_table(output_table, prefix_hasher_id=True) - if not self.output_store.get_record(self.store_path, output_id): + if not self.output_store.get_record(self.store_path, output_id, flush=False): self.output_store.add_record( self.store_path, output_id, @@ -249,6 +251,9 @@ def output_iterator_completion_hook(self) -> None: logger.info(f"Results cached for {self}") self._cache_computed = True + def get_all_records_as_polars(self, flush: bool = True) -> pl.LazyFrame | None: + return self.output_store.get_all_records_as_polars(self.store_path, flush=flush) + @property def lazy_df(self) -> pl.LazyFrame | None: lazydf = self.output_store.get_all_records_as_polars(self.store_path) @@ -542,7 +547,9 @@ def _add_pipeline_record_with_packet_key( # TODO: add error handling # check if record already exists: - retrieved_table = self.tag_store.get_record(self.tag_store_path, entry_hash) + retrieved_table = self.tag_store.get_record( + self.tag_store_path, entry_hash, flush=False + ) if retrieved_table is None: self.tag_store.add_record(self.tag_store_path, entry_hash, table) @@ -565,6 +572,7 @@ def _retrieve_memoized_with_packet_key(self, packet_key: str) -> Packet | None: arrow_table = self.output_store.get_record( self.output_store_path, packet_key, + flush=False, ) if arrow_table is None: return None @@ -626,7 +634,9 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: output_packet = None if not self.skip_memoization_lookup: - output_packet = self._retrieve_memoized_with_packet_key(packet_key) + output_packet = self._retrieve_memoized_with_packet_key( + packet_key, + ) if output_packet is not None: logger.debug( f"Memoized output for {packet} with {packet_key} found, skipping computation" @@ -658,7 +668,7 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: ) return tag, None - # result was successfully computed -- save the tag + # result was successfully computed/retrieved -- save the tag if not self.skip_tag_record and self.tag_store is not None: self._add_pipeline_record_with_packet_key( tag, packet_key, packet.source_info diff --git a/src/orcapod/stores/delta_table_arrow_data_store.py b/src/orcapod/stores/delta_table_arrow_data_store.py index e5ddfb9..56bbbfa 100644 --- a/src/orcapod/stores/delta_table_arrow_data_store.py +++ 
b/src/orcapod/stores/delta_table_arrow_data_store.py @@ -1,14 +1,14 @@ import pyarrow as pa import pyarrow.compute as pc +import pyarrow.dataset as ds import polars as pl from pathlib import Path -from typing import Any, Dict, List +from typing import Any import logging from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError -import threading from collections import defaultdict -import json + # Module-level logger logger = logging.getLogger(__name__) @@ -31,7 +31,6 @@ def __init__( create_base_path: bool = True, max_hierarchy_depth: int = 10, batch_size: int = 100, - auto_flush_interval: float = 300.0, # 5 minutes ): """ Initialize the DeltaTableArrowDataStore. @@ -54,7 +53,6 @@ def __init__( self.base_path = Path(base_path) self.max_hierarchy_depth = max_hierarchy_depth self.batch_size = batch_size - self.auto_flush_interval = auto_flush_interval if create_base_path: self.base_path.mkdir(parents=True, exist_ok=True) @@ -66,56 +64,125 @@ def __init__( # Cache for Delta tables to avoid repeated initialization self._delta_table_cache: dict[str, DeltaTable] = {} - # Cache for original schemas (without __entry_id column) - self._schema_cache: dict[str, pa.Schema] = {} - # Batch management - self._pending_batches: Dict[str, List[pa.Table]] = defaultdict(list) - self._batch_lock = threading.Lock() - - # Auto-flush timer - self._flush_timer = None - # if auto_flush_interval > 0: - # self._start_auto_flush_timer() + self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) logger.info( f"Initialized DeltaTableArrowDataStore at {self.base_path} " f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " - f"batch_size={batch_size}, auto_flush_interval={auto_flush_interval}s" + f"batch_size={batch_size}, as" ) - def _start_auto_flush_timer(self): - """Start the auto-flush timer.""" - if self._flush_timer: - self._flush_timer.cancel() - - if self.auto_flush_interval > 0: - self._flush_timer = threading.Timer( - self.auto_flush_interval, self._auto_flush - ) - self._flush_timer.daemon = True - self._flush_timer.start() + def flush(self) -> None: + """ + Flush all pending batches immediately. - def _auto_flush(self): - """Auto-flush all pending batches.""" + This method is called to ensure all pending data is written to the Delta tables. + """ try: - print("Flushing!", flush=True) self.flush_all_batches() except Exception as e: - logger.error(f"Error during auto-flush: {e}") - finally: - self._start_auto_flush_timer() + logger.error(f"Error during flush: {e}") + + def flush_batch(self, source_path: tuple[str, ...]) -> None: + """ + Flush pending batch for a specific source path. 
+ + Args: + source_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + source_key = self._get_source_key(source_path) + + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = {} + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() + + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_existing_delta_table(source_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + entry_ids = combined_table.column("__entry_id").to_pylist() + unique_entry_ids = list(set(entry_ids)) + + # Delete existing records with these IDs + if unique_entry_ids: + entry_ids_str = "', '".join(unique_entry_ids) + delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # otherwise, only insert if same entry_id does not exist yet + delta_table.merge( + source=combined_table, + predicate="target.__entry_id = source.__entry_id", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[source_key] = pending_tables + raise + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + source_keys = list(self._pending_batches.keys()) + + # TODO: capture and re-raise exceptions at the end + for source_key in source_keys: + source_path = tuple(source_key.split("/")) + try: + self.flush_batch(source_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") def __del__(self): """Cleanup when object is destroyed.""" - try: - if self._flush_timer: - self._flush_timer.cancel() - self.flush_all_batches() - except Exception: - pass # Ignore errors during cleanup + self.flush() def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly """ Validate source path components. 
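The hunks above replace the old timer-based auto-flush with explicit, size-triggered batching: add_record() buffers each incoming table in _pending_batches keyed by entry_id, and the buffer is only written to the Delta table when it reaches batch_size, when force_flush=True is passed, or when flush_batch()/flush_all_batches() is called. A minimal usage sketch of that flow, assuming the store is constructed as in the notebook earlier in this series; the source path tuple, entry id, and column names below are placeholders, not values from orcapod:

    import pyarrow as pa
    from orcapod.stores.delta_table_arrow_data_store import DeltaTableArrowDataStore

    store = DeltaTableArrowDataStore("./delta_store", batch_size=100)

    # Buffered write: the record stays in the in-memory batch until a flush.
    record = pa.table({"x": [1], "y": [2]})
    store.add_record(("example", "total"), entry_id="abc123", arrow_data=record)

    # Reads with flush=False can still see the pending entry straight from the buffer.
    assert store.get_record(("example", "total"), "abc123", flush=False) is not None

    # Write the buffered batch to the Delta table explicitly.
    store.flush_batch(("example", "total"))
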
@@ -154,174 +221,11 @@ def _get_source_key(self, source_path: tuple[str, ...]) -> str: def _get_table_path(self, source_path: tuple[str, ...]) -> Path: """Get the filesystem path for a given source path.""" path = self.base_path - for component in source_path: - path = path / component + for subpath in source_path: + path = path / subpath return path - def _get_schema_metadata_path(self, source_path: tuple[str, ...]) -> Path: - """Get the path for storing original schema metadata.""" - table_path = self._get_table_path(source_path) - return table_path / "_original_schema.json" - - def _save_original_schema( - self, source_path: tuple[str, ...], schema: pa.Schema - ) -> None: - """Save the original schema (without __entry_id) to metadata file.""" - source_key = self._get_source_key(source_path) - - # Cache the schema - self._schema_cache[source_key] = schema - - try: - # Save to file as well for persistence - schema_path = self._get_schema_metadata_path(source_path) - schema_path.parent.mkdir(parents=True, exist_ok=True) - - # Convert schema to JSON-serializable format - def convert_metadata(metadata): - """Convert Arrow metadata (bytes keys/values) to JSON-safe format.""" - if metadata is None: - return None - result = {} - for key, value in metadata.items(): - # Convert bytes keys and values to strings - str_key = ( - key.decode("utf-8") if isinstance(key, bytes) else str(key) - ) - str_value = ( - value.decode("utf-8") - if isinstance(value, bytes) - else str(value) - ) - result[str_key] = str_value - return result - - schema_dict = { - "fields": [ - { - "name": field.name, - "type": str(field.type), - "nullable": field.nullable, - "metadata": convert_metadata(field.metadata), - } - for field in schema - ], - "metadata": convert_metadata(schema.metadata), - } - - with open(schema_path, "w") as f: - json.dump(schema_dict, f, indent=2) - - except Exception as e: - logger.warning(f"Could not save schema metadata for {source_key}: {e}") - - def _load_original_schema(self, source_path: tuple[str, ...]) -> pa.Schema | None: - """Load the original schema from cache or metadata file.""" - source_key = self._get_source_key(source_path) - - # Check cache first - if source_key in self._schema_cache: - return self._schema_cache[source_key] - - # Try to load from file - try: - schema_path = self._get_schema_metadata_path(source_path) - if not schema_path.exists(): - return None - - with open(schema_path, "r") as f: - schema_dict = json.load(f) - - # Reconstruct schema from JSON - def convert_metadata_back(metadata_dict): - """Convert JSON metadata back to Arrow format (bytes keys/values).""" - if metadata_dict is None: - return None - result = {} - for key, value in metadata_dict.items(): - # Convert string keys and values back to bytes - bytes_key = key.encode("utf-8") - bytes_value = ( - value.encode("utf-8") - if isinstance(value, str) - else str(value).encode("utf-8") - ) - result[bytes_key] = bytes_value - return result - - fields = [] - for field_dict in schema_dict["fields"]: - # Parse the type string back to Arrow type - type_str = field_dict["type"] - arrow_type = self._parse_arrow_type_string(type_str) - - metadata = convert_metadata_back(field_dict.get("metadata")) - - field = pa.field( - field_dict["name"], - arrow_type, - nullable=field_dict["nullable"], - metadata=metadata, - ) - fields.append(field) - - schema_metadata = convert_metadata_back(schema_dict.get("metadata")) - - schema = pa.schema(fields, metadata=schema_metadata) - - # Cache it - self._schema_cache[source_key] = schema - 
return schema - - except Exception as e: - logger.warning(f"Could not load schema metadata for {source_key}: {e}") - return None - - def _parse_arrow_type_string(self, type_str: str) -> pa.DataType: - """Parse Arrow type string back to Arrow type object.""" - # This is a simplified parser for common types - # You might need to extend this for more complex types - type_str = type_str.strip() - - # Handle basic types - if type_str == "int64": - return pa.int64() - elif type_str == "int32": - return pa.int32() - elif type_str == "float64": - return pa.float64() - elif type_str == "float32": - return pa.float32() - elif type_str == "bool": - return pa.bool_() - elif type_str == "string": - return pa.string() - elif type_str == "large_string": - return pa.large_string() - elif type_str == "binary": - return pa.binary() - elif type_str == "large_binary": - return pa.large_binary() - elif type_str.startswith("timestamp"): - # Extract timezone if present - if "[" in type_str and "]" in type_str: - tz = type_str.split("[")[1].split("]")[0] - if tz == "UTC": - tz = "UTC" - return pa.timestamp("us", tz=tz) - else: - return pa.timestamp("us") - elif type_str.startswith("list<"): - # Parse list type - inner_type_str = type_str[5:-1] # Remove 'list<' and '>' - inner_type = self._parse_arrow_type_string(inner_type_str) - return pa.list_(inner_type) - else: - # Fallback to string for unknown types - logger.warning(f"Unknown Arrow type string: {type_str}, using string") - return pa.string() - - def _get_or_create_delta_table( + def _get_existing_delta_table( self, source_path: tuple[str, ...] ) -> DeltaTable | None: """ @@ -337,8 +241,8 @@ def _get_or_create_delta_table( table_path = self._get_table_path(source_path) # Check cache first - if source_key in self._delta_table_cache: - return self._delta_table_cache[source_key] + if dt := self._delta_table_cache.get(source_key): + return dt try: # Try to load existing table @@ -426,164 +330,57 @@ def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: """ return [("__entry_id", "in", entry_ids)] - def _read_table_with_schema_preservation( + def _read_table_with_filter( self, delta_table: DeltaTable, - source_path: tuple[str, ...], - filters: list = None, + filters: list | None = None, ) -> pa.Table: """ Read table using to_pyarrow_dataset with original schema preservation. 
Args: delta_table: The Delta table to read from - source_path: Source path for schema lookup filters: Optional filters to apply Returns: Arrow table with preserved schema """ - try: - # Get the original schema (without __entry_id) - original_schema = self._load_original_schema(source_path) - - if original_schema is not None: - # Create target schema with __entry_id column - entry_id_field = pa.field( - "__entry_id", pa.large_string(), nullable=False - ) - target_schema = pa.schema([entry_id_field] + list(original_schema)) - - # Use to_pyarrow_dataset with the target schema - dataset = delta_table.to_pyarrow_dataset(schema=target_schema) - if filters: - # Apply filters at dataset level for better performance - import pyarrow.compute as pc - - filter_expr = None - for filt in filters: - if len(filt) == 3: - col, op, val = filt - if op == "=": - expr = pc.equal(pc.field(col), pa.scalar(val)) - elif op == "in": - expr = pc.is_in(pc.field(col), pa.array(val)) - else: - # Fallback to table-level filtering - return delta_table.to_pyarrow_table(filters=filters) - - if filter_expr is None: - filter_expr = expr - else: - filter_expr = pc.and_(filter_expr, expr) - - if filter_expr is not None: - return dataset.to_table(filter=filter_expr) - - return dataset.to_table() - else: - # Fallback to regular method if no schema found - logger.warning( - f"No original schema found for {'/'.join(source_path)}, using fallback" - ) - return delta_table.to_pyarrow_table(filters=filters) - - except Exception as e: - logger.warning( - f"Error reading with schema preservation: {e}, falling back to regular method" - ) - return delta_table.to_pyarrow_table(filters=filters) - - def _flush_batch(self, source_path: tuple[str, ...]) -> None: - """ - Flush pending batch for a specific source path. 
- - Args: - source_path: Tuple of path components - """ - print("Flushing triggered!!", flush=True) - source_key = self._get_source_key(source_path) - - with self._batch_lock: - if ( - source_key not in self._pending_batches - or not self._pending_batches[source_key] - ): - return - - # Get all pending records - pending_tables = self._pending_batches[source_key] - self._pending_batches[source_key] = [] - - if not pending_tables: - return - - try: - # Combine all tables in the batch - combined_table = pa.concat_tables(pending_tables) - - table_path = self._get_table_path(source_path) - table_path.mkdir(parents=True, exist_ok=True) - - # Check if table exists - delta_table = self._get_or_create_delta_table(source_path) - - if delta_table is None: - # Create new table - save original schema first - original_schema = self._remove_entry_id_column(combined_table).schema - self._save_original_schema(source_path, original_schema) - - write_deltalake(str(table_path), combined_table, mode="overwrite") - logger.debug( - f"Created new Delta table for {source_key} with {len(combined_table)} records" - ) - else: - # Handle duplicates if needed - if self.duplicate_entry_behavior == "overwrite": - # Get entry IDs from the batch - entry_ids = combined_table.column("__entry_id").to_pylist() - unique_entry_ids = list(set(entry_ids)) - - # Delete existing records with these IDs - if unique_entry_ids: - entry_ids_str = "', '".join(unique_entry_ids) - delete_predicate = f"__entry_id IN ('{entry_ids_str}')" - try: - delta_table.delete(delete_predicate) - logger.debug( - f"Deleted {len(unique_entry_ids)} existing records from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing records to delete from {source_key}: {e}" - ) + # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading + dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + if filters: + # Apply filters at dataset level for better performance + import pyarrow.compute as pc + + filter_expr = None + for filt in filters: + if len(filt) == 3: + col, op, val = filt + if op == "=": + expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore + elif op == "in": + expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore + else: + logger.warning( + f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." 
+ ) + # Fallback to table-level filtering + return dataset.to_table()(filters=filters) - # Append new records - write_deltalake( - str(table_path), combined_table, mode="append", schema_mode="merge" - ) - logger.debug( - f"Appended batch of {len(combined_table)} records to {source_key}" - ) + if filter_expr is None: + filter_expr = expr + else: + filter_expr = pc.and_(filter_expr, expr) # type: ignore - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - # Put the tables back in the pending queue - with self._batch_lock: - self._pending_batches[source_key] = ( - pending_tables + self._pending_batches[source_key] - ) - raise + return dataset.to_table() def add_record( self, source_path: tuple[str, ...], entry_id: str, arrow_data: pa.Table, - ignore_duplicate: bool = False, force_flush: bool = False, ) -> pa.Table: """ @@ -605,24 +402,22 @@ def add_record( self._validate_source_path(source_path) source_key = self._get_source_key(source_path) - # Check for existing entry if needed (only for immediate duplicates, not batch) - if ( - not ignore_duplicate - and self.duplicate_entry_behavior == "error" - and not force_flush - ): + # Check for existing entry + if self.duplicate_entry_behavior == "error": # Only check existing table, not pending batch for performance - existing_record = self.get_record(source_path, entry_id) + pending_table = self._pending_batches[source_key].get(entry_id, None) + if pending_table is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in pending batch for {source_key}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + existing_record = self.get_record(source_path, entry_id, flush=False) if existing_record is not None: raise ValueError( f"Entry '{entry_id}' already exists in {'/'.join(source_path)}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." 
) - # Save original schema if this is the first record for this source - if source_key not in self._schema_cache: - self._save_original_schema(source_path, arrow_data.schema) - # Add entry_id column to the data data_with_entry_id = self._ensure_entry_id_column(arrow_data, entry_id) @@ -631,11 +426,10 @@ def add_record( table_path = self._get_table_path(source_path) table_path.mkdir(parents=True, exist_ok=True) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: # Create new table - save original schema first - self._save_original_schema(source_path, arrow_data.schema) write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") logger.debug(f"Created new Delta table for {source_key}") else: @@ -653,7 +447,7 @@ def add_record( ) write_deltalake( - str(table_path), + table_path, data_with_entry_id, mode="append", schema_mode="merge", @@ -662,55 +456,32 @@ def add_record( # Update cache self._delta_table_cache[source_key] = DeltaTable(str(table_path)) else: - # Add to batch - with self._batch_lock: - self._pending_batches[source_key].append(data_with_entry_id) - batch_size = len(self._pending_batches[source_key]) + # Add to the batch for later flushing + self._pending_batches[source_key][entry_id] = data_with_entry_id + batch_size = len(self._pending_batches[source_key]) # Check if we need to flush if batch_size >= self.batch_size: - self._flush_batch(source_path) + self.flush_batch(source_path) logger.debug(f"Added record {entry_id} to {source_key}") return arrow_data - def flush_batch(self, source_path: tuple[str, ...]) -> None: - """ - Manually flush pending batch for a specific source path. - - Args: - source_path: Tuple of path components - """ - self._flush_batch(source_path) - - def flush_all_batches(self) -> None: - """Flush all pending batches.""" - with self._batch_lock: - source_keys = list(self._pending_batches.keys()) - - for source_key in source_keys: - source_path = tuple(source_key.split("/")) - try: - self._flush_batch(source_path) - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - - def get_pending_batch_info(self) -> Dict[str, int]: + def get_pending_batch_info(self) -> dict[str, int]: """ Get information about pending batches. Returns: Dictionary mapping source keys to number of pending records """ - with self._batch_lock: - return { - source_key: len(tables) - for source_key, tables in self._pending_batches.items() - if tables - } + return { + source_key: len(tables) + for source_key, tables in self._pending_batches.items() + if tables + } def get_record( - self, source_path: tuple[str, ...], entry_id: str + self, source_path: tuple[str, ...], entry_id: str, flush: bool = False ) -> pa.Table | None: """ Get a specific record by entry_id with schema preservation. 
@@ -720,20 +491,26 @@ def get_record( entry_id: Unique identifier for the record Returns: - Arrow table for the record with original schema, or None if not found + Arrow table for the record or None if not found """ + if flush: + self.flush_batch(source_path) self._validate_source_path(source_path) - delta_table = self._get_or_create_delta_table(source_path) + # check if entry_id is found in pending batches + source_key = self._get_source_key(source_path) + if entry_id in self._pending_batches[source_key]: + # Return the pending record directly + return self._pending_batches[source_key][entry_id] + + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return None try: # Use schema-preserving read filter_expr = self._create_entry_id_filter(entry_id) - result = self._read_table_with_schema_preservation( - delta_table, source_path, filters=filter_expr - ) + result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None @@ -748,7 +525,11 @@ def get_record( raise e def get_all_records( - self, source_path: tuple[str, ...], add_entry_id_column: bool | str = False + self, + source_path: tuple[str, ...], + add_entry_id_column: bool | str = False, + retrieve_pending: bool = True, + flush: bool = False, ) -> pa.Table | None: """ Retrieve all records for a given source path as a single table with schema preservation. @@ -763,28 +544,43 @@ def get_all_records( Returns: Arrow table containing all records with original schema, or None if no records found """ + if flush: + self.flush_batch(source_path) self._validate_source_path(source_path) - delta_table = self._get_or_create_delta_table(source_path) - if delta_table is None: - return None + collected_arrays = [] + if retrieve_pending: + # Check if there are pending records in the batch + for entry_id, arrow_table in self._pending_batches[ + self._get_source_key(source_path) + ].items(): + collected_arrays.append( + self._ensure_entry_id_column(arrow_table, entry_id) + ) - try: - # Use schema-preserving read - result = self._read_table_with_schema_preservation(delta_table, source_path) + delta_table = self._get_existing_delta_table(source_path) + if delta_table is not None: + try: + # Use filter-based read + result = self._read_table_with_filter(delta_table) - if len(result) == 0: - return None + if len(result) != 0: + collected_arrays.append(result) + + except Exception as e: + logger.error( + f"Error getting all records from {'/'.join(source_path)}: {e}" + ) + if collected_arrays: + total_table = pa.Table.concatenate(collected_arrays) # Handle entry_id column based on parameter - return self._handle_entry_id_column(result, add_entry_id_column) + return self._handle_entry_id_column(total_table, add_entry_id_column) - except Exception as e: - logger.error(f"Error getting all records from {'/'.join(source_path)}: {e}") - return None + return None def get_all_records_as_polars( - self, source_path: tuple[str, ...] + self, source_path: tuple[str, ...], flush: bool = True ) -> pl.LazyFrame | None: """ Retrieve all records for a given source path as a single Polars LazyFrame. 
@@ -795,7 +591,7 @@ def get_all_records_as_polars( Returns: Polars LazyFrame containing all records, or None if no records found """ - all_records = self.get_all_records(source_path) + all_records = self.get_all_records(source_path, flush=flush) if all_records is None: return None return pl.LazyFrame(all_records) @@ -806,6 +602,7 @@ def get_records_by_ids( entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, + flush: bool = False, ) -> pa.Table | None: """ Retrieve records by entry IDs as a single table with schema preservation. @@ -819,6 +616,9 @@ def get_records_by_ids( Returns: Arrow table containing all found records with original schema, or None if no records found """ + if flush: + self.flush_batch(source_path) + self._validate_source_path(source_path) # Convert input to list of strings for consistency @@ -839,16 +639,14 @@ def get_records_by_ids( f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" ) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return None try: # Use schema-preserving read with filters filter_expr = self._create_entry_ids_filter(entry_ids_list) - result = self._read_table_with_schema_preservation( - delta_table, source_path, filters=filter_expr - ) + result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None @@ -881,6 +679,7 @@ def get_records_by_ids_as_polars( entry_ids: list[str] | pl.Series | pa.Array, add_entry_id_column: bool | str = False, preserve_input_order: bool = False, + flush: bool = False, ) -> pl.LazyFrame | None: """ Retrieve records by entry IDs as a single Polars LazyFrame. 
@@ -895,7 +694,11 @@ def get_records_by_ids_as_polars( Polars LazyFrame containing all found records, or None if no records found """ arrow_result = self.get_records_by_ids( - source_path, entry_ids, add_entry_id_column, preserve_input_order + source_path, + entry_ids, + add_entry_id_column, + preserve_input_order, + flush=flush, ) if arrow_result is None: @@ -947,7 +750,7 @@ def delete_source(self, source_path: tuple[str, ...]) -> bool: self._validate_source_path(source_path) # Flush any pending batches first - self._flush_batch(source_path) + self.flush_batch(source_path) table_path = self._get_table_path(source_path) source_key = self._get_source_key(source_path) @@ -990,16 +793,14 @@ def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: # Flush any pending batches first self._flush_batch(source_path) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return False try: # Check if record exists using proper filter filter_expr = self._create_entry_id_filter(entry_id) - existing = self._read_table_with_schema_preservation( - delta_table, source_path, filters=filter_expr - ) + existing = self._read_table_with_filter(delta_table, filters=filter_expr) if len(existing) == 0: return False @@ -1033,7 +834,7 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: """ self._validate_source_path(source_path) - delta_table = self._get_or_create_delta_table(source_path) + delta_table = self._get_existing_delta_table(source_path) if delta_table is None: return None @@ -1047,14 +848,10 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: pending_info = self.get_pending_batch_info() pending_count = pending_info.get(source_key, 0) - # Get original schema info - original_schema = self._load_original_schema(source_path) - return { "path": str(self._get_table_path(source_path)), "source_path": source_path, "schema": schema, - "original_schema": original_schema, "version": delta_table.version(), "num_files": len(delta_table.files()), "history_length": len(history), @@ -1065,16 +862,3 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: except Exception as e: logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") return None - - def get_original_schema(self, source_path: tuple[str, ...]) -> pa.Schema | None: - """ - Get the original schema (without __entry_id column) for a source path. - - Args: - source_path: Tuple of path components - - Returns: - Original Arrow schema or None if not found - """ - self._validate_source_path(source_path) - return self._load_original_schema(source_path) From 663082b6f7a3feae6dc18a2cea37ef5e3b475a63 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 10 Jul 2025 23:42:59 +0000 Subject: [PATCH 097/224] feat: add protocols --- src/orcapod/protocols/__init__.py | 0 src/orcapod/protocols/data_protocols.py | 247 ++++++++++++++++++++ src/orcapod/protocols/hashing_protocols.py | 139 +++++++++++ src/orcapod/protocols/semantic_protocols.py | 38 +++ src/orcapod/protocols/store_protocols.py | 0 src/orcapod/protocols/types.py | 51 ++++ 6 files changed, 475 insertions(+) create mode 100644 src/orcapod/protocols/__init__.py create mode 100644 src/orcapod/protocols/data_protocols.py create mode 100644 src/orcapod/protocols/hashing_protocols.py create mode 100644 src/orcapod/protocols/semantic_protocols.py create mode 100644 src/orcapod/protocols/store_protocols.py create mode 100644 src/orcapod/protocols/types.py diff --git a/src/orcapod/protocols/__init__.py b/src/orcapod/protocols/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py new file mode 100644 index 0000000..677aab6 --- /dev/null +++ b/src/orcapod/protocols/data_protocols.py @@ -0,0 +1,247 @@ +from typing import Protocol +from orcapod.types import DataValue, TypeSpec +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from collections.abc import Iterator, Collection +import pyarrow as pa +from datetime import datetime + + +class Datagram(Protocol): + @property + def typespec(self) -> TypeSpec: ... + + def keys(self) -> Collection[str]: ... + + def as_table(self) -> pa.Table: ... + + def as_dict(self) -> dict[str, DataValue]: ... + + +class Tag(Datagram, Protocol): ... + + +class Packet(Datagram, Protocol): + def as_table(self, include_source: bool = False) -> pa.Table: + """ + Convert the packet to a PyArrow Table. + If include_source is True, the source information is included in the table. + """ + ... + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Convert the packet to a dictionary. + If include_source is True, the source information is included in the dictionary. + """ + ... + + def content_hash(self) -> str: ... + + def source_info(self) -> dict[str, str | None]: ... + + # def join(self, other: "Packet") -> "Packet": ... + + # def get_as(self, packet_type: PacketType) -> PacketType: ... + + +class PodFunction(Protocol): + """ + A function suitable to be used in a FunctionPod. + It takes one or more named arguments, each corresponding to either: + - A path to a file or directory (PathSet) - for backward compatibility + - A simple data value (str, int, float, bool, bytes, Path) + and returns either None, a single value, or a list of values + """ + + def __call__(self, **kwargs: DataValue) -> None | DataValue: ... + + +class Labelable(Protocol): + """ + A protocol for objects that can have a label. + This is used to provide a human-readable name for the object. + """ + + @property + def label(self) -> str | None: + """ + Return the label of the object. + If no label is set, return None. + """ + ... + + +class Kernel(ContentIdentifiable, Labelable, Protocol): + """ + Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. + It is the base class for all computations and transformations that can be performed on a collection of streams + (including an empty collection). + A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input + and returns a new stream as output (note that output stream is always singular). 
+ Each "invocation" of the kernel on a collection of streams is assigned a unique ID. + The corresponding invocation information is stored as Invocation object and attached to the output stream + for computational graph tracking. + """ + + def __call__( + self, *streams: "Stream", label: str | None = None, **kwargs + ) -> "Stream": + """ + This is the main interface for invoking the kernel and perform any side-effects such as registering the invocation with the computational graph. + This method should be called with a collection of streams, which can be empty, and is expected to trigger + the call to the forward method of the kernel. + """ + ... + + def forward(self, *streams: "Stream") -> "Stream": + """ + Trigger the main computation of the kernel on a collection of streams. + This method is called when the kernel is invoked with a collection of streams. + Subclasses should override this method to provide the kernel with its unique behavior. + The method should return a new stream that represents the output of the kernel, but should not register the invocation + with the computational graph, allowing for the computation to be performed without side effects. + """ + ... + + def types(self, *streams: "Stream") -> tuple[TypeSpec, TypeSpec]: ... + + def validate_inputs(self, *streams: "Stream") -> None: ... + + +class Pod(Kernel, Protocol): + @property + def input_typespec(self) -> TypeSpec: ... + + @property + def output_typespec(self) -> TypeSpec: ... + + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + """ + Call the function pod with a single input packet. + This is used to invoke the function pod with a single packet. + """ + ... + + +class Stream(ContentIdentifiable, Labelable, Protocol): + """ + A stream that is generated by an invocation of a kernel. + This stream is used to represent the output of a kernel invocation. + It is a concrete implementation of the SyncStream that has an associated + invocation that generated the stream. + """ + + @property + def source(self) -> Kernel | None: ... + + @property + def upstreams(self) -> tuple["Stream", ...]: ... + + @property + def last_modified(self) -> datetime | None: + """ + Returns when the stream's content was last modified. + + Returns: + datetime: Timestamp of last modification (cacheable streams) + None: Content is never stable - always recompute + (async streams, dynamic streams, etc.) + """ + ... + + @property + def is_current(self) -> bool: + """ + Returns whether the stream is current. + A stream is current if the content is up-to-date with respect to its source. + This can be used to determine if a stream with non-None last_modified is up-to-date. + Note that for asynchronous streams, this status is not applicable and always returns False. + """ + ... + + def as_table(self) -> pa.Table: + """ + Convert the stream to a PyArrow Table. + To avoid collision, tags should be prefixed with "_tag_". + """ + ... + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: ... + + def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: ... + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Return the keys of the pipeline property. + This is used to define the keys of the pipeline property. + """ + ... + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Return the types of the pipeline property. + This is used to define the types of the graph property. + """ + ... + + +class Source(Kernel, Stream, Protocol): + """ + A source is a special type of kernel that produces a stream of data. 
+ It is the entry point for data into the computational graph. + Sources are typically used to read data from external sources such as files, databases, etc. + """ + + +class Tracker(Protocol): + def set_active(self, active: bool = True) -> None: + """ + Set the active state of the tracker. + This is used to activate or deactivate the tracker. + If the tracker is active, it will record the invocations of kernels. + """ + ... + + def is_active(self) -> bool: + """ + Check if the tracker is active. + This is used to determine if the tracker is currently recording invocations. + """ + ... + + def record(self, stream: Stream) -> None: + """ + Record the output stream of a kernel invocation in the tracker. + This is used to track the computational graph and the invocations of kernels. + """ + ... + + +class TrackerManager(Protocol): + def get_active_trackers(self) -> list[Tracker]: + """ + Get the list of active trackers. + This is used to retrieve the currently active trackers in the system. + """ + ... + + def register_tracker(self, tracker: Tracker) -> None: + """ + Register a new tracker in the system. + This is used to add a new tracker to the list of active trackers. + """ + ... + + def deregister_tracker(self, tracker: Tracker) -> None: + """ + Deregister a tracker from the system. + This is used to remove a tracker from the list of active trackers. + """ + ... + + def record(self, stream: Stream) -> None: + """ + Record the output stream of a kernel invocation in the tracker. + This is used to track the computational graph and the invocations of kernels. + """ + ... diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py new file mode 100644 index 0000000..7c14e2e --- /dev/null +++ b/src/orcapod/protocols/hashing_protocols.py @@ -0,0 +1,139 @@ +"""Hash strategy protocols for dependency injection.""" + +from collections.abc import Callable +from typing import Any, Protocol, runtime_checkable +import uuid + +from orcapod.types import TypeSpec, PathLike +import pyarrow as pa + + +@runtime_checkable +class ContentIdentifiable(Protocol): + """Protocol for objects that can provide an identity structure.""" + + def identity_structure(self) -> Any: + """ + Return a structure that represents the identity of this object. + + Returns: + Any: A structure representing this object's content. + Should be deterministic and include all identity-relevant data. + Return None to indicate no custom identity is available. + """ + ... + + def __eq__(self, other: object) -> bool: + """ + Equality check that compares the identity structures of two objects. + + Args: + other (object): The object to compare with. + + Returns: + bool: True if the identity structures are equal, False otherwise. + """ + ... + + def __hash__(self) -> int: + """ + Hash implementation that uses the identity structure if provided, + otherwise falls back to the default hash. + + Returns: + int: A hash value based on either content or identity. + """ + ... + + +class ObjectHasher(Protocol): + """Protocol for general object hashing.""" + + # TODO: consider more explicitly stating types of objects accepted + def hash(self, obj: Any) -> bytes: + """ + Hash an object to a byte representation. + + Args: + obj (Any): The object to hash. + + Returns: + bytes: The byte representation of the hash. + """ + ... + + def get_hasher_id(self) -> str: + """ + Returns a unique identifier/name assigned to the hasher + """ + ... 
+ + def hash_to_hex( + self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False + ) -> str: ... + + def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: + """ + Hash an object to an integer. + + Args: + obj (Any): The object to hash. + hexdigits (int): Number of hexadecimal digits to use for the hash. + + Returns: + int: The integer representation of the hash. + """ + ... + + def hash_to_uuid( + self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID + ) -> uuid.UUID: ... + + +class FileContentHasher(Protocol): + """Protocol for file-related hashing.""" + + def hash_file(self, file_path: PathLike) -> bytes: ... + + +class ArrowHasher(Protocol): + """Protocol for hashing arrow packets.""" + + def get_hasher_id(self) -> str: ... + + def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... + + +class StringCacher(Protocol): + """Protocol for caching string key value pairs.""" + + def get_cached(self, cache_key: str) -> str | None: ... + def set_cached(self, cache_key: str, value: str) -> None: ... + def clear_cache(self) -> None: ... + + +class FunctionInfoExtractor(Protocol): + """Protocol for extracting function information.""" + + def extract_function_info( + self, + func: Callable[..., Any], + function_name: str | None = None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | None = None, + ) -> dict[str, Any]: ... + + +class SemanticTypeHasher(Protocol): + """Abstract base class for semantic type-specific hashers.""" + + def hash_column( + self, + column: pa.Array, + ) -> pa.Array: + """Hash a column with this semantic type and return the hash bytes.""" + ... + + def set_cacher(self, cacher: StringCacher) -> None: + """Add a string cacher for caching hash values.""" + ... diff --git a/src/orcapod/protocols/semantic_protocols.py b/src/orcapod/protocols/semantic_protocols.py new file mode 100644 index 0000000..5458cad --- /dev/null +++ b/src/orcapod/protocols/semantic_protocols.py @@ -0,0 +1,38 @@ +from typing import Protocol, Any + + +class TypeHandler(Protocol): + """Protocol for handling conversion between Python type and Arrow + data types used for storage. + + The handler itself IS the definition of a semantic type. The semantic type + name/identifier is provided by the registerer when registering the handler. + + TypeHandlers should clearly communicate what Python types they can handle, + and focus purely on conversion logic. + """ + + def python_type(self) -> type: + """Return the Python type(s) this handler can process. + + Returns: + Python type the handler supports + + Examples: + - PathHandler: return Path + - NumericHandler: return (int, float) + - CollectionHandler: return (list, tuple, set) + """ + ... + + def storage_type(self) -> type: + """Return the Arrow DataType instance for schema definition.""" + ... + + def python_to_storage(self, value: Any) -> Any: + """Convert Python value to Arrow-compatible storage representation.""" + ... + + def storage_to_python(self, value: Any) -> Any: + """Convert storage representation back to Python object.""" + ... 
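For illustration, a minimal class satisfying the TypeHandler protocol above might look like the sketch below. The class name and the choice of plain strings as the storage representation are assumptions for this example only; the actual PathHandler registered elsewhere in the package may behave differently.

from pathlib import Path
from typing import Any


class SketchPathHandler:
    """Illustrative TypeHandler sketch: stores pathlib.Path values as plain strings."""

    def python_type(self) -> type:
        # This handler accepts pathlib.Path values.
        return Path

    def storage_type(self) -> type:
        # Assumed storage representation for this sketch: plain strings.
        return str

    def python_to_storage(self, value: Any) -> Any:
        return str(value)

    def storage_to_python(self, value: Any) -> Any:
        return Path(value)


# e.g. SketchPathHandler().python_to_storage(Path("/data/img.png")) returns "/data/img.png"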
diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/protocols/types.py b/src/orcapod/protocols/types.py new file mode 100644 index 0000000..73e67f1 --- /dev/null +++ b/src/orcapod/protocols/types.py @@ -0,0 +1,51 @@ +# from typing import TypeAlias +# from collections.abc import Collection, Mapping +# from pathlib import Path +# import logging +# import os + +# logger = logging.getLogger(__name__) + + +# # class TypeSpec(dict[str, DataType]): +# # def __init__(self, *args, **kwargs): +# # """ +# # TypeSpec is a mapping of parameter names to their types. +# # It can be used to define the expected types of parameters in a function or a pod. +# # """ +# # super().__init__(*args, **kwargs) + + +# # Convenience alias for anything pathlike +# PathLike: TypeAlias = str | os.PathLike + +# # an (optional) string or a collection of (optional) string values +# # Note that TagValue can be nested, allowing for an arbitrary depth of nested lists +# TagValue: TypeAlias = int | str | None | Collection["TagValue"] + +# # the top level tag is a mapping from string keys to values that can be a string or +# # an arbitrary depth of nested list of strings or None +# Tag: TypeAlias = Mapping[str, TagValue] + +# # a pathset is a path or an arbitrary depth of nested list of paths +# PathSet: TypeAlias = PathLike | Collection[PathLike | None] + +# # Simple data types that we support (with clear Polars correspondence) +# SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes + +# ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike + +# TypeSpec = dict[str, type] # Mapping of parameter names to their types + +# # Extended data values that can be stored in packets +# # Either the original PathSet or one of our supported simple data types +# DataValue: TypeAlias = ( +# PathSet +# | SupportedNativePythonData +# | None +# | Collection["DataValue"] +# | Mapping[str, "DataValue"] +# ) + + +# PacketLike = Mapping[str, DataValue] From 37ed8e8713929c55f812ea265f03614f2e4d35f0 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 10 Jul 2025 23:48:42 +0000 Subject: [PATCH 098/224] refactor: use protocols in hashing package --- src/orcapod/hashing/__init__.py | 12 +- src/orcapod/hashing/arrow_hashers.py | 2 +- src/orcapod/hashing/defaults.py | 47 ++- src/orcapod/hashing/file_hashers.py | 345 +++++++++--------- .../hashing/function_info_extractors.py | 2 +- src/orcapod/hashing/hash_utils.py | 7 +- src/orcapod/hashing/legacy_core.py | 23 +- src/orcapod/hashing/object_hashers.py | 54 ++- src/orcapod/hashing/semantic_type_hashers.py | 6 +- src/orcapod/hashing/string_cachers.py | 2 +- src/orcapod/hashing/versioned_hashers.py | 2 +- src/orcapod/types/core.py | 45 ++- src/orcapod/types/packet_converter.py | 182 --------- 13 files changed, 297 insertions(+), 432 deletions(-) delete mode 100644 src/orcapod/types/packet_converter.py diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index b1e5849..eb94afe 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -2,21 +2,11 @@ get_default_object_hasher, get_default_arrow_hasher, ) -from .types import ( - FileContentHasher, - LegacyPacketHasher, - ArrowHasher, - ObjectHasher, - StringCacher, - FunctionInfoExtractor, - LegacyCompositeFileHasher, -) -from .content_identifiable import ContentIdentifiableBase + __all__ = [ "FileContentHasher", "LegacyPacketHasher", - "ArrowHasher", "StringCacher", "ObjectHasher", "LegacyCompositeFileHasher", diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 465b29b..2b66b52 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -3,7 +3,7 @@ import pyarrow as pa import polars as pl import json -from orcapod.hashing.types import SemanticTypeHasher, StringCacher +from orcapod.protocols.hashing_protocols import SemanticTypeHasher, StringCacher from orcapod.hashing import arrow_serialization from collections.abc import Callable diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index 3bae548..c9e404b 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -1,13 +1,8 @@ # A collection of utility function that provides a "default" implementation of hashers. # This is often used as the fallback hasher in the library code. -from orcapod.hashing.types import ( - LegacyCompositeFileHasher, - ArrowHasher, - StringCacher, -) -from orcapod.hashing.file_hashers import LegacyPathLikeHasherFactory +from orcapod.protocols import hashing_protocols as hp + from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.hashing.object_hashers import ObjectHasher from orcapod.hashing.object_hashers import LegacyObjectHasher from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory from orcapod.hashing.versioned_hashers import ( @@ -17,8 +12,8 @@ def get_default_arrow_hasher( - cache_file_hash: bool | StringCacher = True, -) -> ArrowHasher: + cache_file_hash: bool | hp.StringCacher = True, +) -> hp.ArrowHasher: """ Get the default Arrow hasher with semantic type support. If `cache_file_hash` is True, it uses an in-memory cacher for caching hash values. If a `StringCacher` is provided, it uses that for caching file hashes. 
@@ -26,22 +21,22 @@ def get_default_arrow_hasher( arrow_hasher = get_versioned_semantic_arrow_hasher() if cache_file_hash: # use unlimited caching - if isinstance(cache_file_hash, StringCacher): - string_cacher = cache_file_hash - else: + if cache_file_hash is True: string_cacher = InMemoryCacher(max_size=None) + else: + string_cacher = cache_file_hash arrow_hasher.set_cacher("path", string_cacher) return arrow_hasher -def get_default_object_hasher() -> ObjectHasher: +def get_default_object_hasher() -> hp.ObjectHasher: object_hasher = get_versioned_object_hasher() return object_hasher -def get_legacy_object_hasher() -> ObjectHasher: +def get_legacy_object_hasher() -> hp.ObjectHasher: function_info_extractor = ( FunctionInfoExtractorFactory.create_function_info_extractor( strategy="signature" @@ -50,17 +45,17 @@ def get_legacy_object_hasher() -> ObjectHasher: return LegacyObjectHasher(function_info_extractor=function_info_extractor) -def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHasher: - if with_cache: - # use unlimited caching - string_cacher = InMemoryCacher(max_size=None) - return LegacyPathLikeHasherFactory.create_cached_legacy_composite(string_cacher) - return LegacyPathLikeHasherFactory.create_basic_legacy_composite() +# def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHasher: +# if with_cache: +# # use unlimited caching +# string_cacher = InMemoryCacher(max_size=None) +# return LegacyPathLikeHasherFactory.create_cached_legacy_composite(string_cacher) +# return LegacyPathLikeHasherFactory.create_basic_legacy_composite() -def get_default_composite_file_hasher_with_cacher( - cacher=None, -) -> LegacyCompositeFileHasher: - if cacher is None: - cacher = InMemoryCacher(max_size=None) - return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) +# def get_default_composite_file_hasher_with_cacher( +# cacher=None, +# ) -> LegacyCompositeFileHasher: +# if cacher is None: +# cacher = InMemoryCacher(max_size=None) +# return LegacyPathLikeHasherFactory.create_cached_legacy_composite(cacher) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index f0ca8d1..d5fc761 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,13 +1,10 @@ from orcapod.hashing import legacy_core from orcapod.hashing.hash_utils import hash_file -from orcapod.hashing.types import ( +from orcapod.protocols.hashing_protocols import ( FileContentHasher, StringCacher, - LegacyFileHasher, - LegacyPathSetHasher, - LegacyCompositeFileHasher, ) -from orcapod.types import PacketLike, PathLike, PathSet +from orcapod.types import PathLike, PathSet, PacketLike class BasicFileHasher: @@ -52,172 +49,172 @@ def hash_file(self, file_path: PathLike) -> bytes: # ----------------Legacy implementations for backward compatibility----------------- -class LegacyDefaultFileHasher: - def __init__( - self, - algorithm: str = "sha256", - buffer_size: int = 65536, - ): - self.algorithm = algorithm - self.buffer_size = buffer_size - - def hash_file(self, file_path: PathLike) -> str: - return legacy_core.hash_file( - file_path, algorithm=self.algorithm, buffer_size=self.buffer_size - ) - - -class LegacyCachedFileHasher: - """File hasher with caching.""" - - def __init__( - self, - file_hasher: LegacyFileHasher, - string_cacher: StringCacher, - ): - self.file_hasher = file_hasher - self.string_cacher = string_cacher - - def hash_file(self, file_path: PathLike) -> str: - cache_key = f"file:{file_path}" - 
cached_value = self.string_cacher.get_cached(cache_key) - if cached_value is not None: - return cached_value - - value = self.file_hasher.hash_file(file_path) - self.string_cacher.set_cached(cache_key, value) - return value - - -class LegacyDefaultPathsetHasher: - """Default pathset hasher that composes file hashing.""" - - def __init__( - self, - file_hasher: LegacyFileHasher, - char_count: int | None = 32, - ): - self.file_hasher = file_hasher - self.char_count = char_count - - def _hash_file_to_hex(self, file_path: PathLike) -> str: - return self.file_hasher.hash_file(file_path) - - def hash_pathset(self, pathset: PathSet) -> str: - """Hash a pathset using the injected file hasher.""" - return legacy_core.hash_pathset( - pathset, - char_count=self.char_count, - file_hasher=self.file_hasher.hash_file, # Inject the method - ) - - -class LegacyDefaultPacketHasher: - """Default packet hasher that composes pathset hashing.""" - - def __init__( - self, - pathset_hasher: LegacyPathSetHasher, - char_count: int | None = 32, - prefix: str = "", - ): - self.pathset_hasher = pathset_hasher - self.char_count = char_count - self.prefix = prefix - - def _hash_pathset_to_hex(self, pathset: PathSet): - return self.pathset_hasher.hash_pathset(pathset) - - def hash_packet(self, packet: PacketLike) -> str: - """Hash a packet using the injected pathset hasher.""" - hash_str = legacy_core.hash_packet( - packet, - char_count=self.char_count, - prefix_algorithm=False, # Will apply prefix on our own - pathset_hasher=self._hash_pathset_to_hex, # Inject the method - ) - return f"{self.prefix}-{hash_str}" if self.prefix else hash_str - - -# Convenience composite implementation -class LegacyDefaultCompositeFileHasher: - """Composite hasher that implements all interfaces.""" - - def __init__( - self, - file_hasher: LegacyFileHasher, - char_count: int | None = 32, - packet_prefix: str = "", - ): - self.file_hasher = file_hasher - self.pathset_hasher = LegacyDefaultPathsetHasher(self.file_hasher, char_count) - self.packet_hasher = LegacyDefaultPacketHasher( - self.pathset_hasher, char_count, packet_prefix - ) - - def hash_file(self, file_path: PathLike) -> str: - return self.file_hasher.hash_file(file_path) - - def hash_pathset(self, pathset: PathSet) -> str: - return self.pathset_hasher.hash_pathset(pathset) - - def hash_packet(self, packet: PacketLike) -> str: - return self.packet_hasher.hash_packet(packet) - - -# Factory for easy construction -class LegacyPathLikeHasherFactory: - """Factory for creating various hasher combinations.""" - - @staticmethod - def create_basic_legacy_composite( - algorithm: str = "sha256", - buffer_size: int = 65536, - char_count: int | None = 32, - ) -> LegacyCompositeFileHasher: - """Create a basic composite hasher.""" - file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) - # use algorithm as the prefix for the packet hasher - return LegacyDefaultCompositeFileHasher( - file_hasher, char_count, packet_prefix=algorithm - ) - - @staticmethod - def create_cached_legacy_composite( - string_cacher: StringCacher, - algorithm: str = "sha256", - buffer_size: int = 65536, - char_count: int | None = 32, - ) -> LegacyCompositeFileHasher: - """Create a composite hasher with file caching.""" - basic_file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) - cached_file_hasher = LegacyCachedFileHasher(basic_file_hasher, string_cacher) - return LegacyDefaultCompositeFileHasher( - cached_file_hasher, char_count, packet_prefix=algorithm - ) - - @staticmethod - def 
create_legacy_file_hasher( - string_cacher: StringCacher | None = None, - algorithm: str = "sha256", - buffer_size: int = 65536, - ) -> LegacyFileHasher: - """Create just a file hasher, optionally with caching.""" - default_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) - if string_cacher is None: - return default_hasher - else: - return LegacyCachedFileHasher(default_hasher, string_cacher) - - @staticmethod - def create_file_hasher( - string_cacher: StringCacher | None = None, - algorithm: str = "sha256", - buffer_size: int = 65536, - ) -> FileContentHasher: - """Create just a file hasher, optionally with caching.""" - basic_hasher = BasicFileHasher(algorithm, buffer_size) - if string_cacher is None: - return basic_hasher - else: - return CachedFileHasher(basic_hasher, string_cacher) +# class LegacyDefaultFileHasher: +# def __init__( +# self, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# ): +# self.algorithm = algorithm +# self.buffer_size = buffer_size + +# def hash_file(self, file_path: PathLike) -> str: +# return legacy_core.hash_file( +# file_path, algorithm=self.algorithm, buffer_size=self.buffer_size +# ) + + +# class LegacyCachedFileHasher: +# """File hasher with caching.""" + +# def __init__( +# self, +# file_hasher: LegacyFileHasher, +# string_cacher: StringCacher, +# ): +# self.file_hasher = file_hasher +# self.string_cacher = string_cacher + +# def hash_file(self, file_path: PathLike) -> str: +# cache_key = f"file:{file_path}" +# cached_value = self.string_cacher.get_cached(cache_key) +# if cached_value is not None: +# return cached_value + +# value = self.file_hasher.hash_file(file_path) +# self.string_cacher.set_cached(cache_key, value) +# return value + + +# class LegacyDefaultPathsetHasher: +# """Default pathset hasher that composes file hashing.""" + +# def __init__( +# self, +# file_hasher: LegacyFileHasher, +# char_count: int | None = 32, +# ): +# self.file_hasher = file_hasher +# self.char_count = char_count + +# def _hash_file_to_hex(self, file_path: PathLike) -> str: +# return self.file_hasher.hash_file(file_path) + +# def hash_pathset(self, pathset: PathSet) -> str: +# """Hash a pathset using the injected file hasher.""" +# return legacy_core.hash_pathset( +# pathset, +# char_count=self.char_count, +# file_hasher=self.file_hasher.hash_file, # Inject the method +# ) + + +# class LegacyDefaultPacketHasher: +# """Default packet hasher that composes pathset hashing.""" + +# def __init__( +# self, +# pathset_hasher: LegacyPathSetHasher, +# char_count: int | None = 32, +# prefix: str = "", +# ): +# self.pathset_hasher = pathset_hasher +# self.char_count = char_count +# self.prefix = prefix + +# def _hash_pathset_to_hex(self, pathset: PathSet): +# return self.pathset_hasher.hash_pathset(pathset) + +# def hash_packet(self, packet: PacketLike) -> str: +# """Hash a packet using the injected pathset hasher.""" +# hash_str = legacy_core.hash_packet( +# packet, +# char_count=self.char_count, +# prefix_algorithm=False, # Will apply prefix on our own +# pathset_hasher=self._hash_pathset_to_hex, # Inject the method +# ) +# return f"{self.prefix}-{hash_str}" if self.prefix else hash_str + + +# # Convenience composite implementation +# class LegacyDefaultCompositeFileHasher: +# """Composite hasher that implements all interfaces.""" + +# def __init__( +# self, +# file_hasher: LegacyFileHasher, +# char_count: int | None = 32, +# packet_prefix: str = "", +# ): +# self.file_hasher = file_hasher +# self.pathset_hasher = LegacyDefaultPathsetHasher(self.file_hasher, 
char_count) +# self.packet_hasher = LegacyDefaultPacketHasher( +# self.pathset_hasher, char_count, packet_prefix +# ) + +# def hash_file(self, file_path: PathLike) -> str: +# return self.file_hasher.hash_file(file_path) + +# def hash_pathset(self, pathset: PathSet) -> str: +# return self.pathset_hasher.hash_pathset(pathset) + +# def hash_packet(self, packet: PacketLike) -> str: +# return self.packet_hasher.hash_packet(packet) + + +# # Factory for easy construction +# class LegacyPathLikeHasherFactory: +# """Factory for creating various hasher combinations.""" + +# @staticmethod +# def create_basic_legacy_composite( +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# char_count: int | None = 32, +# ) -> LegacyCompositeFileHasher: +# """Create a basic composite hasher.""" +# file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) +# # use algorithm as the prefix for the packet hasher +# return LegacyDefaultCompositeFileHasher( +# file_hasher, char_count, packet_prefix=algorithm +# ) + +# @staticmethod +# def create_cached_legacy_composite( +# string_cacher: StringCacher, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# char_count: int | None = 32, +# ) -> LegacyCompositeFileHasher: +# """Create a composite hasher with file caching.""" +# basic_file_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) +# cached_file_hasher = LegacyCachedFileHasher(basic_file_hasher, string_cacher) +# return LegacyDefaultCompositeFileHasher( +# cached_file_hasher, char_count, packet_prefix=algorithm +# ) + +# @staticmethod +# def create_legacy_file_hasher( +# string_cacher: StringCacher | None = None, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# ) -> LegacyFileHasher: +# """Create just a file hasher, optionally with caching.""" +# default_hasher = LegacyDefaultFileHasher(algorithm, buffer_size) +# if string_cacher is None: +# return default_hasher +# else: +# return LegacyCachedFileHasher(default_hasher, string_cacher) + +# @staticmethod +# def create_file_hasher( +# string_cacher: StringCacher | None = None, +# algorithm: str = "sha256", +# buffer_size: int = 65536, +# ) -> FileContentHasher: +# """Create just a file hasher, optionally with caching.""" +# basic_hasher = BasicFileHasher(algorithm, buffer_size) +# if string_cacher is None: +# return basic_hasher +# else: +# return CachedFileHasher(basic_hasher, string_cacher) diff --git a/src/orcapod/hashing/function_info_extractors.py b/src/orcapod/hashing/function_info_extractors.py index 816208b..27cae33 100644 --- a/src/orcapod/hashing/function_info_extractors.py +++ b/src/orcapod/hashing/function_info_extractors.py @@ -1,4 +1,4 @@ -from .types import FunctionInfoExtractor +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor from collections.abc import Callable from typing import Any, Literal from orcapod.types import TypeSpec diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 0dc0777..476b0a0 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -1,5 +1,6 @@ from typing import Any from .function_info_extractors import FunctionInfoExtractor +from orcapod.protocols.hashing_protocols import ContentIdentifiable import logging import json from uuid import UUID @@ -71,9 +72,9 @@ def process_structure( if obj is None: return None - from .content_identifiable import ContentIdentifiableBase - - if isinstance(obj, ContentIdentifiableBase): + # TODO: currently using runtime_checkable on ContentIdentifiable protocol + # 
Re-evaluate this strategy to see if a faster / more robust check could be used + if isinstance(obj, ContentIdentifiable): logger.debug( f"Processing ContentHashableBase instance of type {type(obj).__name__}" ) diff --git a/src/orcapod/hashing/legacy_core.py b/src/orcapod/hashing/legacy_core.py index a5b4319..e338a89 100644 --- a/src/orcapod/hashing/legacy_core.py +++ b/src/orcapod/hashing/legacy_core.py @@ -1,18 +1,9 @@ -""" -Stable Hashing Library -====================== - -A library for creating stable, content-based hashes that remain consistent across Python sessions, -suitable for arbitrarily nested data structures and custom objects via HashableMixin. -""" - -WARN_NONE_IDENTITY = False import hashlib import inspect import json import logging import zlib -from .types import FunctionInfoExtractor +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor from functools import partial from os import PathLike from pathlib import Path @@ -33,9 +24,19 @@ import xxhash -from orcapod.types import Packet, PacketLike, PathSet +from orcapod.types import PathSet, Packet, PacketLike from orcapod.utils.name import find_noncolliding_name +WARN_NONE_IDENTITY = False +""" +Stable Hashing Library +====================== + +A library for creating stable, content-based hashes that remain consistent across Python sessions, +suitable for arbitrarily nested data structures and custom objects via HashableMixin. +""" + + # Configure logging with __name__ for proper hierarchy logger = logging.getLogger(__name__) diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 3401574..97568f5 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -1,9 +1,57 @@ -from orcapod.hashing.types import FunctionInfoExtractor, ObjectHasher +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor, ObjectHasher from orcapod.hashing import legacy_core from orcapod.hashing import hash_utils +from typing import Any +import uuid +from abc import ABC, abstractmethod + + +class ObjectHasherBase(ABC): + @abstractmethod + def hash(self, obj: object) -> bytes: ... + + @abstractmethod + def get_hasher_id(self) -> str: ... + + def hash_to_hex( + self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False + ) -> str: + hash_bytes = self.hash(obj) + hex_str = hash_bytes.hex() + + # TODO: clean up this logic, as char_count handling is messy + if char_count is not None: + if char_count > len(hex_str): + raise ValueError( + f"Cannot truncate to {char_count} chars, hash only has {len(hex_str)}" + ) + hex_str = hex_str[:char_count] + if prefix_hasher_id: + hex_str = self.get_hasher_id() + "@" + hex_str + return hex_str + + def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: + """ + Hash an object to an integer. + + Args: + obj (Any): The object to hash. + hexdigits (int): Number of hexadecimal digits to use for the hash. + + Returns: + int: The integer representation of the hash. + """ + hex_hash = self.hash_to_hex(obj, char_count=hexdigits) + return int(hex_hash, 16) + + def hash_to_uuid( + self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID + ) -> uuid.UUID: + """Convert hash to proper UUID5.""" + return uuid.uuid5(namespace, self.hash(obj)) -class BasicObjectHasher(ObjectHasher): +class BasicObjectHasher(ObjectHasherBase): """ Default object hasher used throughout the codebase. 
""" @@ -34,7 +82,7 @@ def hash(self, obj: object) -> bytes: ) -class LegacyObjectHasher(ObjectHasher): +class LegacyObjectHasher(ObjectHasherBase): """ Legacy object hasher that returns the string representation of the object. diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py index 5be28b0..4508f95 100644 --- a/src/orcapod/hashing/semantic_type_hashers.py +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -1,4 +1,8 @@ -from orcapod.hashing.types import SemanticTypeHasher, FileContentHasher, StringCacher +from orcapod.protocols.hashing_protocols import ( + SemanticTypeHasher, + FileContentHasher, + StringCacher, +) import os import hashlib import pyarrow as pa diff --git a/src/orcapod/hashing/string_cachers.py b/src/orcapod/hashing/string_cachers.py index 620dece..bb09eff 100644 --- a/src/orcapod/hashing/string_cachers.py +++ b/src/orcapod/hashing/string_cachers.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, TYPE_CHECKING -from orcapod.hashing.types import StringCacher +from orcapod.protocols.hashing_protocols import StringCacher logger = logging.getLogger(__name__) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index d2fec4d..c5c1919 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -1,6 +1,6 @@ # A collection of versioned hashers that provide a "default" implementation of hashers. from .arrow_hashers import SemanticArrowHasher -from .types import ObjectHasher +from orcapod.protocols.hashing_protocols import ObjectHasher import importlib from typing import Any diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 22491ae..98b49b8 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -2,6 +2,8 @@ import os from collections.abc import Collection, Mapping +import logging + DataType: TypeAlias = type @@ -9,6 +11,17 @@ str, DataType ] # Mapping of parameter names to their types +logger = logging.getLogger(__name__) + + +# class TypeSpec(dict[str, DataType]): +# def __init__(self, *args, **kwargs): +# """ +# TypeSpec is a mapping of parameter names to their types. +# It can be used to define the expected types of parameters in a function or a pod. +# """ +# super().__init__(*args, **kwargs) + # Convenience alias for anything pathlike PathLike = str | os.PathLike @@ -27,29 +40,27 @@ # Simple data types that we support (with clear Polars correspondence) SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes -ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike +ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathSet # Extended data values that can be stored in packets # Either the original PathSet or one of our supported simple data types -DataValue: TypeAlias = ( - PathSet - | SupportedNativePythonData - | None - | Collection["DataValue"] - | Mapping[str, "DataValue"] -) +DataValue: TypeAlias = ExtendedSupportedPythonData | Collection["DataValue"] | None +StoreValue: TypeAlias = SupportedNativePythonData | Collection["StoreValue"] | None -class PodFunction(Protocol): - """ - A function suitable to be used in a FunctionPod. 
- It takes one or more named arguments, each corresponding to either: - - A path to a file or directory (PathSet) - for backward compatibility - - A simple data value (str, int, float, bool, bytes, Path) - and returns either None, a single value, or a list of values - """ +PacketLike: TypeAlias = Mapping[str, DataValue] + + +# class PodFunction(Protocol): +# """ +# A function suitable to be used in a FunctionPod. +# It takes one or more named arguments, each corresponding to either: +# - A path to a file or directory (PathSet) - for backward compatibility +# - A simple data value (str, int, float, bool, bytes, Path) +# and returns either None, a single value, or a list of values +# """ - def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... +# def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... class TypeHandler(Protocol): diff --git a/src/orcapod/types/packet_converter.py b/src/orcapod/types/packet_converter.py deleted file mode 100644 index 6edea00..0000000 --- a/src/orcapod/types/packet_converter.py +++ /dev/null @@ -1,182 +0,0 @@ -# from orcapod.types.core import TypeSpec, TypeHandler -# from orcapod.types.packets import Packet, PacketLike -# from orcapod.types.semantic_type_registry import ( -# SemanticTypeRegistry, -# TypeInfo, -# get_metadata_from_schema, -# arrow_to_dicts, -# ) -# from typing import Any -# from collections.abc import Mapping, Sequence -# import pyarrow as pa -# import logging - -# logger = logging.getLogger(__name__) - - -# def is_packet_supported( -# python_type_info: TypeSpec, -# registry: SemanticTypeRegistry, -# type_lut: dict | None = None, -# ) -> bool: -# """Check if all types in the packet are supported by the registry or known to the default lut.""" -# if type_lut is None: -# type_lut = {} -# return all( -# python_type in registry or python_type in type_lut -# for python_type in python_type_info.values() -# ) - - -# class PacketConverter: -# def __init__(self, python_type_spec: TypeSpec, registry: SemanticTypeRegistry): -# self.python_type_spec = python_type_spec -# self.registry = registry - -# # Lookup handlers and type info for fast access -# self.handlers: dict[str, TypeHandler] = {} -# self.storage_type_info: dict[str, TypeInfo] = {} - -# self.expected_key_set = set(python_type_spec.keys()) - -# # prepare the corresponding arrow table schema with metadata -# self.keys_with_handlers, self.schema = create_schema_from_python_type_info( -# python_type_spec, registry -# ) - -# self.semantic_type_lut = get_metadata_from_schema(self.schema, b"semantic_type") - -# def _check_key_consistency(self, keys): -# """Check if the provided keys match the expected keys.""" -# keys_set = set(keys) -# if keys_set != self.expected_key_set: -# missing_keys = self.expected_key_set - keys_set -# extra_keys = keys_set - self.expected_key_set -# error_parts = [] -# if missing_keys: -# error_parts.append(f"Missing keys: {missing_keys}") -# if extra_keys: -# error_parts.append(f"Extra keys: {extra_keys}") - -# raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") - -# def _to_storage_packet(self, packet: PacketLike) -> dict[str, Any]: -# """Convert packet to storage representation. 
- -# Args: -# packet: Dictionary mapping parameter names to Python values - -# Returns: -# Dictionary with same keys but values converted to storage format - -# Raises: -# KeyError: If packet keys don't match the expected type_info keys -# TypeError: If value type doesn't match expected type -# ValueError: If conversion fails -# """ -# # Validate packet keys -# packet_keys = set(packet.keys()) - -# self._check_key_consistency(packet_keys) - -# # Convert each value -# storage_packet: dict[str, Any] = dict(packet) # Start with a copy of the packet - -# for key, handler in self.keys_with_handlers: -# try: -# storage_packet[key] = handler.python_to_storage(storage_packet[key]) -# except Exception as e: -# raise ValueError(f"Failed to convert value for '{key}': {e}") from e - -# return storage_packet - -# def _from_storage_packet(self, storage_packet: Mapping[str, Any]) -> PacketLike: -# """Convert storage packet back to Python packet. - -# Args: -# storage_packet: Dictionary with values in storage format - -# Returns: -# Packet with values converted back to Python types - -# Raises: -# KeyError: If storage packet keys don't match the expected type_info keys -# TypeError: If value type doesn't match expected type -# ValueError: If conversion fails -# """ -# # Validate storage packet keys -# storage_keys = set(storage_packet.keys()) - -# self._check_key_consistency(storage_keys) - -# # Convert each value back to Python type -# packet: PacketLike = dict(storage_packet) - -# for key, handler in self.keys_with_handlers: -# try: -# packet[key] = handler.storage_to_python(storage_packet[key]) -# except Exception as e: -# raise ValueError(f"Failed to convert value for '{key}': {e}") from e - -# return packet - -# def to_arrow_table(self, packet: PacketLike | Sequence[PacketLike]) -> pa.Table: -# """Convert packet to PyArrow Table with field metadata. - -# Args: -# packet: Dictionary mapping parameter names to Python values - -# Returns: -# PyArrow Table with the packet data as a single row -# """ -# # Convert packet to storage format -# if not isinstance(packet, Sequence): -# packets = [packet] -# else: -# packets = packet - -# storage_packets = [self._to_storage_packet(p) for p in packets] - -# # Create arrays -# arrays = [] -# for field in self.schema: -# values = [p[field.name] for p in storage_packets] -# array = pa.array(values, type=field.type) -# arrays.append(array) - -# return pa.Table.from_arrays(arrays, schema=self.schema) - -# def from_arrow_table( -# self, table: pa.Table, verify_semantic_equivalence: bool = True -# ) -> list[Packet]: -# """Convert Arrow table to packet with field metadata. - -# Args: -# table: PyArrow Table with metadata - -# Returns: -# List of packets converted from the Arrow table -# """ -# # Check for consistency in the semantic type mapping: -# semantic_type_info = get_metadata_from_schema(table.schema, b"semantic_type") - -# if semantic_type_info != self.semantic_type_lut: -# if not verify_semantic_equivalence: -# logger.warning( -# "Arrow table semantic types do not match expected type registry. " -# f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" -# ) -# else: -# raise ValueError( -# "Arrow table semantic types do not match expected type registry. 
" -# f"Expected: {self.semantic_type_lut}, got: {semantic_type_info}" -# ) - -# # Create packets from the Arrow table -# # TODO: make this more efficient -# storage_packets: list[Packet] = arrow_to_dicts(table) # type: ignore -# if not self.keys_with_handlers: -# # no special handling required -# return storage_packets - -# return [Packet(self._from_storage_packet(packet)) for packet in storage_packets] From b04fb61692d376044c8ec6887e90413f79ce7c64 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 10 Jul 2025 23:49:06 +0000 Subject: [PATCH 099/224] refactor: temporarily stop top level import while refactoring --- src/orcapod/__init__.py | 66 ++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index ad00035..01cd5db 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,36 +1,36 @@ -from .core import operators, sources, streams -from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator -from . import hashing, stores -from .core.operators import Join, MapPackets, MapTags, packet, tag -from .core.pod import FunctionPod, function_pod -from .core.sources import GlobSource -from .stores import DirDataStore, SafeDirDataStore -from .core.tracker import GraphTracker -from .pipeline import Pipeline +# from .core import operators, sources, streams +# from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator +# from . import hashing, stores +# from .core.operators import Join, MapPackets, MapTags, packet, tag +# from .core.pod import FunctionPod, function_pod +# from .core.sources import GlobSource +# from .stores import DirDataStore, SafeDirDataStore +# from .core.tracker import GraphTracker +# from .pipeline import Pipeline -DEFAULT_TRACKER = GraphTracker() -DEFAULT_TRACKER.activate() +# DEFAULT_TRACKER = GraphTracker() +# DEFAULT_TRACKER.activate() -__all__ = [ - "hashing", - "stores", - "pod", - "operators", - "streams", - "sources", - "MapTags", - "MapPackets", - "Join", - "tag", - "packet", - "FunctionPod", - "function_pod", - "GlobSource", - "DirDataStore", - "SafeDirDataStore", - "DEFAULT_TRACKER", - "SyncStreamFromLists", - "SyncStreamFromGenerator", - "Pipeline", -] +# __all__ = [ +# "hashing", +# "stores", +# "pod", +# "operators", +# "streams", +# "sources", +# "MapTags", +# "MapPackets", +# "Join", +# "tag", +# "packet", +# "FunctionPod", +# "function_pod", +# "GlobSource", +# "DirDataStore", +# "SafeDirDataStore", +# "DEFAULT_TRACKER", +# "SyncStreamFromLists", +# "SyncStreamFromGenerator", +# "Pipeline", +# ] From f47aa02bd888322524da000c7eac2af72d8198c9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 10 Jul 2025 23:50:42 +0000 Subject: [PATCH 100/224] refactor: remove protocol-relevant definitions --- src/orcapod/types/__init__.py | 10 ++++----- src/orcapod/types/schemas.py | 18 ++++++++++----- src/orcapod/types/typespec_utils.py | 35 ++++++++++++----------------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index 03a3b4b..179a253 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,10 +1,11 @@ -from .core import Tag, PathLike, PathSet, PodFunction, TypeSpec -from .packets import Packet, PacketLike +from .core import Tag, PathLike, PathSet, TypeSpec, DataValue, StoreValue from .semantic_type_registry import SemanticTypeRegistry from .semantic_type_handlers import PathHandler, UUIDHandler, DateTimeHandler from . import semantic_type_handlers from . import typespec_utils +Packet = dict[str, str] +PacketLike = Packet # Create default registry and register handlers default_registry = SemanticTypeRegistry() @@ -19,12 +20,11 @@ __all__ = [ "default_registry", "Tag", - "Packet", - "PacketLike", "TypeSpec", "PathLike", "PathSet", - "PodFunction", "semantic_type_handlers", "typespec_utils", + "DataValue", + "StoreValue", ] diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py index 35cc4f0..31e56d5 100644 --- a/src/orcapod/types/schemas.py +++ b/src/orcapod/types/schemas.py @@ -1,4 +1,4 @@ -from orcapod.types import TypeSpec +from orcapod.types.core import DataType, TypeSpec from orcapod.types.semantic_type_registry import SemanticTypeRegistry import pyarrow as pa import datetime @@ -38,7 +38,7 @@ def arrow_to_python_type(arrow_type: pa.DataType) -> type: raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") -class PythonSchema(dict[str, type]): +class PythonSchema(dict[str, DataType]): """ A schema for Python data types, mapping string keys to Python types. 
@@ -70,6 +70,9 @@ def with_source_info(self) -> dict[str, type]: """ return {**self, **{f"_source_info_{k}": str for k in self.keys()}} + def copy(self) -> "PythonSchema": + return PythonSchema(self) + class SemanticSchema(dict[str, tuple[type, str | None]]): """ @@ -299,11 +302,16 @@ def from_arrow_schema_to_semantic_schema( """ semantic_schema = {} for field in arrow_schema: - if field.metadata.get(b"field_type", b"") == b"source_info": + if field.name.startswith("_source_info_") or ( + field.metadata and field.metadata.get(b"field_type", b"") == b"source_info" + ): # Skip source info fields continue - semantic_type = field.metadata.get(b"semantic_type", None) - semantic_type = semantic_type.decode() if semantic_type else None + + semantic_type = None + if field.metadata is not None: + semantic_type = field.metadata.get(b"semantic_type", None) + semantic_type = semantic_type.decode() if semantic_type else None python_type = arrow_to_python_type(field.type) semantic_schema[field.name] = (python_type, semantic_type) return SemanticSchema(semantic_schema) diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index a0a3c58..71318aa 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -55,8 +55,8 @@ def check_typespec_compatibility( def extract_function_typespecs( func: Callable, output_keys: Collection[str], - input_types: TypeSpec | None = None, - output_types: TypeSpec | Sequence[type] | None = None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | Sequence[type] | None = None, ) -> tuple[TypeSpec, TypeSpec]: """ Extract input and output data types from a function signature. @@ -137,23 +137,23 @@ def extract_function_typespecs( {'count': , 'total': , 'repr': } """ verified_output_types: TypeSpec = {} - if output_types is not None: - if isinstance(output_types, dict): - verified_output_types = output_types - elif isinstance(output_types, Sequence): + if output_typespec is not None: + if isinstance(output_typespec, dict): + verified_output_types = output_typespec + elif isinstance(output_typespec, Sequence): # If output_types is a collection, convert it to a dict with keys from return_keys - if len(output_types) != len(output_keys): + if len(output_typespec) != len(output_keys): raise ValueError( - f"Output types collection length {len(output_types)} does not match return keys length {len(output_keys)}." + f"Output types collection length {len(output_typespec)} does not match return keys length {len(output_keys)}." 
) - verified_output_types = {k: v for k, v in zip(output_keys, output_types)} + verified_output_types = {k: v for k, v in zip(output_keys, output_typespec)} signature = inspect.signature(func) param_info: TypeSpec = {} for name, param in signature.parameters.items(): - if input_types and name in input_types: - param_info[name] = input_types[name] + if input_typespec and name in input_typespec: + param_info[name] = input_typespec[name] else: # check if the parameter has annotation if param.annotation is not inspect.Signature.empty: @@ -232,11 +232,7 @@ def get_compatible_type(type1: Any, type2: Any) -> Any: raise TypeError(f"Types {type1} and {type2} are not compatible") -def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | None: - if left is None: - return right - if right is None: - return left +def union_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: # Merge the two TypeSpecs but raise an error if conflicts in types are found merged = dict(left) for key, right_type in right.items(): @@ -248,15 +244,12 @@ def union_typespecs(left: TypeSpec | None, right: TypeSpec | None) -> TypeSpec | return merged -def intersection_typespecs( - left: TypeSpec | None, right: TypeSpec | None -) -> TypeSpec | None: +def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: """ Returns the intersection of two TypeSpecs, only returning keys that are present in both. If a key is present in both TypeSpecs, the type must be the same. """ - if left is None or right is None: - return None + # Find common keys and ensure types match common_keys = set(left.keys()).intersection(set(right.keys())) intersection = {} From 1b22ee6a12edfe22ee58dcf72a1f5a22f5c2ced3 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 10 Jul 2025 23:51:46 +0000 Subject: [PATCH 101/224] refactor: add concrete component implementation in data package --- src/orcapod/data/__init__.py | 0 src/orcapod/data/base.py | 149 +++++++++ src/orcapod/data/datagrams.py | 608 ++++++++++++++++++++++++++++++++++ src/orcapod/data/kernels.py | 104 ++++++ src/orcapod/data/operators.py | 156 +++++++++ src/orcapod/data/pods.py | 340 +++++++++++++++++++ src/orcapod/data/streams.py | 487 +++++++++++++++++++++++++++ src/orcapod/data/trackers.py | 150 +++++++++ 8 files changed, 1994 insertions(+) create mode 100644 src/orcapod/data/__init__.py create mode 100644 src/orcapod/data/base.py create mode 100644 src/orcapod/data/datagrams.py create mode 100644 src/orcapod/data/kernels.py create mode 100644 src/orcapod/data/operators.py create mode 100644 src/orcapod/data/pods.py create mode 100644 src/orcapod/data/streams.py create mode 100644 src/orcapod/data/trackers.py diff --git a/src/orcapod/data/__init__.py b/src/orcapod/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py new file mode 100644 index 0000000..5082c9b --- /dev/null +++ b/src/orcapod/data/base.py @@ -0,0 +1,149 @@ +from abc import ABC, abstractmethod +from typing import Any +from orcapod.protocols import hashing_protocols as hp +from orcapod.types import TypeSpec +from orcapod.hashing.defaults import get_default_object_hasher +import pyarrow as pa +import logging + + +logger = logging.getLogger(__name__) + + +class DatagramBase(ABC): + """ + Base class for data packets that can be processed in a pipeline. + This class provides a common interface for data packets, allowing them to be processed + and transformed in a consistent manner. 
+ """ + + @property + @abstractmethod + def typespec(self) -> TypeSpec: + """Return the type specification of the data packet.""" + pass + + @abstractmethod + def keys(self) -> tuple[str, ...]: + """Return the keys of the data packet.""" + pass + + @abstractmethod + def as_table(self) -> pa.Table: + """Convert the data packet to a PyArrow Table.""" + pass + + @abstractmethod + def as_dict(self) -> dict[str, Any]: + """Convert the data packet to a dictionary.""" + pass + + +class LabeledContentIdentifiableBase: + """ + Base class for content-identifiable objects. + This class provides a way to define objects that can be uniquely identified + based on their content rather than their identity in memory. Specifically, the identity of the + object is determined by the structure returned by the `identity_structure` method. + The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, + which defaults to the one returned by `get_default_object_hasher`. + Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. + """ + + def __init__( + self, + identity_structure_hasher: hp.ObjectHasher | None = None, + label: str | None = None, + ) -> None: + """ + Initialize the ContentHashable with an optional ObjectHasher. + + Args: + identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. + """ + self.identity_structure_hasher = ( + identity_structure_hasher or get_default_object_hasher() + ) + self._label = label + + @property + def has_assigned_label(self) -> bool: + """ + Check if the label is explicitly set for this object. + + Returns: + bool: True if the label is explicitly set, False otherwise. + """ + return self._label is not None + + @property + def label(self) -> str: + """ + Get the label of this object. + + Returns: + str | None: The label of the object, or None if not set. + """ + return self._label or self.computed_label() or self.__class__.__name__ + + @label.setter + def label(self, label: str | None) -> None: + """ + Set the label of this object. + + Args: + label (str | None): The label to set for this object. + """ + self._label = label + + def computed_label(self) -> str | None: + """ + Compute a label for this object based on its content. If label is not explicitly set for this object + and computed_label returns a valid value, it will be used as label of this object. + """ + return None + + def identity_structure(self) -> Any: + """ + Return a structure that represents the identity of this object. + + Override this method in your subclass to provide a stable representation + of your object's content. The structure should contain all fields that + determine the object's identity. + + Returns: + Any: A structure representing this object's content, or None to use default hash + """ + # TODO: come up with a way to signify non-determinate identity structure + return None + + def __hash__(self) -> int: + """ + Hash implementation that uses the identity structure if provided, + otherwise falls back to the superclass's hash method. 
+ + Returns: + int: A hash value based on either content or identity + """ + # Get the identity structure + structure = self.identity_structure() + if structure is None: + # If no identity structure is provided, use the default hash + return super().__hash__() + + return self.identity_structure_hasher.hash_to_int(structure) + + def __eq__(self, other: object) -> bool: + """ + Equality check that compares the identity structures of two objects. + + Args: + other (object): The object to compare against. + + Returns: + bool: True if both objects have the same identity structure, False otherwise. + """ + if not isinstance(other, LabeledContentIdentifiableBase): + return NotImplemented + + return self.identity_structure() == other.identity_structure() diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py new file mode 100644 index 0000000..c21c46b --- /dev/null +++ b/src/orcapod/data/datagrams.py @@ -0,0 +1,608 @@ +from orcapod.types.core import DataValue, StoreValue +from typing import TypeAlias, cast +from collections.abc import Callable, Mapping, Collection +from orcapod.types import TypeSpec, default_registry +from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types.core import TypeHandler +from orcapod.types import schemas +from orcapod.types.typespec_utils import get_typespec_from_dict +import pyarrow as pa + +from orcapod.hashing.defaults import get_default_arrow_hasher + + +# A conveniece packet-like type that defines a value that can be +# converted to a packet. It's broader than Packet and a simple mapping +# from string keys to DataValue (e.g., int, float, str) can be regarded +# as PacketLike, allowing for more flexible interfaces. +# Anything that requires Packet-like data but without the strict features +# of a Packet should accept PacketLike. +# One should be careful when using PacketLike as a return type as it does not +# enforce the typespec or source_info, which are important for packet integrity. +PacketLike: TypeAlias = Mapping[str, DataValue] + +SemanticStore: TypeAlias = Mapping[str, StoreValue] +PythonStore: TypeAlias = Mapping[str, DataValue] + + +def check_arrow_schema_compatibility( + incoming_schema: pa.Schema, current_schema: pa.Schema +) -> tuple[bool, list[str]]: + """ + Check if incoming schema is compatible with current schema. 
+ + Args: + incoming_schema: Schema to validate + current_schema: Expected schema to match against + + Returns: + Tuple of (is_compatible, list_of_errors) + """ + errors = [] + + # Create lookup dictionaries for efficient access + incoming_fields = {field.name: field for field in incoming_schema} + current_fields = {field.name: field for field in current_schema} + + # Check each field in current_schema + for field_name, current_field in current_fields.items(): + if field_name not in incoming_fields: + errors.append(f"Missing field '{field_name}' in incoming schema") + continue + + incoming_field = incoming_fields[field_name] + + # Check data type compatibility + if not current_field.type.equals(incoming_field.type): + errors.append( + f"Type mismatch for field '{field_name}': " + f"expected {current_field.type}, got {incoming_field.type}" + ) + + # Check semantic_type metadata if present in current schema + current_metadata = current_field.metadata or {} + incoming_metadata = incoming_field.metadata or {} + + if b"semantic_type" in current_metadata: + expected_semantic_type = current_metadata[b"semantic_type"] + + if b"semantic_type" not in incoming_metadata: + errors.append( + f"Missing 'semantic_type' metadata for field '{field_name}'" + ) + elif incoming_metadata[b"semantic_type"] != expected_semantic_type: + errors.append( + f"Semantic type mismatch for field '{field_name}': " + f"expected {expected_semantic_type.decode()}, " + f"got {incoming_metadata[b'semantic_type'].decode()}" + ) + elif b"semantic_type" in incoming_metadata: + errors.append( + f"Unexpected 'semantic_type' metadata for field '{field_name}': " + f"{incoming_metadata[b'semantic_type'].decode()}" + ) + + return len(errors) == 0, errors + + +class SemanticConverter: + @staticmethod + def prepare_handler( + semantic_schema: schemas.SemanticSchema, + semantic_type_registry: SemanticTypeRegistry, + ) -> dict[str, TypeHandler]: + handler_lut = {} + for key, (_, semantic_type) in semantic_schema.items(): + if semantic_type is None: + continue # Skip keys without semantic type + handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( + semantic_type + ) + return handler_lut + + @classmethod + def from_typespec( + cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry + ) -> "SemanticConverter": + semantic_schema = schemas.from_typespec_to_semantic_schema( + typespec, semantic_type_registry + ) + python_schema = schemas.PythonSchema(typespec) + handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) + return cls(python_schema, semantic_schema, handler_lut) + + @classmethod + def from_arrow_schema( + cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry + ) -> "SemanticConverter": + semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) + python_schema = schemas.from_semantic_schema_to_python_schema( + semantic_schema, semantic_type_registry=semantic_type_registry + ) + handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) + return cls(python_schema, semantic_schema, handler_lut) + + def __init__( + self, + python_schema: schemas.PythonSchema, + semantic_schema: schemas.SemanticSchema, + handler_lut: dict[str, TypeHandler] | None = None, + ): + self.python_schema = python_schema + self.semantic_schema = semantic_schema + self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( + semantic_schema, include_source_info=False + ) + if handler_lut is None: + handler_lut = {} + self.handler_lut = handler_lut + + 
def from_semantic_store_to_python_store( + self, semantic_store: SemanticStore + ) -> PythonStore: + python_store = dict(semantic_store) + for key, handler in self.handler_lut.items(): + python_store[key] = handler.storage_to_python(semantic_store[key]) + return python_store + + def from_python_store_to_semantic_store( + self, python_store: PythonStore + ) -> SemanticStore: + semantic_store = dict(python_store) + for key, handler in self.handler_lut.items(): + semantic_store[key] = handler.python_to_storage(python_store[key]) + return semantic_store # type: ignore[return-value] + + def from_semantic_store_to_arrow_table( + self, semantic_store: SemanticStore + ) -> pa.Table: + """Convert a semantic store to an Arrow table.""" + return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) + + def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: + """Convert a Python store to an Arrow table.""" + semantic_store = self.from_python_store_to_semantic_store(python_store) + return self.from_semantic_store_to_arrow_table(semantic_store) + + def from_arrow_table_to_semantic_stores( + self, arrow_table: pa.Table + ) -> list[SemanticStore]: + """Convert an Arrow table to a list of semantic stores.""" + self.verify_compatible_arrow_schema(arrow_table.schema) + return arrow_table.to_pylist() # Ensure the table is materialized + + def from_arrow_table_to_python_stores( + self, arrow_table: pa.Table + ) -> list[PythonStore]: + """Convert an Arrow table to a Python store.""" + return [ + self.from_semantic_store_to_python_store(semantic_store) + for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) + ] + + def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): + compatible, errors = check_arrow_schema_compatibility( + arrow_schema, self.arrow_schema + ) + if not compatible: + raise ValueError( + "Arrow table schema is not compatible with the expected schema: " + + ", ".join(errors) + ) + + +class PythonDictTag(dict[str, DataValue]): + def as_dict(self) -> dict[str, DataValue]: + return dict(self) + + def as_table(self) -> pa.Table: + return pa.Table.from_pylist([self]) + + @property + def typespec(self) -> schemas.PythonSchema: + # TODO: provide correct implementation + return schemas.PythonSchema({k: str for k in self.keys()}) + + +class ArrowTag: + def __init__(self, table: pa.Table) -> None: + self.table = table + if len(table) != 1: + raise ValueError( + "ArrowTag should only contain a single row, " + "as it represents a single tag." 
+            )
+        self._cached_python_schema: schemas.PythonSchema | None = None
+        self._cached_python_dict: dict[str, DataValue] | None = None
+
+    def keys(self) -> tuple[str, ...]:
+        return tuple(self.table.column_names)
+
+    @property
+    def typespec(self) -> schemas.PythonSchema:
+        if self._cached_python_schema is None:
+            self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema(
+                self.table.schema
+            ).storage_schema
+        return self._cached_python_schema.copy()
+
+    def as_dict(self) -> dict[str, DataValue]:
+        if self._cached_python_dict is None:
+            self._cached_python_dict = cast(
+                dict[str, DataValue], self.table.to_pylist()[0]
+            )
+        return self._cached_python_dict
+
+    def as_table(self) -> pa.Table:
+        return self.table
+
+    def clear_cache(self) -> None:
+        self._cached_python_schema = None
+        self._cached_python_dict = None
+
+    def __repr__(self) -> str:
+        return f"{self.as_dict()}"
+
+
+class PythonDictPacket(dict[str, DataValue]):
+    @classmethod
+    def create_from(
+        cls,
+        object: dp.Packet,
+        finger_print: str | None = None,
+        semantic_converter: SemanticConverter | None = None,
+        semantic_type_registry: SemanticTypeRegistry | None = None,
+        arrow_hasher: hp.ArrowHasher | None = None,
+        post_hash_callback: Callable[[str, str], None] | None = None,
+    ) -> "PythonDictPacket":
+        if isinstance(object, PythonDictPacket):
+            return object.copy()
+
+        new_packet = PythonDictPacket(
+            object.as_dict(include_source=False),
+            object.source_info(),
+            dict(object.typespec),
+            finger_print=finger_print,
+            semantic_converter=semantic_converter,
+            semantic_type_registry=semantic_type_registry,
+            arrow_hasher=arrow_hasher,
+            post_hash_callback=post_hash_callback,
+        )
+        return new_packet
+
+    def __init__(
+        self,
+        data: dict[str, DataValue],
+        source_info: dict[str, str | None] | None = None,
+        typespec: TypeSpec | None = None,
+        finger_print: str | None = None,
+        semantic_converter: SemanticConverter | None = None,
+        semantic_type_registry: SemanticTypeRegistry | None = None,
+        arrow_hasher: hp.ArrowHasher | None = None,
+        post_hash_callback: Callable[[str, str], None] | None = None,
+    ) -> None:
+        # extract source info entries carried in the data, then drop those keys
+        contained_source_info = {
+            k.removeprefix("_source_info_"): v
+            for k, v in data.items()
+            if k.startswith("_source_info_")
+        }
+        data = {k: v for k, v in data.items() if not k.startswith("_source_info_")}
+        super().__init__(data)
+
+        self._source_info = {**contained_source_info, **(source_info or {})}
+
+        verified_typespec = {}
+        if typespec is not None:
+            verified_typespec = dict(typespec)
+        inferred_typespec = get_typespec_from_dict(self)
+        for key in self:
+            if key not in verified_typespec:
+                verified_typespec[key] = inferred_typespec[key]
+        self._typespec = verified_typespec
+
+        self._python_schema = schemas.PythonSchema(self._typespec)
+
+        if semantic_converter is not None:
+            if semantic_converter.python_schema != self._python_schema.with_source_info:
+                raise ValueError(
+                    "Incompatible Python schema between packet and semantic converter: "
+                    + str(self._python_schema.with_source_info)
+                    + " vs "
+                    + str(semantic_converter.python_schema)
+                )
+        else:
+            semantic_converter = SemanticConverter.from_typespec(
+                self._python_schema.with_source_info,
+                semantic_type_registry or default_registry,
+            )
+        self.semantic_converter = semantic_converter
+
+        self._finger_print = finger_print
+        self._post_hash_callback = post_hash_callback
+        self._cached_table: pa.Table | None = None
+        self._cached_content_hash: str | None = None
+
+        if arrow_hasher is
None: + arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = arrow_hasher + + def as_table(self, include_source: bool = False) -> pa.Table: + """Convert the packet to an Arrow table.""" + if self._cached_table is None: + self._cached_table = ( + self.semantic_converter.from_python_store_to_arrow_table( + self.as_dict(include_source=True) + ) + ) + assert self._cached_table is not None, "Cached table should not be None" + if include_source: + return self._cached_table + else: + # drop source info columns if not needed + return self._cached_table.select(list(self.keys())) + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + dict_copy = self.copy() + if include_source: + for key, value in self.source_info().items(): + dict_copy[f"_source_info_{key}"] = value + return dict_copy + + def content_hash(self) -> str: + if self._cached_content_hash is None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self.as_table(include_source=False), prefix_hasher_id=True + ) + if self._post_hash_callback is not None and self._finger_print is not None: + self._post_hash_callback(self._finger_print, self._cached_content_hash) + return self._cached_content_hash + + @property + def typespec(self) -> schemas.PythonSchema: + return self._python_schema.copy() + + def source_info(self) -> dict[str, str | None]: + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> "PythonDictPacket": + """Return a shallow copy of the packet.""" + new_packet = PythonDictPacket(self, self.source_info()) + new_packet._finger_print = self._finger_print + new_packet._cached_table = self._cached_table + new_packet._cached_content_hash = self._cached_content_hash + new_packet._python_schema = self._python_schema.copy() + new_packet.semantic_converter = self.semantic_converter + new_packet.arrow_hasher = self.arrow_hasher + new_packet._post_hash_callback = self._post_hash_callback + return new_packet + + +def process_table_with_source_info( + table: pa.Table, source_info: dict[str, str | None] | None = None +) -> tuple[tuple[str, ...], pa.Table]: + """ + Process a table to ensure proper source_info columns. + + Args: + table: Input PyArrow table + source_info: optional dictionary mapping column names to source info values. If present, + it will take precedence over existing source_info columns in the table. 
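+            For example (illustrative), source_info={"a": "file:///data/a.csv"} sets
+            the "_source_info_a" column to that value for every row of the result.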
+ + Returns: + Processed table with source_info columns + """ + if source_info is None: + source_info = {} + + # Step 1: Separate source_info columns from regular columns + regular_columns = [] + regular_names = [] + existing_source_info = {} + + for i, name in enumerate(table.column_names): + if name.startswith("_source_info_"): + # Extract the base column name + base_name = name.removeprefix("_source_info_") + existing_source_info[base_name] = table.column(i) + else: + regular_columns.append(table.column(i)) + regular_names.append(name) + + # Step 2: Create source_info columns for each regular column + final_columns = [] + final_names = [] + + # Add all regular columns first + final_columns.extend(regular_columns) + final_names.extend(regular_names) + + # Create source_info columns for each regular column + num_rows = table.num_rows + + for col_name in regular_names: + source_info_col_name = f"_source_info_{col_name}" + + # if col_name is in source_info, use that value + if col_name in source_info: + # Use value from source_info dictionary + source_value = source_info[col_name] + source_values = pa.array([source_value] * num_rows, type=pa.large_string()) + # if col_name is in existing_source_info, use that column + elif col_name in existing_source_info: + # Use existing source_info column, but convert to large_string + existing_col = existing_source_info[col_name] + if existing_col.type == pa.large_string(): + source_values = existing_col + else: + # Convert to large_string + source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + + else: + # Use null values + source_values = pa.array([None] * num_rows, type=pa.large_string()) + + final_columns.append(source_values) + final_names.append(source_info_col_name) + + # Step 3: Create the final table + result: pa.Table = pa.Table.from_arrays(final_columns, names=final_names) + return tuple(regular_names), result + + +class ArrowPacket: + @classmethod + def create_from( + cls, + object: dp.Packet, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + finger_print: str | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + post_hash_callback: Callable[[str, str], None] | None = None, + ) -> "ArrowPacket": + if isinstance(object, ArrowPacket): + return object.copy() + + new_packet = ArrowPacket( + object.as_table(include_source=True), + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + finger_print=finger_print, + arrow_hasher=arrow_hasher, + post_hash_callback=post_hash_callback, + skip_source_info_extraction=True, + ) + return new_packet + + def __init__( + self, + table: pa.Table, + source_info: dict[str, str | None] | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + finger_print: str | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + post_hash_callback: Callable[[str, str], None] | None = None, + skip_source_info_extraction: bool = False, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowPacket should only contain a single row, " + "as it represents a single packet." + ) + if source_info is None: + source_info = {} + + if not skip_source_info_extraction: + # normalize the table to ensure it has the expected source_info columns + self._keys, self._arrow_table = process_table_with_source_info( + table, source_info + ) + else: + self._keys: tuple[str, ...] 
= tuple( + [c for c in table.column_names if not c.startswith("_source_info_")] + ) + for k in self._keys: + if f"_source_info_{k}" not in table.column_names: + raise ValueError( + f"Source info column '_source_info_{k}' is missing in the table." + ) + self._arrow_table = table + + self._finger_print = finger_print + self._post_hash_callback = post_hash_callback + + if semantic_converter is not None: + check_arrow_schema_compatibility( + semantic_converter.arrow_schema, self._arrow_table.schema + ) + else: + semantic_converter = SemanticConverter.from_arrow_schema( + self._arrow_table.schema, semantic_type_registry or default_registry + ) + self.semantic_converter = semantic_converter + + if arrow_hasher is None: + arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = arrow_hasher + + self._cached_python_packet: PythonStore | None = None + self._cached_content_hash: str | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_source_info: dict[str, str | None] | None = None + + def as_table(self, include_source: bool = False) -> pa.Table: + """Return the Arrow table representation of the packet.""" + base_table = self._arrow_table + if not include_source: + # Select only the keys that are not source info + base_table = base_table.select(self._keys) + return base_table + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + if self._cached_python_packet is None: + self._cached_python_packet = ( + self.semantic_converter.from_arrow_table_to_python_stores( + self._arrow_table + )[0] + ) + if include_source: + return dict(self._cached_python_packet) + + return {k: self._cached_python_packet[k] for k in self._keys} + + def content_hash(self) -> str: + if self._cached_content_hash is None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self._arrow_table, prefix_hasher_id=True + ) + if self._post_hash_callback is not None and self._finger_print is not None: + self._post_hash_callback(self._finger_print, self._cached_content_hash) + return self._cached_content_hash + + @property + def typespec(self) -> schemas.PythonSchema: + return self.semantic_converter.python_schema.copy() + + def keys(self) -> tuple[str, ...]: + """Return the keys of the packet.""" + return tuple(self._keys) + + def source_info(self) -> dict[str, str | None]: + if self._cached_source_info is None: + self._cached_source_info = { + k: self._arrow_table[f"_source_info_{k}"][0].as_py() for k in self._keys + } + return self._cached_source_info.copy() + + def copy(self) -> "ArrowPacket": + """Return a shallow copy of the packet.""" + new_packet = ArrowPacket( + self._arrow_table, + semantic_converter=self.semantic_converter, + finger_print=self._finger_print, + arrow_hasher=self.arrow_hasher, + post_hash_callback=self._post_hash_callback, + skip_source_info_extraction=True, + ) + new_packet._cached_content_hash = self._cached_content_hash + new_packet._cached_source_info = ( + self._cached_source_info.copy() + if self._cached_source_info is not None + else None + ) + new_packet._cached_python_packet = ( + dict(self._cached_python_packet) + if self._cached_python_packet is not None + else None + ) + return new_packet + + def __repr__(self) -> str: + return f"{self.as_dict(include_source=False)}" + + +# a batch is a tuple of a tag and a list of packets +Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py new file mode 100644 index 0000000..0695715 --- /dev/null +++ 
b/src/orcapod/data/kernels.py @@ -0,0 +1,104 @@ +from abc import ABC, abstractmethod +from typing import Any +from orcapod.protocols import data_protocols as dp +import logging +from orcapod.data.streams import KernelStream +from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER +from orcapod.types import TypeSpec + +logger = logging.getLogger(__name__) + + +def get_tracker_manager() -> dp.TrackerManager: ... + + +class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): + """ + Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. + It is the base class for all computations and transformations that can be performed on a collection of streams + (including an empty collection). + A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input + and returns a new stream as output (note that output stream is always singular). + Each "invocation" of the kernel on a collection of streams is assigned a unique ID. + The corresponding invocation information is stored as Invocation object and attached to the output stream + for computational graph tracking. + """ + + def __init__( + self, + label: str | None = None, + skip_tracking: bool = False, + tracker_manager: dp.TrackerManager | None = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self._label = label + self._skip_tracking = skip_tracking + self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + + def __call__( + self, *streams: dp.Stream, label: str | None = None, **kwargs + ) -> dp.Stream: + output_stream = self.forward(*streams, **kwargs) + + kernel_stream: dp.Stream + if output_stream.source is not None: + kernel_stream = KernelStream(output_stream, label=label) + else: + logger.warning( + "Output stream does not have a source. " + "This may lead to unexpected behavior when tracking the kernel invocation." + ) + kernel_stream = KernelStream(source=self, upstreams=streams, label=label) + + # TODO: consider the logic around tracker manager more carefully + if not self._skip_tracking and self._tracker_manager is not None: + # register the invocation to all active trackers + active_trackers = self._tracker_manager.get_active_trackers() + for tracker in active_trackers: + tracker.record(kernel_stream) + + return kernel_stream + + @abstractmethod + def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + + @abstractmethod + def validate_inputs(self, *streams: dp.Stream) -> None: ... + + @abstractmethod + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Trigger the main computation of the kernel on a collection of streams. + This method is called when the kernel is invoked with a collection of streams. + Subclasses should override this method to provide the kernel with its unique behavior + """ + + def __repr__(self): + return self.__class__.__name__ + + def __str__(self): + if self._label is not None: + return f"{self.__class__.__name__}({self._label})" + return self.__class__.__name__ + + def identity_structure(self, *streams: dp.Stream) -> Any: + # Default implementation of identity_structure for the kernel only + # concerns the kernel class and the streams if present. Subclasses of + # Kernels should override this method to provide a more meaningful + # representation of the kernel. 
Note that kernel must provide the notion + # of identity under possibly two distinct contexts: + # 1) identity of the kernel in itself when invoked without any stream + # 2) identity of the specific invocation of the kernel with a collection of streams + # While the latter technically corresponds to the identity of the invocation and not + # the kernel, only kernel can provide meaningful information as to the uniqueness of + # the invocation as only kernel would know if / how the input stream(s) alter the identity + # of the invocation. For example, if the kernel corresponds to an commutative computation + # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the + # equivalence of the two by returning the same identity structure for both invocations. + # This can be achieved, for example, by returning a set over the streams instead of a tuple. + logger.warning( + f"Identity structure not implemented for {self.__class__.__name__}" + ) + return (self.__class__.__name__,) + streams diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py new file mode 100644 index 0000000..15d255a --- /dev/null +++ b/src/orcapod/data/operators.py @@ -0,0 +1,156 @@ +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp +from orcapod.data.streams import ImmutableTableStream +from orcapod.types import TypeSpec +from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs +from abc import abstractmethod +from typing import Any + + +class InputValidationError(Exception): + """ + Exception raised when the inputs are not valid. + This is used to indicate that the inputs do not meet the requirements of the operator. + """ + + +class BinaryOperator(TrackedKernelBase): + """ + Base class for all operators. + """ + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.check_binary_inputs(*streams) + left_stream, right_stream = streams + return self.op_validate_inputs(left_stream, right_stream) + + @abstractmethod + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... + + def check_binary_inputs( + self, *streams: dp.Stream, allow_zero: bool = False + ) -> None: + """ + Check that the inputs to the binary operator are valid. + This method is called before the forward method to ensure that the inputs are valid. + """ + if not (allow_zero and len(streams) == 0) and len(streams) != 2: + raise ValueError("BinaryOperator requires exactly two input streams.") + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for binary operators. + It expects exactly two streams as input. + """ + self.check_binary_inputs(*streams) + left_stream, right_stream = streams + return self.op_forward(left_stream, right_stream) + + def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + self.check_binary_inputs(*streams) + left_stream, right_stream = streams + return self.op_types(left_stream, right_stream) + + def identity_structure(self, *streams: dp.Stream) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. 
+ """ + self.check_binary_inputs(*streams, allow_zero=True) + return self.op_identity_structure(*streams) + + @abstractmethod + def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the binary operator. + It takes two streams as input and returns a new stream as output. + """ + ... + + @abstractmethod + def op_types( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes two streams as input and returns a tuple of typespecs. + """ + ... + + @abstractmethod + def op_identity_structure(self, *streams: dp.Stream) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes two streams as input and returns a tuple containing the operator name and a set of streams. + """ + ... + + +class Join(BinaryOperator): + def op_identity_structure(self, *streams: dp.Stream) -> Any: + # Join does not depend on the order of the streams -- convert it onto a set + id_struct = (self.__class__.__name__,) + if len(streams) == 2: + id_struct += (set(streams),) + return id_struct + + def op_forward( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> ImmutableTableStream: + """ + Joins two streams together based on their tags. + The resulting stream will contain all the tags from both streams. + """ + + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + + common_tag_keys = tuple( + intersection_typespecs(left_tag_typespec, right_tag_typespec).keys() + ) + joined_tag_keys = tuple( + union_typespecs(left_tag_typespec, right_tag_typespec).keys() + ) + + # performing a check to ensure that packets are compatible + union_typespecs(left_packet_typespec, right_packet_typespec) + + joined_table = left_stream.as_table().join( + right_stream.as_table(), + keys=common_tag_keys, + join_type="inner", + ) + + return ImmutableTableStream( + joined_table, + tag_columns=tuple(joined_tag_keys), + source=self, + upstreams=(left_stream, right_stream), + ) + + def op_types(self, left_stream, right_stream) -> tuple[TypeSpec, TypeSpec]: + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) + joined_packet_typespec = union_typespecs( + left_packet_typespec, right_packet_typespec + ) + return joined_tag_typespec, joined_packet_typespec + + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + try: + self.op_types(left_stream, right_stream) + except Exception as e: + raise InputValidationError(f"Input streams are not compatible: {e}") + + def __repr__(self) -> str: + return "Join()" diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py new file mode 100644 index 0000000..256c34b --- /dev/null +++ b/src/orcapod/data/pods.py @@ -0,0 +1,340 @@ +from orcapod.data.datagrams import PythonDictPacket +from orcapod.data.streams import PodStream +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.types import SemanticTypeRegistry, default_registry +from orcapod.types import typespec_utils as tsutils +from abc 
import abstractmethod + +import logging +import sys +from collections.abc import Callable, Collection, Iterable, Sequence +from typing import Any, Literal, cast + + +from orcapod.types.typespec_utils import ( + extract_function_typespecs, + check_typespec_compatibility, +) +from orcapod.types import TypeSpec + +from orcapod.hashing.legacy_core import get_function_signature +from orcapod.data.operators import Join + + +logger = logging.getLogger(__name__) + +error_handling_options = Literal["raise", "ignore", "warn"] + + +class PodBase(TrackedKernelBase): + """ + FunctionPod is a specialized kernel that encapsulates a function to be executed on data streams. + It allows for the execution of a function with a specific label and can be tracked by the system. + """ + + def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Return the input and output typespecs for the pod. + This is used to validate the input and output streams. + """ + input_stream = self.process_and_verify_streams(*streams) + tag_typespec, _ = input_stream.types() + return tag_typespec, self.output_typespec + + @property + @abstractmethod + def input_typespec(self) -> TypeSpec: + """ + Return the input typespec for the pod. This is used to validate the input streams. + """ + ... + + @property + @abstractmethod + def output_typespec(self) -> TypeSpec: + """ + Return the output typespec for the pod. This is used to validate the output streams. + """ + ... + + @abstractmethod + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[dp.Tag, dp.Packet | None]: ... + + def __init__( + self, + error_handling: error_handling_options = "raise", + label: str | None = None, + **kwargs, + ) -> None: + super().__init__(label=label, **kwargs) + self._active = True + self.error_handling = error_handling + + def is_active(self) -> bool: + """ + Check if the pod is active. If not, it will not process any packets. + """ + return self._active + + def set_active(self, active: bool) -> None: + """ + Set the active state of the pod. If set to False, the pod will not process any packets. + """ + self._active = active + + def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: + """ + Prepare the incoming streams for execution in the pod. This default implementation + joins all the input streams together. 
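+        The packet typespec of the joined stream is then checked against the pod's
+        ``input_typespec``; a ValueError is raised if they are not compatible.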
+ """ + # if multiple streams are provided, join them + # otherwise, return as is + combined_streams = list(streams) + if len(streams) > 1: + stream = streams[0] + for next_stream in streams[1:]: + stream = Join()(stream, next_stream) + combined_streams = [stream] + input_stream = combined_streams[0] + _, input_typespec = input_stream.types() + if not tsutils.check_typespec_compatibility( + input_typespec, self.input_typespec + ): + raise ValueError( + f"Input typespec {input_typespec} is not compatible with expected input typespec {self.input_typespec}" + ) + return input_stream + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.process_and_verify_streams(*streams) + + def forward(self, *streams: dp.Stream) -> PodStream: + input_stream = self.process_and_verify_streams(*streams) + # at this point, streams should have been joined into one + + return PodStream( + self, + input_stream, + error_handling=cast(error_handling_options, self.error_handling), + ) + + +def function_pod( + output_keys: str | Collection[str] | None = None, + function_name: str | None = None, + label: str | None = None, + **kwargs, +) -> Callable[..., "FunctionPod"]: + """ + Decorator that wraps a function in a FunctionPod instance. + + Args: + output_keys: Keys for the function output(s) + function_name: Name of the function pod; if None, defaults to the function name + **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details. + + Returns: + FunctionPod instance wrapping the decorated function + """ + + def decorator(func) -> FunctionPod: + if func.__name__ == "": + raise ValueError("Lambda functions cannot be used with function_pod") + + if not hasattr(func, "__module__") or func.__module__ is None: + raise ValueError( + f"Function {func.__name__} must be defined at module level" + ) + + # Store the original function in the module for pickling purposes + # and make sure to change the name of the function + module = sys.modules[func.__module__] + base_function_name = func.__name__ + new_function_name = f"_original_{func.__name__}" + setattr(module, new_function_name, func) + # rename the function to be consistent and make it pickleable + setattr(func, "__name__", new_function_name) + setattr(func, "__qualname__", new_function_name) + + # Create a simple typed function pod + pod = FunctionPod( + function=func, + output_keys=output_keys, + function_name=function_name or base_function_name, + label=label, + **kwargs, + ) + return pod + + return decorator + + +class FunctionPod(PodBase): + def __init__( + self, + function: dp.PodFunction, + output_keys: str | Collection[str] | None = None, + function_name=None, + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | Sequence[type] | None = None, + label: str | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + function_info_extractor: hp.FunctionInfoExtractor | None = None, + **kwargs, + ) -> None: + self.function = function + if output_keys is None: + output_keys = [] + if isinstance(output_keys, str): + output_keys = [output_keys] + self.output_keys = output_keys + if function_name is None: + if hasattr(self.function, "__name__"): + function_name = getattr(self.function, "__name__") + else: + raise ValueError( + "function_name must be provided if function has no __name__ attribute" + ) + self.function_name = function_name + super().__init__(label=label or self.function_name, **kwargs) + + if semantic_type_registry is None: + # TODO: 
reconsider the use of default registry here + semantic_type_registry = default_registry + + self.semantic_type_registry = semantic_type_registry + self.function_info_extractor = function_info_extractor + + # extract input and output types from the function signature + self._input_typespec, self._output_typespec = extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_typespec, + output_typespec=output_typespec, + ) + + @property + def input_typespec(self) -> TypeSpec: + """ + Return the input typespec for the function pod. + This is used to validate the input streams. + """ + return self._input_typespec + + @property + def output_typespec(self) -> TypeSpec: + """ + Return the output typespec for the function pod. + This is used to validate the output streams. + """ + return self._output_typespec + + def __repr__(self) -> str: + return f"FunctionPod:{self.function!r}" + + def __str__(self) -> str: + include_module = self.function.__module__ != "__main__" + func_sig = get_function_signature( + self.function, + name_override=self.function_name, + include_module=include_module, + ) + return f"FunctionPod:{func_sig}" + + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[dp.Tag, PythonDictPacket | None]: + if not self.is_active(): + logger.info( + f"Pod is not active: skipping computation on input packet {packet}" + ) + return tag, None + output_values = [] + + values = self.function(**packet.as_dict(include_source=False)) + + if len(self.output_keys) == 0: + output_values = [] + elif len(self.output_keys) == 1: + output_values = [values] # type: ignore + elif isinstance(values, Iterable): + output_values = list(values) # type: ignore + elif len(self.output_keys) > 1: + raise ValueError( + "Values returned by function must be a pathlike or a sequence of pathlikes" + ) + + if len(output_values) != len(self.output_keys): + raise ValueError( + f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" + ) + + # TODO: add source info based on this function call + output_packet = PythonDictPacket( + {k: v for k, v in zip(self.output_keys, output_values)} + ) + return tag, output_packet + + def identity_structure(self, *streams: dp.Stream) -> Any: + # construct identity structure for the function + # if function_info_extractor is available, use that but substitute the function_name + if self.function_info_extractor is not None: + function_info = self.function_info_extractor.extract_function_info( + self.function, + function_name=self.function_name, + input_typespec=self.input_typespec, + output_typespec=self.output_typespec, + ) + else: + # use basic information only + function_info = { + "name": self.function_name, + "input_typespec": self.input_typespec, + "output_typespec": self.output_typespec, + } + function_info["output_keys"] = tuple(self.output_keys) + + return ( + self.__class__.__name__, + function_info, + ) + streams + + +class StoredPod(PodBase): + def __init__(self, pod: dp.Pod, label: str | None = None, **kwargs) -> None: + super().__init__(**kwargs) + self.pod = pod + + def computed_label(self) -> str | None: + return self.pod.label + + @property + def input_typespec(self) -> TypeSpec: + """ + Return the input typespec for the stored pod. + This is used to validate the input streams. + """ + return self.pod.input_typespec + + @property + def output_typespec(self) -> TypeSpec: + """ + Return the output typespec for the stored pod. 
+ This is used to validate the output streams. + """ + return self.pod.output_typespec + + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: + return self.pod.call(tag, packet) + + def identity_structure(self, *streams: dp.Stream) -> Any: + return self.pod.identity_structure(*streams) + + def __repr__(self) -> str: + return f"StoredPod({self.pod!r})" + + def __str__(self) -> str: + return f"StoredPod:{self.pod!s}" diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py new file mode 100644 index 0000000..9389949 --- /dev/null +++ b/src/orcapod/data/streams.py @@ -0,0 +1,487 @@ +from orcapod.protocols import data_protocols as dp +from orcapod.types import SemanticTypeRegistry, default_registry, schemas, TypeSpec +from orcapod.data.datagrams import ArrowPacket, ArrowTag, SemanticConverter +from orcapod.data.base import LabeledContentIdentifiableBase +import pyarrow as pa +from collections.abc import Iterator, Collection +from abc import ABC, abstractmethod +from datetime import timezone, datetime +from typing import Any, Literal +import logging +import warnings + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class StreamBase(ABC, LabeledContentIdentifiableBase): + """ + A stream is a collection of tagged-packets that are generated by an operation. + The stream is iterable and can be used to access the packets in the stream. + + A stream has property `invocation` that is an instance of Invocation that generated the stream. + This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). + """ + + def __init__( + self, + source: dp.Kernel | None = None, + upstreams: tuple[dp.Stream, ...] = (), + **kwargs, + ) -> None: + super().__init__(**kwargs) + self._source = source + self._upstreams = upstreams + self._last_modified: datetime | None = None + self._update_modified_time() + + @abstractmethod + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... + + @abstractmethod + def types(self) -> tuple[TypeSpec, TypeSpec]: ... + + @abstractmethod + def as_table(self) -> pa.Table: ... + + @abstractmethod + def iter_packets( + self, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... + + def _update_modified_time( + self, timestamp: datetime | None = None, invalidate: bool = False + ) -> None: + if invalidate: + self._last_modified = None + return + + if timestamp is not None: + self._last_modified = timestamp + else: + self._last_modified = datetime.now(timezone.utc) + + @property + def last_modified(self) -> datetime | None: + """ + Returns when the stream's content was last modified. + This is used to track the time when the stream was last accessed. + Returns None if the stream has not been accessed yet. + """ + return self._last_modified + + @property + def is_current(self) -> bool: + """ + Returns whether the stream is current. + A stream is current if the content is up-to-date with respect to its source. + This can be used to determine if a stream with non-None last_modified is up-to-date. + Note that for asynchronous streams, this status is not applicable and always returns False. 
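+        Concretely, a stream is treated as current only if all of its upstreams are
+        current and none of them was modified after this stream's last_modified time.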
+ """ + if self.last_modified is None: + # If there is no last_modified timestamp, we cannot determine if the stream is current + return False + + for upstream in self.upstreams: + if ( + not upstream.is_current + or upstream.last_modified is None + or upstream.last_modified > self.last_modified + ): + return False + return True + + @property + def source(self) -> dp.Kernel | None: + """ + The source of the stream, which is the kernel that generated the stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._source + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + """ + The upstream streams that are used to generate this stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._upstreams + + def computed_label(self) -> str | None: + if self.source is not None: + # use the invocation operation label + return self.source.label + return None + + def __iter__( + self, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self.iter_packets() + + def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + """ + Flow everything through the stream, returning the entire collection of + (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. + """ + return [e for e in self] + + # --------------------- Recursive methods --------------------------- + # These methods form a step in the multi-class recursive invocation that follows the pattern of + # Stream -> Invocation -> Kernel -> Stream ... -> Invocation -> Kernel + # Most of the method logic would be found in Kernel's implementation of the method with + # Stream and Invocation simply serving as recursive steps + + def identity_structure(self) -> Any: + """ + Identity structure of a stream is deferred to the identity structure + of the associated invocation, if present. + A bare stream without invocation has no well-defined identity structure. + Specialized stream subclasses should override this method to provide more meaningful identity structure + """ + if self.source is not None: + # if the stream is generated by an operation, use the identity structure from the invocation + return self.source.identity_structure(*self.upstreams) + return super().identity_structure() + + +class KernelStream(StreamBase): + """ + Recomputable stream that wraps a streams produced by a kernel to provide + an abstraction over the stream, taking the stream's source and upstreams as the basis of + recomputing the stream. + + This stream is used to represent the output of a kernel invocation. + """ + + def __init__( + self, + output_stream: dp.Stream | None = None, + source: dp.Kernel | None = None, + upstreams: tuple[ + dp.Stream, ... + ] = (), # if provided, this will override the upstreams of the output_stream + **kwargs, + ) -> None: + if (output_stream is None or output_stream.source is None) and source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + if source is None: + if output_stream is None or output_stream.source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." 
+ ) + source = output_stream.source + upstreams = upstreams or output_stream.upstreams + + super().__init__(source=source, upstreams=upstreams, **kwargs) + self._cached_stream = output_stream + + def clear_cache(self) -> None: + """ + Clears the cached stream. + This is useful for re-processing the stream with the same kernel. + """ + self._cached_stream = None + self._update_modified_time(invalidate=True) + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + self.update_stream() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + self.update_stream() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.types() + + @property + def is_current(self) -> bool: + if self._cached_stream is None or not super().is_current: + status = self.update_stream() + if not status: # if it failed to update for whatever reason + return False + return True + + def update_stream(self, force: bool = False) -> bool: + updated = False + if force or (self._cached_stream is not None and not super().is_current): + self.clear_cache() + + if self._cached_stream is None: + assert self.source is not None, ( + "Stream source must be set to recompute the stream." + ) + self._cached_stream = self.source.forward(*self.upstreams) + self._update_modified_time() + updated = True + + if self._cached_stream is None: + # TODO: use beter error type + raise ValueError( + "Stream could not be updated. Ensure that the source is valid and upstreams are correct." + ) + + return updated + + @property + def last_modified(self) -> datetime | None: + if self._cached_stream is None: + return None + return self._cached_stream.last_modified + + def as_table(self) -> pa.Table: + self.update_stream() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.as_table() + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self.update_stream() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + yield from self._cached_stream.iter_packets() + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" + + +class ImmutableTableStream(StreamBase): + """ + An immutable stream based on a PyArrow Table. + This stream is designed to be used with data that is already in a tabular format, + such as data loaded from a file or database. The columns to be treated as tags are + specified at initialization, and the rest of the columns are treated as packets. + The stream is immutable, meaning that once it is created, it cannot be modified. + This is useful for ensuring that the data in the stream remains consistent and unchanging. + + The types of the tag and packet columns are inferred from the PyArrow Table schema. + """ + + def __init__( + self, + table: pa.Table, + tag_columns: Collection[str] = (), + source: dp.Kernel | None = None, + upstreams: tuple[dp.Stream, ...] 
= (), + semantic_type_registry: SemanticTypeRegistry | None = None, + **kwargs, + ) -> None: + super().__init__(source=source, upstreams=upstreams, **kwargs) + + self._table = table + + self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) + self._packet_columns = tuple( + c for c in table.column_names if c not in tag_columns + ) + + semantic_type_registry = semantic_type_registry or default_registry + tag_schema = pa.schema( + f for f in self._table.schema if f.name in self._tag_columns + ) + packet_schema = pa.schema( + f for f in self._table.schema if f.name in self._packet_columns + ) + self._tag_converter = SemanticConverter.from_arrow_schema( + tag_schema, semantic_type_registry + ) + self._packet_converter = SemanticConverter.from_arrow_schema( + packet_schema, semantic_type_registry + ) + + self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None + self._update_modified_time() # set modified time to now + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + return self._tag_columns, self._packet_columns + + def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + # TODO: consider using MappingProxyType to avoid copying the dicts + return ( + self._tag_converter.python_schema.copy(), + self._packet_converter.python_schema.copy(), + ) + + def as_table(self) -> pa.Table: + """ + Returns the underlying table representation of the stream. + This is useful for converting the stream to a table format. + """ + return self._table + + def clear_cache(self) -> None: + """ + Resets the cached elements of the stream. + This is useful for re-iterating over the stream. + """ + self._cached_elements = None + + def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: + """ + Iterates over the packets in the stream. + Each packet is represented as a tuple of (Tag, Packet). 
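+        Pairs are materialized lazily from the underlying table and cached on first
+        iteration; call clear_cache() to force re-materialization.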
+ """ + if self._cached_elements is None: + self._cached_elements = [] + tags = self._table.select(self._tag_columns) + packets = self._table.select(self._packet_columns) + for tag_batch, packet_batch in zip(tags.to_batches(), packets.to_batches()): + for i in range(len(tag_batch)): + self._cached_elements.append( + ( + ArrowTag(tag_batch.slice(i, 1)), + ArrowPacket(packet_batch.slice(i, 1)), + ) + ) + yield from self._cached_elements + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(table={self._table.column_names}, " + f"tag_columns={self._tag_columns})" + ) + + +class PodStream(StreamBase): + def __init__( + self, + pod: dp.Pod, + input_stream: dp.Stream, + error_handling: Literal["raise", "ignore", "warn"] = "raise", + **kwargs, + ) -> None: + super().__init__(upstreams=(input_stream,), **kwargs) + self.pod = pod + self.input_stream = input_stream + self.error_handling = error_handling + self._source = pod + + # Cache for processed packets + # This is a dictionary mapping the index of the packet in the input stream to a tuple of (Tag, Packet) + # This allows us to efficiently access the processed packets without re-processing them + self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet]] = {} + self._computation_complete: bool = False + self._cached_output_table: pa.Table | None = None + + @property + def source(self) -> dp.Pod | None: + """ + The source of the stream, which is the pod that generated the stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._source + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + tag_keys, _ = self.input_stream.keys() + packet_keys = tuple(self.pod.output_typespec.keys()) + return tag_keys, packet_keys + + def types(self) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, _ = self.input_stream.types() + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_typespec) + return tag_typespec, packet_typespec + + def clear_cache(self) -> None: + """ + Clears the cached results of the processed stream. + This is useful for re-processing the stream with the same processor. + """ + self._cached_output_packets = {} + self._computation_complete = False + self._cached_output_table = None + + def validate_cache(self) -> None: + if not self.is_current: + self.clear_cache() + self._update_modified_time(invalidate=True) + + def as_table(self) -> pa.Table: + self.validate_cache() + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + for tag, packet in self.iter_packets(): + # TODO: evaluate handling efficiency here + all_tags.append(tag.as_dict()) + all_packets.append(packet.as_dict()) + all_tags: pa.Table = pa.Table.from_pylist(all_tags) + all_packets: pa.Table = pa.Table.from_pylist(all_packets) + # assert that column names do not overlap + overlapping_columns = set(all_tags.column_names) & set( + all_packets.column_names + ) + if overlapping_columns: + raise ValueError( + f"Column names overlap between tags and packets: {overlapping_columns}. Overlapping tag and packet columns are not supported yet." 
+                )
+            self._cached_output_table = pa.Table.from_arrays(
+                all_tags.columns + all_packets.columns,
+                names=all_tags.column_names + all_packets.column_names,
+            )
+
+        return self._cached_output_table
+
+    def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]:
+        self.validate_cache()
+        if not self._computation_complete or self._cached_output_packets is None:
+            for i, (tag, packet) in enumerate(self.input_stream.iter_packets()):
+                if i not in self._cached_output_packets:
+                    try:
+                        processed_tag, processed_packet = self.pod.call(tag, packet)
+                    except Exception as e:
+                        logger.error(f"Error processing packet {packet}: {e}")
+                        if self.error_handling == "raise":
+                            raise e
+                        elif self.error_handling == "warn":
+                            warnings.warn(f"Error processing packet {packet}: {e}")
+                            continue
+                        elif self.error_handling == "ignore":
+                            continue
+                        else:
+                            raise ValueError(
+                                f"Unknown error handling mode: {self.error_handling} encountered while handling error: {e}"
+                            ) from e
+                    if processed_packet is None:
+                        # call returning None means the packet should be skipped
+                        logger.debug(
+                            f"Packet {packet} with tag {tag} was processed but returned None, skipping."
+                        )
+                        continue
+                    self._cached_output_packets[i] = (processed_tag, processed_packet)
+                yield self._cached_output_packets[i]
+            self._computation_complete = True
+            self._update_modified_time()
+
+        else:
+            # cached indices may have gaps from skipped packets, so yield the values
+            yield from self._cached_output_packets.values()
diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py
new file mode 100644
index 0000000..fab481a
--- /dev/null
+++ b/src/orcapod/data/trackers.py
@@ -0,0 +1,150 @@
+from orcapod.protocols import data_protocols as dp
+from collections import defaultdict
+from abc import ABC, abstractmethod
+
+
+class BasicTrackerManager:
+    def __init__(self) -> None:
+        self._active_trackers: list[dp.Tracker] = []
+
+    def register_tracker(self, tracker: dp.Tracker) -> None:
+        """
+        Register a new tracker in the system.
+        This is used to add a new tracker to the list of active trackers.
+        """
+        if tracker not in self._active_trackers:
+            self._active_trackers.append(tracker)
+
+    def deregister_tracker(self, tracker: dp.Tracker) -> None:
+        """
+        Remove a tracker from the system.
+        This is used to deactivate a tracker and remove it from the list of active trackers.
+        """
+        if tracker in self._active_trackers:
+            self._active_trackers.remove(tracker)
+
+    def get_active_trackers(self) -> list[dp.Tracker]:
+        """
+        Get the list of active trackers.
+        This is used to retrieve the currently active trackers in the system.
+        """
+        return [t for t in self._active_trackers if t.is_active()]
+
+    def record(self, stream: dp.Stream) -> None:
+        """
+        Record the output stream of a kernel invocation in the tracker.
+        This is used to track the computational graph and the invocations of kernels.
+        """
+        for tracker in self.get_active_trackers():
+            tracker.record(stream)
+
+
+class AutoRegisteringContextBasedTracker(ABC):
+    def __init__(self, tracker_manager: dp.TrackerManager | None = None) -> None:
+        self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER
+        self._active = False
+
+    def set_active(self, active: bool = True) -> None:
+        if active:
+            self._tracker_manager.register_tracker(self)
+        else:
+            self._tracker_manager.deregister_tracker(self)
+        self._active = active
+
+    def is_active(self) -> bool:
+        return self._active
+
+    @abstractmethod
+    def record(self, stream: dp.Stream) -> None: ...
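+
+    # Illustrative usage sketch (not part of the API surface): concrete trackers
+    # such as GraphTracker below are intended to be used as context managers, so
+    # that recording is active only inside the block, e.g.
+    #
+    #     with GraphTracker() as tracker:
+    #         output_stream = my_kernel(input_stream)  # my_kernel is a placeholder
+    #     graph = tracker.generate_graph()
+    #
+    # __enter__/__exit__ below simply toggle set_active(), which registers or
+    # deregisters the tracker with its tracker manager.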
+ + def __enter__(self): + self.set_active(True) + return self + + def __exit__(self, exc_type, exc_val, ext_tb): + self.set_active(False) + + +class GraphTracker(AutoRegisteringContextBasedTracker): + """ + A tracker that records the invocations of operations and generates a graph + of the invocations and their dependencies. + """ + + # Thread-local storage to track active trackers + + def __init__(self, tracker_manager: dp.TrackerManager | None = None) -> None: + super().__init__(tracker_manager=tracker_manager) + self.kernel_to_invoked_stream_lut: dict[dp.Kernel, list[dp.Stream]] = ( + defaultdict(list) + ) + + def record(self, stream: dp.Stream) -> None: + assert stream.source is not None, ( + "Stream must have a source kernel when recording." + ) + stream_list = self.kernel_to_invoked_stream_lut[stream.source] + if stream not in stream_list: + stream_list.append(stream) + + def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: + """ + Reset the tracker and return the recorded invocations. + """ + recorded_streams = self.kernel_to_invoked_stream_lut + self.kernel_to_invoked_stream_lut = defaultdict(list) + return recorded_streams + + def generate_graph(self): + import networkx as nx + + G = nx.DiGraph() + + # Add edges for each invocation + for _, streams in self.kernel_to_invoked_stream_lut.items(): + for stream in streams: + if stream not in G: + G.add_node(stream) + for upstream in stream.upstreams: + G.add_edge(upstream, stream) + return G + + # def generate_namemap(self) -> dict[Invocation, str]: + # namemap = {} + # for kernel, invocations in self.invocation_lut.items(): + # # if only one entry present, use the kernel name alone + # if kernel.label is not None: + # node_label = kernel.label + # else: + # node_label = str(kernel) + # if len(invocations) == 1: + # namemap[invocations[0]] = node_label + # continue + # # if multiple entries, use the kernel name and index + # for idx, invocation in enumerate(invocations): + # namemap[invocation] = f"{node_label}_{idx}" + # return namemap + + # def draw_graph(self): + # import networkx as nx + # import matplotlib.pyplot as plt + + # G = self.generate_graph() + # labels = self.generate_namemap() + + # pos = nx.drawing.nx_agraph.graphviz_layout(G, prog="dot") + # nx.draw( + # G, + # pos, + # labels=labels, + # node_size=2000, + # node_color="lightblue", + # with_labels=True, + # font_size=10, + # font_weight="bold", + # arrowsize=20, + # ) + # plt.tight_layout() + + +DEFAULT_TRACKER_MANAGER = BasicTrackerManager() From e58de64c98c08fd8174bfa49d8e740452f2633fc Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 11 Jul 2025 01:42:18 +0000 Subject: [PATCH 102/224] refactor: cleanup protocols --- src/orcapod/data/datagrams.py | 16 +- src/orcapod/data/kernels.py | 5 +- src/orcapod/data/operators.py | 10 +- src/orcapod/data/pods.py | 56 +- src/orcapod/data/streams.py | 135 ++-- src/orcapod/protocols/data_protocols.py | 803 +++++++++++++++++++++--- 6 files changed, 822 insertions(+), 203 deletions(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index c21c46b..717b928 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -199,8 +199,7 @@ def as_dict(self) -> dict[str, DataValue]: def as_table(self) -> pa.Table: return pa.Table.from_pylist([self]) - @property - def typespec(self) -> schemas.PythonSchema: + def types(self) -> schemas.PythonSchema: # TODO: provide correct implementation return schemas.PythonSchema({k: str for k in self.keys()}) @@ -219,8 +218,7 @@ def __init__(self, table: pa.Table) -> None: def keys(self) -> tuple[str, ...]: return tuple(self.table.column_names) - @property - def typespec(self) -> schemas.PythonSchema: + def types(self) -> schemas.PythonSchema: if self._cached_python_schema is None: self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema( self.table.schema @@ -262,7 +260,7 @@ def create_from( new_packet = PythonDictPacket( object.as_dict(include_source=False), object.source_info(), - dict(object.typespec), + dict(object.types()), finger_print=finger_print, semantic_converter=semantic_converter, semantic_type_registry=semantic_type_registry, @@ -359,8 +357,9 @@ def content_hash(self) -> str: self._post_hash_callback(self._finger_print, self._cached_content_hash) return self._cached_content_hash - @property - def typespec(self) -> schemas.PythonSchema: + # use keys() implementation from dict + + def types(self) -> schemas.PythonSchema: return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: @@ -562,8 +561,7 @@ def content_hash(self) -> str: self._post_hash_callback(self._finger_print, self._cached_content_hash) return self._cached_content_hash - @property - def typespec(self) -> schemas.PythonSchema: + def types(self) -> schemas.PythonSchema: return self.semantic_converter.python_schema.copy() def keys(self) -> tuple[str, ...]: diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 0695715..acccf4e 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -39,10 +39,9 @@ def __init__( def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs - ) -> dp.Stream: + ) -> dp.LiveStream: output_stream = self.forward(*streams, **kwargs) - kernel_stream: dp.Stream if output_stream.source is not None: kernel_stream = KernelStream(output_stream, label=label) else: @@ -62,7 +61,7 @@ def __call__( return kernel_stream @abstractmethod - def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... @abstractmethod def validate_inputs(self, *streams: dp.Stream) -> None: ... 
diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 15d255a..3db4949 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -53,10 +53,10 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: left_stream, right_stream = streams return self.op_forward(left_stream, right_stream) - def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: self.check_binary_inputs(*streams) left_stream, right_stream = streams - return self.op_types(left_stream, right_stream) + return self.op_output_types(left_stream, right_stream) def identity_structure(self, *streams: dp.Stream) -> Any: """ @@ -75,7 +75,7 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre ... @abstractmethod - def op_types( + def op_output_types( self, left_stream: dp.Stream, right_stream: dp.Stream ) -> tuple[TypeSpec, TypeSpec]: """ @@ -135,7 +135,7 @@ def op_forward( upstreams=(left_stream, right_stream), ) - def op_types(self, left_stream, right_stream) -> tuple[TypeSpec, TypeSpec]: + def op_output_types(self, left_stream, right_stream) -> tuple[TypeSpec, TypeSpec]: left_tag_typespec, left_packet_typespec = left_stream.types() right_tag_typespec, right_packet_typespec = right_stream.types() joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) @@ -148,7 +148,7 @@ def op_validate_inputs( self, left_stream: dp.Stream, right_stream: dp.Stream ) -> None: try: - self.op_types(left_stream, right_stream) + self.op_output_types(left_stream, right_stream) except Exception as e: raise InputValidationError(f"Input streams are not compatible: {e}") diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 256c34b..7e2ce48 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -33,26 +33,24 @@ class PodBase(TrackedKernelBase): It allows for the execution of a function with a specific label and can be tracked by the system. """ - def types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ Return the input and output typespecs for the pod. This is used to validate the input and output streams. """ input_stream = self.process_and_verify_streams(*streams) tag_typespec, _ = input_stream.types() - return tag_typespec, self.output_typespec + return tag_typespec, self.output_packet_types() - @property @abstractmethod - def input_typespec(self) -> TypeSpec: + def input_packet_types(self) -> TypeSpec: """ Return the input typespec for the pod. This is used to validate the input streams. """ ... - @property @abstractmethod - def output_typespec(self) -> TypeSpec: + def output_packet_types(self) -> TypeSpec: """ Return the output typespec for the pod. This is used to validate the output streams. 
""" @@ -99,12 +97,12 @@ def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: stream = Join()(stream, next_stream) combined_streams = [stream] input_stream = combined_streams[0] - _, input_typespec = input_stream.types() + _, incoming_packet_types = input_stream.types() if not tsutils.check_typespec_compatibility( - input_typespec, self.input_typespec + incoming_packet_types, self.input_packet_types() ): raise ValueError( - f"Input typespec {input_typespec} is not compatible with expected input typespec {self.input_typespec}" + f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" ) return input_stream @@ -209,28 +207,28 @@ def __init__( self.function_info_extractor = function_info_extractor # extract input and output types from the function signature - self._input_typespec, self._output_typespec = extract_function_typespecs( - self.function, - self.output_keys, - input_typespec=input_typespec, - output_typespec=output_typespec, + self._input_packet_types, self._output_packet_types = ( + extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_typespec, + output_typespec=output_typespec, + ) ) - @property - def input_typespec(self) -> TypeSpec: + def input_packet_types(self) -> TypeSpec: """ Return the input typespec for the function pod. This is used to validate the input streams. """ - return self._input_typespec + return self._input_packet_types - @property - def output_typespec(self) -> TypeSpec: + def output_packet_types(self) -> TypeSpec: """ Return the output typespec for the function pod. This is used to validate the output streams. """ - return self._output_typespec + return self._output_packet_types def __repr__(self) -> str: return f"FunctionPod:{self.function!r}" @@ -285,15 +283,15 @@ def identity_structure(self, *streams: dp.Stream) -> Any: function_info = self.function_info_extractor.extract_function_info( self.function, function_name=self.function_name, - input_typespec=self.input_typespec, - output_typespec=self.output_typespec, + input_typespec=self.input_packet_types(), + output_typespec=self.output_packet_types(), ) else: # use basic information only function_info = { "name": self.function_name, - "input_typespec": self.input_typespec, - "output_typespec": self.output_typespec, + "input_packet_types": self.input_packet_types, + "output_packet_types": self.output_packet_types, } function_info["output_keys"] = tuple(self.output_keys) @@ -311,21 +309,19 @@ def __init__(self, pod: dp.Pod, label: str | None = None, **kwargs) -> None: def computed_label(self) -> str | None: return self.pod.label - @property - def input_typespec(self) -> TypeSpec: + def input_packet_types(self) -> TypeSpec: """ Return the input typespec for the stored pod. This is used to validate the input streams. """ - return self.pod.input_typespec + return self.pod.input_packet_types() - @property - def output_typespec(self) -> TypeSpec: + def output_packet_types(self) -> TypeSpec: """ Return the output typespec for the stored pod. This is used to validate the output streams. 
""" - return self.pod.output_typespec + return self.pod.output_packet_types() def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: return self.pod.call(tag, packet) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 9389949..2454f85 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -35,33 +35,35 @@ def __init__( self._source = source self._upstreams = upstreams self._last_modified: datetime | None = None - self._update_modified_time() + self._set_modified_time() - @abstractmethod - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... + @property + def source(self) -> dp.Kernel | None: + """ + The source of the stream, which is the kernel that generated the stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._source - @abstractmethod - def types(self) -> tuple[TypeSpec, TypeSpec]: ... + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + """ + The upstream streams that are used to generate this stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._upstreams - @abstractmethod - def as_table(self) -> pa.Table: ... + def computed_label(self) -> str | None: + if self.source is not None: + # use the invocation operation label + return self.source.label + return None @abstractmethod - def iter_packets( - self, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... - - def _update_modified_time( - self, timestamp: datetime | None = None, invalidate: bool = False - ) -> None: - if invalidate: - self._last_modified = None - return + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... - if timestamp is not None: - self._last_modified = timestamp - else: - self._last_modified = datetime.now(timezone.utc) + @abstractmethod + def types(self) -> tuple[TypeSpec, TypeSpec]: ... @property def last_modified(self) -> datetime | None: @@ -93,33 +95,31 @@ def is_current(self) -> bool: return False return True - @property - def source(self) -> dp.Kernel | None: - """ - The source of the stream, which is the kernel that generated the stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._source - - @property - def upstreams(self) -> tuple[dp.Stream, ...]: - """ - The upstream streams that are used to generate this stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._upstreams + def _set_modified_time( + self, timestamp: datetime | None = None, invalidate: bool = False + ) -> None: + if invalidate: + self._last_modified = None + return - def computed_label(self) -> str | None: - if self.source is not None: - # use the invocation operation label - return self.source.label - return None + if timestamp is not None: + self._last_modified = timestamp + else: + self._last_modified = datetime.now(timezone.utc) def __iter__( self, ) -> Iterator[tuple[dp.Tag, dp.Packet]]: return self.iter_packets() + @abstractmethod + def iter_packets( + self, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... + + @abstractmethod + def as_table(self) -> pa.Table: ... + def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ Flow everything through the stream, returning the entire collection of @@ -185,14 +185,14 @@ def clear_cache(self) -> None: This is useful for re-processing the stream with the same kernel. 
""" self._cached_stream = None - self._update_modified_time(invalidate=True) + self._set_modified_time(invalidate=True) def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "_cached_stream should not be None here." ) @@ -203,7 +203,7 @@ def types(self) -> tuple[TypeSpec, TypeSpec]: Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. """ - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "_cached_stream should not be None here." ) @@ -212,12 +212,12 @@ def types(self) -> tuple[TypeSpec, TypeSpec]: @property def is_current(self) -> bool: if self._cached_stream is None or not super().is_current: - status = self.update_stream() + status = self.refresh() if not status: # if it failed to update for whatever reason return False return True - def update_stream(self, force: bool = False) -> bool: + def refresh(self, force: bool = False) -> bool: updated = False if force or (self._cached_stream is not None and not super().is_current): self.clear_cache() @@ -227,7 +227,7 @@ def update_stream(self, force: bool = False) -> bool: "Stream source must be set to recompute the stream." ) self._cached_stream = self.source.forward(*self.upstreams) - self._update_modified_time() + self._set_modified_time() updated = True if self._cached_stream is None: @@ -238,6 +238,14 @@ def update_stream(self, force: bool = False) -> bool: return updated + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) + @property def last_modified(self) -> datetime | None: if self._cached_stream is None: @@ -245,14 +253,14 @@ def last_modified(self) -> datetime | None: return self._cached_stream.last_modified def as_table(self) -> pa.Table: - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) return self._cached_stream.as_table() def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.update_stream() + self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) @@ -307,7 +315,7 @@ def __init__( ) self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None - self._update_modified_time() # set modified time to now + self._set_modified_time() # set modified time to now def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ @@ -402,13 +410,13 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: This is useful for accessing the columns in the stream. 
""" tag_keys, _ = self.input_stream.keys() - packet_keys = tuple(self.pod.output_typespec.keys()) + packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys def types(self) -> tuple[TypeSpec, TypeSpec]: tag_typespec, _ = self.input_stream.types() # TODO: check if copying can be avoided - packet_typespec = dict(self.pod.output_typespec) + packet_typespec = dict(self.pod.output_packet_types()) return tag_typespec, packet_typespec def clear_cache(self) -> None: @@ -420,13 +428,22 @@ def clear_cache(self) -> None: self._computation_complete = False self._cached_output_table = None - def validate_cache(self) -> None: - if not self.is_current: - self.clear_cache() - self._update_modified_time(invalidate=True) + def refresh(self, force: bool = False) -> bool: + if not self.is_current or force: + self.invalidate() + return True + return False + + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) def as_table(self) -> pa.Table: - self.validate_cache() + self.refresh() if self._cached_output_table is None: all_tags = [] all_packets = [] @@ -452,7 +469,7 @@ def as_table(self) -> pa.Table: return self._cached_output_table def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.validate_cache() + self.refresh() if not self._computation_complete or self._cached_output_packets is None: for i, (tag, packet) in enumerate(self.input_stream.iter_packets()): if i not in self._cached_output_packets: @@ -480,7 +497,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self._cached_output_packets[i] = (processed_tag, processed_packet) yield processed_tag, processed_packet self._computation_complete = True - self._update_modified_time() + self._set_modified_time() else: for i in range(len(self._cached_output_packets)): diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 677aab6..a997302 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -7,37 +7,163 @@ class Datagram(Protocol): - @property - def typespec(self) -> TypeSpec: ... + """ + Base protocol for all data containers in Orcapod. - def keys(self) -> Collection[str]: ... + Datagrams are the fundamental units of data that flow through the system. + They provide a unified interface for data access and conversion, ensuring + consistent behavior across different data types and sources. + + TypeSpec is a dict[str, type] mapping field names to their Python types, + enabling type checking and validation throughout the computational graph. + """ - def as_table(self) -> pa.Table: ... + def types(self) -> TypeSpec: + """ + Return the type specification for this datagram. - def as_dict(self) -> dict[str, DataValue]: ... + The TypeSpec maps field names to their Python types, enabling + type checking and validation throughout the system. + Returns: + TypeSpec: Dictionary mapping field names to Python types + """ + ... + + def keys(self) -> Collection[str]: + """ + Return the available keys/fields in this datagram. + + This provides a way to inspect the structure of the datagram + without accessing the actual data values. + + Returns: + Collection[str]: Available field names + """ + ... -class Tag(Datagram, Protocol): ... + def as_table(self) -> pa.Table: + """ + Convert to PyArrow Table format. 
+ + Provides a standardized way to convert datagram content to + a columnar format suitable for analysis and processing. + + Returns: + pa.Table: PyArrow table representation + """ + ... + + def as_dict(self) -> dict[str, DataValue]: + """ + Convert to dictionary format. + + Provides a simple key-value representation of the datagram + content, useful for debugging and simple data access. + + Returns: + dict[str, DataValue]: Dictionary representation + """ + ... + + +class Tag(Datagram, Protocol): + """ + Metadata associated with each data item in a stream. + + Tags carry contextual information about data packets as they flow through + the computational graph. They are immutable and provide metadata that + helps with: + - Data lineage tracking + - Grouping and aggregation operations + - Temporal information (timestamps) + - Source identification + - Processing context + + Common examples include: + - Timestamps indicating when data was created/processed + - Source identifiers showing data origin + - Processing metadata like batch IDs or session information + - Grouping keys for aggregation operations + - Quality indicators or confidence scores + """ + + pass class Packet(Datagram, Protocol): + """ + The actual data payload in a stream. + + Packets represent the core data being processed through the computational + graph. Unlike Tags (which are metadata), Packets contain the actual + information that computations operate on. + + Packets extend Datagram with additional capabilities for: + - Source tracking and lineage + - Content-based hashing for caching + - Metadata inclusion for debugging + + The distinction between Tag and Packet is crucial for understanding + data flow: Tags provide context, Packets provide content. + """ + def as_table(self, include_source: bool = False) -> pa.Table: """ Convert the packet to a PyArrow Table. - If include_source is True, the source information is included in the table. + + Args: + include_source: If True, source information is included in the table + for debugging and lineage tracking + + Returns: + pa.Table: PyArrow table representation of packet data """ ... def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: """ Convert the packet to a dictionary. - If include_source is True, the source information is included in the dictionary. + + Args: + include_source: If True, source information is included in the dictionary + for debugging and lineage tracking + + Returns: + dict[str, DataValue]: Dictionary representation of packet data + """ + ... + + def content_hash(self) -> str: + """ + Return a hash of the packet content for caching/comparison. + + This hash should be deterministic and based only on the packet content, + not on source information or metadata. Used for: + - Caching computation results + - Detecting data changes + - Deduplication operations + + Returns: + str: Deterministic hash of packet content """ ... - def content_hash(self) -> str: ... + def source_info(self) -> dict[str, str | None]: + """ + Return metadata about the packet's source/origin. - def source_info(self) -> dict[str, str | None]: ... + Provides debugging and lineage information about where the packet + originated. May include information like: + - File paths for file-based sources + - Database connection strings + - API endpoints + - Processing pipeline information + + Returns: + dict[str, str | None]: Source metadata for debugging/lineage + """ + ... # def join(self, other: "Packet") -> "Packet": ... 
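+
+    # Illustrative access pattern (``packet`` stands for any object implementing
+    # this protocol; only the methods documented above are used):
+    #
+    #     row = packet.as_dict(include_source=True)  # payload plus lineage fields
+    #     key = packet.content_hash()                # content-only key for caching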
@@ -46,202 +172,685 @@ def source_info(self) -> dict[str, str | None]: ... class PodFunction(Protocol): """ - A function suitable to be used in a FunctionPod. - It takes one or more named arguments, each corresponding to either: - - A path to a file or directory (PathSet) - for backward compatibility - - A simple data value (str, int, float, bool, bytes, Path) - and returns either None, a single value, or a list of values + A function suitable for use in a FunctionPod. + + PodFunctions define the computational logic that operates on individual + packets within a Pod. They represent pure functions that transform + data values without side effects. + + These functions are designed to be: + - Stateless: No dependency on external state + - Deterministic: Same inputs always produce same outputs + - Serializable: Can be cached and distributed + - Type-safe: Clear input/output contracts + + PodFunctions accept named arguments corresponding to packet fields + and return transformed data values. """ - def __call__(self, **kwargs: DataValue) -> None | DataValue: ... + def __call__(self, **kwargs: DataValue) -> None | DataValue: + """ + Execute the pod function with the given arguments. + + The function receives packet data as named arguments and returns + either transformed data or None (for filtering operations). + + Args: + **kwargs: Named arguments mapping packet fields to data values + + Returns: + None: Filter out this packet (don't include in output) + DataValue: Single transformed value + + Raises: + TypeError: If required arguments are missing + ValueError: If argument values are invalid + """ + ... class Labelable(Protocol): """ - A protocol for objects that can have a label. - This is used to provide a human-readable name for the object. + Protocol for objects that can have a human-readable label. + + Labels provide meaningful names for objects in the computational graph, + making debugging, visualization, and monitoring much easier. They serve + as human-friendly identifiers that complement the technical identifiers + used internally. + + Labels are optional but highly recommended for: + - Debugging complex computational graphs + - Visualization and monitoring tools + - Error messages and logging + - User interfaces and dashboards """ @property def label(self) -> str | None: """ - Return the label of the object. - If no label is set, return None. + Return the human-readable label for this object. + + Labels should be descriptive and help users understand the purpose + or role of the object in the computational graph. + + Returns: + str: Human-readable label for this object + None: No label is set (will use default naming) """ ... -class Kernel(ContentIdentifiable, Labelable, Protocol): +class Stream(ContentIdentifiable, Labelable, Protocol): """ - Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. - It is the base class for all computations and transformations that can be performed on a collection of streams - (including an empty collection). - A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input - and returns a new stream as output (note that output stream is always singular). - Each "invocation" of the kernel on a collection of streams is assigned a unique ID. - The corresponding invocation information is stored as Invocation object and attached to the output stream - for computational graph tracking. + Base protocol for all streams in Orcapod. 
+ + Streams represent sequences of (Tag, Packet) pairs flowing through the + computational graph. They are the fundamental data structure connecting + kernels and carrying both data and metadata. + + Streams can be either: + - Static: Immutable snapshots created at a specific point in time + - Live: Dynamic streams that stay current with upstream dependencies + + All streams provide: + - Iteration over (tag, packet) pairs + - Type information and schema access + - Lineage information (source kernel and upstream streams) + - Basic caching and freshness tracking + - Conversion to common formats (tables, dictionaries) """ - def __call__( - self, *streams: "Stream", label: str | None = None, **kwargs - ) -> "Stream": + @property + def source(self) -> "Kernel | None": + """ + The kernel that produced this stream. + + This provides lineage information for tracking data flow through + the computational graph. Root streams (like file sources) may + have no source kernel. + + Returns: + Kernel: The source kernel that created this stream + None: This is a root stream with no source kernel """ - This is the main interface for invoking the kernel and perform any side-effects such as registering the invocation with the computational graph. - This method should be called with a collection of streams, which can be empty, and is expected to trigger - the call to the forward method of the kernel. + ... + + @property + def upstreams(self) -> tuple["Stream", ...]: + """ + Input streams used to produce this stream. + + These are the streams that were provided as input to the source + kernel when this stream was created. Used for dependency tracking + and cache invalidation. + + Returns: + tuple[Stream, ...]: Upstream dependency streams (empty for sources) """ ... - def forward(self, *streams: "Stream") -> "Stream": + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ - Trigger the main computation of the kernel on a collection of streams. - This method is called when the kernel is invoked with a collection of streams. - Subclasses should override this method to provide the kernel with its unique behavior. - The method should return a new stream that represents the output of the kernel, but should not register the invocation - with the computational graph, allowing for the computation to be performed without side effects. + Available keys/fields in the stream content. + + Returns the field names present in both tags and packets. + This provides schema information without requiring type details, + useful for: + - Schema inspection and exploration + - Query planning and optimization + - Field validation and mapping + + Returns: + tuple[tuple[str, ...], tuple[str, ...]]: (tag_keys, packet_keys) """ ... - def types(self, *streams: "Stream") -> tuple[TypeSpec, TypeSpec]: ... + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Type specifications for the stream content. - def validate_inputs(self, *streams: "Stream") -> None: ... + Returns the type schema for both tags and packets in this stream. + This information is used for: + - Type checking and validation + - Schema inference and planning + - Compatibility checking between kernels + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) + """ + ... -class Pod(Kernel, Protocol): @property - def input_typespec(self) -> TypeSpec: ... + def last_modified(self) -> datetime | None: + """ + When the stream's content was last modified. 
+ + This property is crucial for caching decisions and dependency tracking: + - datetime: Content was last modified at this time (cacheable) + - None: Content is never stable, always recompute (some dynamic streams) + + Both static and live streams typically return datetime values, but + live streams update this timestamp whenever their content changes. + + Returns: + datetime: Timestamp of last modification for most streams + None: Stream content is never stable (some special dynamic streams) + """ + ... @property - def output_typespec(self) -> TypeSpec: ... + def is_current(self) -> bool: + """ + Whether the stream is up-to-date with its dependencies. - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + A stream is current if its content reflects the latest state of its + source kernel and upstream streams. This is used for cache validation + and determining when refresh is needed. + + For live streams, this should always return True since they stay + current automatically. For static streams, this indicates whether + the cached content is still valid. + + Returns: + bool: True if stream is up-to-date, False if refresh needed """ - Call the function pod with a single input packet. - This is used to invoke the function pod with a single packet. + ... + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + """ + Iterate over (tag, packet) pairs in the stream. + + This is the primary way to access stream data. The behavior depends + on the stream type: + - Static streams: Return cached/precomputed data + - Live streams: May trigger computation and always reflect current state + + Yields: + tuple[Tag, Packet]: Sequential (tag, packet) pairs """ ... + def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: + """ + Alias for __iter__ for explicit packet iteration. -class Stream(ContentIdentifiable, Labelable, Protocol): + Provides a more explicit method name when the intent is to iterate + over packets specifically, improving code readability. + + Yields: + tuple[Tag, Packet]: Sequential (tag, packet) pairs + """ + ... + + def as_table(self) -> pa.Table: + """ + Convert the entire stream to a PyArrow Table. + + Materializes all (tag, packet) pairs into a single table for + analysis and processing. This operation may be expensive for + large streams or live streams that need computation. + + Tag fields are prefixed with "_tag_" to avoid naming conflicts + with packet fields. + + Returns: + pa.Table: Complete stream data as a PyArrow Table + """ + ... + + +class LiveStream(Stream, Protocol): """ - A stream that is generated by an invocation of a kernel. - This stream is used to represent the output of a kernel invocation. - It is a concrete implementation of the SyncStream that has an associated - invocation that generated the stream. + A stream that automatically stays up-to-date with its upstream dependencies. + + LiveStream extends the base Stream protocol with capabilities for "up-to-date" + data flow and reactive computation. Unlike static streams which represent + snapshots, LiveStreams provide the guarantee that their content always + reflects the current state of their dependencies. + + Key characteristics: + - Automatically refresh the stream if changes in the upstreams are detected + - Track last_modified timestamp when content changes + - Support manual refresh triggering and invalidation + - By design, LiveStream would return True for is_current except when auto-update fails. 
+ + LiveStreams are always returned by Kernel.__call__() methods, ensuring + that normal kernel usage produces live, up-to-date results. + + Caching behavior: + - last_modified updates whenever content changes + - Can be cached based on dependency timestamps + - Invalidation happens automatically when upstreams change + + Use cases: + - Real-time data processing pipelines + - Reactive user interfaces + - Monitoring and alerting systems + - Dynamic dashboard updates + - Any scenario requiring current data """ - @property - def source(self) -> Kernel | None: ... + def refresh(self, force: bool = False) -> bool: + """ + Manually trigger a refresh of this stream's content. - @property - def upstreams(self) -> tuple["Stream", ...]: ... + Forces the stream to check its upstream dependencies and update + its content if necessary. This is useful when: + - You want to ensure the latest data before a critical operation + - You need to force computation at a specific time + - You're debugging data flow issues + - You want to pre-compute results for performance + Args: + force: If True, always refresh even if the stream is current. + If False, only refresh if the stream is not current. - @property - def last_modified(self) -> datetime | None: + Returns: + bool: True if the stream was refreshed, False if it was already current. + Note: LiveStream refreshes automatically on access, so this + method may be a no-op for some implementations. However, it's + always safe to call if you need to control when the cache is refreshed. + """ + ... + + def invalidate(self) -> None: """ - Returns when the stream's content was last modified. + Mark this stream as invalid, forcing a refresh on next access. + + This method is typically called when: + - Upstream dependencies have changed + - The source kernel has been modified + - External data sources have been updated + - Manual cache invalidation is needed + + The stream will automatically refresh its content the next time + it's accessed (via iteration, as_table(), etc.). + + This is more efficient than immediate refresh when you know the + data will be accessed later. + """ + ... + + +class Kernel(ContentIdentifiable, Labelable, Protocol): + """ + The fundamental unit of computation in Orcapod. + + Kernels are the building blocks of computational graphs, transforming + zero, one, or more input streams into a single output stream. They + encapsulate computation logic while providing consistent interfaces + for validation, type checking, and execution. + + Key design principles: + - Immutable: Kernels don't change after creation + - Deterministic: Same inputs always produce same outputs + - Composable: Kernels can be chained and combined + - Trackable: All invocations are recorded for lineage + - Type-safe: Strong typing and validation throughout + + Execution modes: + - __call__(): Full-featured execution with tracking, returns LiveStream + - forward(): Pure computation without side effects, returns Stream + + The distinction between these modes enables both production use (with + full tracking) and testing/debugging (without side effects). + """ + + def __call__( + self, *streams: Stream, label: str | None = None, **kwargs + ) -> LiveStream: + """ + Main interface for kernel invocation with full tracking and guarantees. + + This is the primary way to invoke kernels in production. It provides + a complete execution pipeline: + 1. Validates input streams against kernel requirements + 2. Registers the invocation with the computational graph + 3. 
Calls forward() to perform the actual computation + 4. Ensures the result is a LiveStream that stays current + + The returned LiveStream automatically stays up-to-date with its + upstream dependencies, making it suitable for real-time processing + and reactive applications. + + Args: + *streams: Input streams to process (can be empty for source kernels) + label: Optional label for this invocation (overrides kernel.label) + **kwargs: Additional arguments for kernel configuration Returns: - datetime: Timestamp of last modification (cacheable streams) - None: Content is never stable - always recompute - (async streams, dynamic streams, etc.) + LiveStream: Live stream that stays up-to-date with upstreams + + Raises: + ValidationError: If input streams are invalid for this kernel + TypeMismatchError: If stream types are incompatible + ValueError: If required arguments are missing """ ... - @property - def is_current(self) -> bool: + def forward(self, *streams: Stream) -> Stream: """ - Returns whether the stream is current. - A stream is current if the content is up-to-date with respect to its source. - This can be used to determine if a stream with non-None last_modified is up-to-date. - Note that for asynchronous streams, this status is not applicable and always returns False. + Perform the actual computation without side effects. + + This method contains the core computation logic and should be + overridden by subclasses. It performs pure computation without: + - Registering with the computational graph + - Performing validation (caller's responsibility) + - Guaranteeing result type (may return static or live streams) + + The returned stream must be accurate at the time of invocation but + need not stay up-to-date with upstream changes. This makes forward() + suitable for: + - Testing and debugging + - Batch processing where currency isn't required + - Internal implementation details + + Args: + *streams: Input streams to process + + Returns: + Stream: Result of the computation (may be static or live) """ ... - def as_table(self) -> pa.Table: + def output_types(self, *streams: Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Determine output types without triggering computation. + + This method performs type inference based on input stream types, + enabling efficient type checking and stream property queries. + It should be fast and not trigger any expensive computation. + + Used for: + - Pre-execution type validation + - Query planning and optimization + - Schema inference in complex pipelines + - IDE support and developer tooling + + Args: + *streams: Input streams to analyze + + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output + + Raises: + ValidationError: If input types are incompatible + TypeError: If stream types cannot be processed + """ + ... + + def validate_inputs(self, *streams: Stream) -> None: """ - Convert the stream to a PyArrow Table. - To avoid collision, tags should be prefixed with "_tag_". + Validate input streams, raising exceptions if incompatible. + + This method is called automatically by __call__ before computation + to provide fail-fast behavior. It should check: + - Number of input streams + - Stream types and schemas + - Any kernel-specific requirements + - Business logic constraints + + The goal is to catch errors early, before expensive computation + begins, and provide clear error messages for debugging. 
+ + Args: + *streams: Input streams to validate + + Raises: + ValidationError: If streams are invalid for this kernel + TypeError: If stream types are incompatible + ValueError: If stream content violates business rules """ ... - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: ... - def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: ... +class Pod(Kernel, Protocol): + """ + Specialized kernel for packet-level processing with advanced caching. + + Pods represent a different computational model from regular kernels: + - Process data one packet at a time (enabling fine-grained parallelism) + - Support just-in-time evaluation (computation deferred until needed) + - Provide stricter type contracts (clear input/output schemas) + - Enable advanced caching strategies (packet-level caching) + + The Pod abstraction is ideal for: + - Expensive computations that benefit from caching + - Operations that can be parallelized at the packet level + - Transformations with strict type contracts + - Processing that needs to be deferred until access time + - Functions that operate on individual data items + + Pods use a different execution model where computation is deferred + until results are actually needed, enabling efficient resource usage + and fine-grained caching. + """ + + def input_packet_types(self) -> TypeSpec: + """ + TypeSpec for input packets that this Pod can process. + + Defines the exact schema that input packets must conform to. + Pods are typically much stricter about input types than regular + kernels, requiring precise type matching for their packet-level + processing functions. - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + This specification is used for: + - Runtime type validation + - Compile-time type checking + - Schema inference and documentation + - Input validation and error reporting + + Returns: + TypeSpec: Dictionary mapping field names to required packet types + """ + ... + + def output_packet_types(self) -> TypeSpec: """ - Return the keys of the pipeline property. - This is used to define the keys of the pipeline property. + TypeSpec for output packets that this Pod produces. + + Defines the schema of packets that will be produced by this Pod. + This is typically determined by the Pod's computational function + and is used for: + - Type checking downstream kernels + - Schema inference in complex pipelines + - Query planning and optimization + - Documentation and developer tooling + + Returns: + TypeSpec: Dictionary mapping field names to output packet types """ ... - def types(self) -> tuple[TypeSpec, TypeSpec]: + def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: """ - Return the types of the pipeline property. - This is used to define the types of the graph property. + Process a single packet with its associated tag. + + This is the core method that defines the Pod's computational behavior. 
+ It processes one (tag, packet) pair at a time, enabling: + - Fine-grained caching at the packet level + - Parallelization opportunities + - Just-in-time evaluation + - Filtering operations (by returning None) + + The method signature supports: + - Tag transformation (modify metadata) + - Packet transformation (modify content) + - Filtering (return None to exclude packet) + - Pass-through (return inputs unchanged) + + Args: + tag: Metadata associated with the packet + packet: The data payload to process + + Returns: + tuple[Tag, Packet | None]: + - Tag: Output tag (may be modified from input) + - Packet: Processed packet, or None to filter it out + + Raises: + TypeError: If packet doesn't match input_packet_types + ValueError: If packet data is invalid for processing """ ... class Source(Kernel, Stream, Protocol): """ - A source is a special type of kernel that produces a stream of data. - It is the entry point for data into the computational graph. - Sources are typically used to read data from external sources such as files, databases, etc. + Entry point for data into the computational graph. + + Sources are special objects that serve dual roles: + - As Kernels: Can be invoked to produce streams + - As Streams: Directly provide data without upstream dependencies + + Sources represent the roots of computational graphs and typically + interface with external data sources. They bridge the gap between + the outside world and the Orcapod computational model. + + Common source types: + - File readers (CSV, JSON, Parquet, etc.) + - Database connections and queries + - API endpoints and web services + - Generated data sources (synthetic data) + - Manual data input and user interfaces + - Message queues and event streams + + Sources have unique properties: + - No upstream dependencies (upstreams is empty) + - Can be both invoked and iterated + - Serve as the starting point for data lineage + - May have their own refresh/update mechanisms """ + pass + class Tracker(Protocol): + """ + Records kernel invocations and stream creation for computational graph tracking. + + Trackers are responsible for maintaining the computational graph by recording + relationships between kernels, streams, and invocations. They enable: + - Lineage tracking and data provenance + - Caching and memoization strategies + - Debugging and error analysis + - Performance monitoring and optimization + - Reproducibility and auditing + + Multiple trackers can be active simultaneously, each serving different + purposes (e.g., one for caching, another for debugging, another for + monitoring). This allows for flexible and composable tracking strategies. + + Trackers can be selectively activated/deactivated to control overhead + and focus on specific aspects of the computational graph. + """ + def set_active(self, active: bool = True) -> None: """ Set the active state of the tracker. - This is used to activate or deactivate the tracker. - If the tracker is active, it will record the invocations of kernels. + + When active, the tracker will record all kernel invocations and + stream creations. When inactive, no recording occurs, reducing + overhead for performance-critical sections. + + Args: + active: True to activate recording, False to deactivate """ ... def is_active(self) -> bool: """ - Check if the tracker is active. - This is used to determine if the tracker is currently recording invocations. + Check if the tracker is currently recording invocations. 
+ + Returns: + bool: True if tracker is active and recording, False otherwise """ ... def record(self, stream: Stream) -> None: """ - Record the output stream of a kernel invocation in the tracker. - This is used to track the computational graph and the invocations of kernels. + Record a stream in the computational graph. + + This method is called whenever a kernel produces a new stream. + The tracker should record: + - The stream and its properties + - The source kernel that created it + - The upstream streams that were used as input + - Timing and performance information + - Any relevant metadata + + Args: + stream: The stream to record in the computational graph """ ... class TrackerManager(Protocol): + """ + Manages multiple trackers and coordinates their activity. + + The TrackerManager provides a centralized way to: + - Register and manage multiple trackers + - Coordinate recording across all active trackers + - Provide a single interface for graph recording + - Enable dynamic tracker registration/deregistration + + This design allows for: + - Multiple concurrent tracking strategies + - Pluggable tracking implementations + - Easy testing and debugging (mock trackers) + - Performance optimization (selective tracking) + """ + def get_active_trackers(self) -> list[Tracker]: """ - Get the list of active trackers. - This is used to retrieve the currently active trackers in the system. + Get all currently active trackers. + + Returns only trackers that are both registered and active, + providing the list of trackers that will receive recording events. + + Returns: + list[Tracker]: List of trackers that are currently recording """ ... def register_tracker(self, tracker: Tracker) -> None: """ Register a new tracker in the system. - This is used to add a new tracker to the list of active trackers. + + The tracker will be included in future recording operations + if it is active. Registration is separate from activation + to allow for dynamic control of tracking overhead. + + Args: + tracker: The tracker to register """ ... def deregister_tracker(self, tracker: Tracker) -> None: """ - Deregister a tracker from the system. - This is used to remove a tracker from the list of active trackers. + Remove a tracker from the system. + + The tracker will no longer receive recording notifications + even if it is still active. This is useful for: + - Cleaning up temporary trackers + - Removing failed or problematic trackers + - Dynamic tracker management + + Args: + tracker: The tracker to remove """ ... def record(self, stream: Stream) -> None: """ - Record the output stream of a kernel invocation in the tracker. - This is used to track the computational graph and the invocations of kernels. + Record a stream in all active trackers. + + This method broadcasts the stream recording to all currently + active and registered trackers. It provides a single point + of entry for recording events, simplifying kernel implementations. + + Args: + stream: The stream to record in all active trackers """ ... From d2aacadff31d7bbfe5d3acc6c327cc1aab8c837f Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 11 Jul 2025 02:02:49 +0000 Subject: [PATCH 103/224] refactor: further refinement of tracker protocols --- src/orcapod/data/kernels.py | 29 +++++--------- src/orcapod/data/pods.py | 19 ++++++++++ src/orcapod/data/trackers.py | 25 +++++++++++-- src/orcapod/protocols/data_protocols.py | 50 +++++++++++++++++++++---- 4 files changed, 93 insertions(+), 30 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index acccf4e..831a51e 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -37,28 +37,19 @@ def __init__( self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + def record_kernel_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: + """ + Register the pod with the upstream streams. This is used to track the pod in the system. + """ + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_kernel_invocation(self, upstreams) + def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - output_stream = self.forward(*streams, **kwargs) - - if output_stream.source is not None: - kernel_stream = KernelStream(output_stream, label=label) - else: - logger.warning( - "Output stream does not have a source. " - "This may lead to unexpected behavior when tracking the kernel invocation." - ) - kernel_stream = KernelStream(source=self, upstreams=streams, label=label) - - # TODO: consider the logic around tracker manager more carefully - if not self._skip_tracking and self._tracker_manager is not None: - # register the invocation to all active trackers - active_trackers = self._tracker_manager.get_active_trackers() - for tracker in active_trackers: - tracker.record(kernel_stream) - - return kernel_stream + output_stream = KernelStream(source=self, upstreams=streams, label=label) + self.record_kernel_invocation(streams) + return output_stream @abstractmethod def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 7e2ce48..7e7ba7b 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -109,6 +109,13 @@ def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: def validate_inputs(self, *streams: dp.Stream) -> None: self.process_and_verify_streams(*streams) + def record_pod_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: + """ + Register the pod with the upstream streams. This is used to track the pod in the system. + """ + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_pod_invocation(self, upstreams) + def forward(self, *streams: dp.Stream) -> PodStream: input_stream = self.process_and_verify_streams(*streams) # at this point, streams should have been joined into one @@ -119,6 +126,18 @@ def forward(self, *streams: dp.Stream) -> PodStream: error_handling=cast(error_handling_options, self.error_handling), ) + def __call__( + self, *streams: dp.Stream, label: str | None = None, **kwargs + ) -> PodStream: + """ + Invoke the pod with a collection of streams. This will process the streams and return a PodStream. 
+ """ + output_stream = self.forward(*streams, **kwargs) + + self.record_pod_invocation(output_stream.upstreams) + + return output_stream + def function_pod( output_keys: str | Collection[str] | None = None, diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index fab481a..456711d 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -30,13 +30,25 @@ def get_active_trackers(self) -> list[dp.Tracker]: """ return [t for t in self._active_trackers if t.is_active()] - def record(self, stream: dp.Stream) -> None: + def record_kernel_invocation( + self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + ) -> None: """ Record the output stream of a kernel invocation in the tracker. This is used to track the computational graph and the invocations of kernels. """ for tracker in self.get_active_trackers(): - tracker.record(stream) + tracker.record_kernel_invocation(kernel, upstreams) + + def record_pod_invocation( + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + ) -> None: + """ + Record the output stream of a pod invocation in the tracker. + This is used to track the computational graph and the invocations of pods. + """ + for tracker in self.get_active_trackers(): + tracker.record_pod_invocation(pod, upstreams) class AutoRegisteringContextBasedTracker(ABC): @@ -55,7 +67,14 @@ def is_active(self) -> bool: return self._active @abstractmethod - def record(self, stream: dp.Stream) -> None: ... + def record_kernel_invocation( + self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + ) -> None: ... + + @abstractmethod + def record_pod_invocation( + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + ) -> None: ... def __enter__(self): self.set_active(True) diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index a997302..24b6861 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -767,20 +767,39 @@ def is_active(self) -> bool: """ ... - def record(self, stream: Stream) -> None: + def record_kernel_invocation( + self, kernel: Kernel, upstreams: tuple[Stream, ...] + ) -> None: """ - Record a stream in the computational graph. + Record a kernel invocation in the computational graph. - This method is called whenever a kernel produces a new stream. - The tracker should record: - - The stream and its properties - - The source kernel that created it + This method is called whenever a kernel is invoked. The tracker + should record: + - The kernel and its properties + - The input streams that were used as input + - Timing and performance information + - Any relevant metadata + + Args: + kernel: The kernel that was invoked + upstreams: The input streams used for this invocation + """ + ... + + def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + """ + Record a pod invocation in the computational graph. + + This method is called whenever a pod is invoked. The tracker + should record: + - The pod and its properties - The upstream streams that were used as input - Timing and performance information - Any relevant metadata Args: - stream: The stream to record in the computational graph + pod: The pod that was invoked + upstreams: The input streams used for this invocation """ ... @@ -842,7 +861,9 @@ def deregister_tracker(self, tracker: Tracker) -> None: """ ... - def record(self, stream: Stream) -> None: + def record_kernel_invocation( + self, kernel: Kernel, upstreams: tuple[Stream, ...] 
+ ) -> None: """ Record a stream in all active trackers. @@ -854,3 +875,16 @@ def record(self, stream: Stream) -> None: stream: The stream to record in all active trackers """ ... + + def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + """ + Record a stream in all active trackers. + + This method broadcasts the stream recording to all currently` + active and registered trackers. It provides a single point + of entry for recording events, simplifying kernel implementations. + + Args: + stream: The stream to record in all active trackers + """ + ... From e3b3d925e7c741a1f0c9ff3f6d88e5c2a7151a3b Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 11 Jul 2025 07:16:03 +0000 Subject: [PATCH 104/224] feat: refine kernel and pod interaction with tracker --- src/orcapod/__init__.py | 9 ++ src/orcapod/data/__init__.py | 1 + src/orcapod/data/kernels.py | 61 +++++++-- src/orcapod/data/pods.py | 162 +++++++++++++----------- src/orcapod/data/trackers.py | 136 ++++++++++++++++++-- src/orcapod/hashing/hash_utils.py | 56 +++++++- src/orcapod/hashing/legacy_core.py | 1 + src/orcapod/protocols/data_protocols.py | 37 +++++- 8 files changed, 362 insertions(+), 101 deletions(-) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 01cd5db..b4de8e1 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,3 +1,12 @@ +from .data import DEFAULT_TRACKER_MANAGER + +no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking + +__all__ = [ + "DEFAULT_TRACKER_MANAGER", + "no_tracking", +] + # from .core import operators, sources, streams # from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator # from . import hashing, stores diff --git a/src/orcapod/data/__init__.py b/src/orcapod/data/__init__.py index e69de29..6d7e206 100644 --- a/src/orcapod/data/__init__.py +++ b/src/orcapod/data/__init__.py @@ -0,0 +1 @@ +from .trackers import DEFAULT_TRACKER_MANAGER diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 831a51e..538cf11 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -27,6 +27,7 @@ class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): def __init__( self, + fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, skip_tracking: bool = False, tracker_manager: dp.TrackerManager | None = None, @@ -36,27 +37,62 @@ def __init__( self._label = label self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + self.fixed_input_streams = fixed_input_streams - def record_kernel_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: + def resolve_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Register the pod with the upstream streams. This is used to track the pod in the system. + Resolve the input streams for the kernel. If the kernel has fixed input streams, + it returns those. Otherwise, it returns the provided streams. + """ + if self.fixed_input_streams is not None: + if len(streams) != 0: + raise ValueError( + f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted." + ) + return self.fixed_input_streams + return streams + + def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + """ + Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing + on the input streams before the main computation. 
This is useful if you need to modify the input streams + or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the + pre-processing step will be tracked separately from the main computation in forward. + By default, it returns the input streams unchanged. + """ + return streams + + @abstractmethod + def validate_inputs(self, *streams: dp.Stream) -> None: ... + + def prepare_output_stream( + self, *streams: dp.Stream, label: str | None = None + ) -> dp.LiveStream: + """ + Prepare the output stream for the kernel invocation. + This method is called after the main computation is performed. + It creates a KernelStream with the provided streams and label. + """ + return KernelStream(source=self, upstreams=streams, label=label) + + def track_invocation(self, *streams: dp.Stream) -> None: + """ + Track the invocation of the kernel with the provided streams. + This is a convenience method that calls record_kernel_invocation. """ if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_kernel_invocation(self, upstreams) + self._tracker_manager.record_kernel_invocation(self, streams) def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - output_stream = KernelStream(source=self, upstreams=streams, label=label) - self.record_kernel_invocation(streams) + streams = self.resolve_input_streams(*streams) + processed_streams = self.pre_processing_step(*streams) + self.validate_inputs(*processed_streams) + output_stream = self.prepare_output_stream(*processed_streams, label=label) + self.track_invocation(*processed_streams) return output_stream - @abstractmethod - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... - - @abstractmethod - def validate_inputs(self, *streams: dp.Stream) -> None: ... - @abstractmethod def forward(self, *streams: dp.Stream) -> dp.Stream: """ @@ -65,6 +101,9 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: Subclasses should override this method to provide the kernel with its unique behavior """ + @abstractmethod + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... 
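# Illustrative sketch (not from this patch): a minimal, standalone toy showing the
# template-method flow that __call__ now follows above: resolve fixed inputs ->
# pre-process -> validate -> build the output -> track the invocation. ToyKernel and
# Doubler are hypothetical names; the toy computes eagerly, whereas the real
# TrackedKernelBase defers work to a KernelStream and reports to a tracker manager.
from abc import ABC, abstractmethod


class ToyKernel(ABC):
    def __init__(self, fixed_inputs=None, skip_tracking=False):
        self.fixed_inputs = fixed_inputs
        self.skip_tracking = skip_tracking
        self.recorded = []  # stand-in for a tracker manager

    def __call__(self, *streams, label=None):
        # fixed inputs, if configured, take the place of call-time streams
        streams = self.fixed_inputs if self.fixed_inputs is not None else streams
        streams = self.pre_processing_step(*streams)
        self.validate_inputs(*streams)
        output = self.forward(*streams)
        if not self.skip_tracking:
            self.recorded.append((label, streams))
        return output

    def pre_processing_step(self, *streams):
        return streams  # default: pass inputs through unchanged

    @abstractmethod
    def validate_inputs(self, *streams): ...

    @abstractmethod
    def forward(self, *streams): ...


class Doubler(ToyKernel):
    def validate_inputs(self, *streams):
        if len(streams) != 1:
            raise ValueError("Doubler expects exactly one input stream")

    def forward(self, *streams):
        return [x * 2 for x in streams[0]]


print(Doubler()([1, 2, 3]))  # -> [2, 4, 6]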
+ def __repr__(self): return self.__class__.__name__ diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 7e7ba7b..6b1d730 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -1,47 +1,30 @@ -from orcapod.data.datagrams import PythonDictPacket -from orcapod.data.streams import PodStream -from orcapod.data.kernels import TrackedKernelBase -from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types import SemanticTypeRegistry, default_registry -from orcapod.types import typespec_utils as tsutils -from abc import abstractmethod - import logging import sys +from abc import abstractmethod from collections.abc import Callable, Collection, Iterable, Sequence from typing import Any, Literal, cast - -from orcapod.types.typespec_utils import ( - extract_function_typespecs, - check_typespec_compatibility, -) -from orcapod.types import TypeSpec - -from orcapod.hashing.legacy_core import get_function_signature +from orcapod.data.datagrams import PythonDictPacket +from orcapod.data.kernels import TrackedKernelBase from orcapod.data.operators import Join - +from orcapod.data.streams import PodStream +from orcapod.hashing.hash_utils import get_function_signature +from orcapod.protocols import data_protocols as dp +from orcapod.protocols import hashing_protocols as hp +from orcapod.types import SemanticTypeRegistry, TypeSpec, default_registry +from orcapod.types import typespec_utils as tsutils logger = logging.getLogger(__name__) error_handling_options = Literal["raise", "ignore", "warn"] -class PodBase(TrackedKernelBase): +class ActivatablePodBase(TrackedKernelBase): """ FunctionPod is a specialized kernel that encapsulates a function to be executed on data streams. It allows for the execution of a function with a specific label and can be tracked by the system. """ - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - """ - Return the input and output typespecs for the pod. - This is used to validate the input and output streams. - """ - input_stream = self.process_and_verify_streams(*streams) - tag_typespec, _ = input_stream.types() - return tag_typespec, self.output_packet_types() - @abstractmethod def input_packet_types(self) -> TypeSpec: """ @@ -63,14 +46,25 @@ def call( def __init__( self, + fixed_input_streams: tuple[dp.Stream, ...] | None = None, error_handling: error_handling_options = "raise", label: str | None = None, **kwargs, ) -> None: - super().__init__(label=label, **kwargs) + super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self._active = True self.error_handling = error_handling + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Return the input and output typespecs for the pod. + This is used to validate the input and output streams. + """ + input_streams = self.pre_processing_step(*streams) + self.validate_inputs(*input_streams) + tag_typespec, _ = input_streams[0].types() + return tag_typespec, self.output_packet_types() + def is_active(self) -> bool: """ Check if the pod is active. If not, it will not process any packets. 
@@ -83,7 +77,22 @@ def set_active(self, active: bool) -> None: """ self._active = active - def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: + def validate_inputs(self, *streams: dp.Stream) -> None: + if len(streams) != 1: + raise ValueError( + f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" + ) + input_stream = streams[0] + _, incoming_packet_types = input_stream.types() + if not tsutils.check_typespec_compatibility( + incoming_packet_types, self.input_packet_types() + ): + # TODO: use custom exception type for better error handling + raise ValueError( + f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" + ) + + def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Prepare the incoming streams for execution in the pod. This default implementation joins all the input streams together. @@ -96,29 +105,16 @@ def process_and_verify_streams(self, *streams: dp.Stream) -> dp.Stream: for next_stream in streams[1:]: stream = Join()(stream, next_stream) combined_streams = [stream] - input_stream = combined_streams[0] - _, incoming_packet_types = input_stream.types() - if not tsutils.check_typespec_compatibility( - incoming_packet_types, self.input_packet_types() - ): - raise ValueError( - f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" - ) - return input_stream - def validate_inputs(self, *streams: dp.Stream) -> None: - self.process_and_verify_streams(*streams) + return tuple(combined_streams) - def record_pod_invocation(self, upstreams: tuple[dp.Stream, ...]) -> None: - """ - Register the pod with the upstream streams. This is used to track the pod in the system. - """ + def track_invocation(self, *streams: dp.Stream) -> None: if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_pod_invocation(self, upstreams) + self._tracker_manager.record_pod_invocation(self, streams) def forward(self, *streams: dp.Stream) -> PodStream: - input_stream = self.process_and_verify_streams(*streams) - # at this point, streams should have been joined into one + assert len(streams) == 1, "PodBase.forward expects exactly one input stream" + input_stream = streams[0] return PodStream( self, @@ -126,18 +122,6 @@ def forward(self, *streams: dp.Stream) -> PodStream: error_handling=cast(error_handling_options, self.error_handling), ) - def __call__( - self, *streams: dp.Stream, label: str | None = None, **kwargs - ) -> PodStream: - """ - Invoke the pod with a collection of streams. This will process the streams and return a PodStream. 
- """ - output_stream = self.forward(*streams, **kwargs) - - self.record_pod_invocation(output_stream.upstreams) - - return output_stream - def function_pod( output_keys: str | Collection[str] | None = None, @@ -189,7 +173,7 @@ def decorator(func) -> FunctionPod: return decorator -class FunctionPod(PodBase): +class FunctionPod(ActivatablePodBase): def __init__( self, function: dp.PodFunction, @@ -227,7 +211,7 @@ def __init__( # extract input and output types from the function signature self._input_packet_types, self._output_packet_types = ( - extract_function_typespecs( + tsutils.extract_function_typespecs( self.function, self.output_keys, input_typespec=input_typespec, @@ -250,7 +234,7 @@ def output_packet_types(self) -> TypeSpec: return self._output_packet_types def __repr__(self) -> str: - return f"FunctionPod:{self.function!r}" + return f"FunctionPod:{self.function_name}" def __str__(self) -> str: include_module = self.function.__module__ != "__main__" @@ -271,7 +255,9 @@ def call( return tag, None output_values = [] - values = self.function(**packet.as_dict(include_source=False)) + # any kernel/pod invocation happening inside the function will NOT be tracked + with self._tracker_manager.no_tracking(): + values = self.function(**packet.as_dict(include_source=False)) if len(self.output_keys) == 0: output_values = [] @@ -297,6 +283,7 @@ def call( def identity_structure(self, *streams: dp.Stream) -> Any: # construct identity structure for the function + # if function_info_extractor is available, use that but substitute the function_name if self.function_info_extractor is not None: function_info = self.function_info_extractor.extract_function_info( @@ -309,20 +296,39 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # use basic information only function_info = { "name": self.function_name, - "input_packet_types": self.input_packet_types, - "output_packet_types": self.output_packet_types, + "input_packet_types": self.input_packet_types(), + "output_packet_types": self.output_packet_types(), } function_info["output_keys"] = tuple(self.output_keys) - return ( + id_struct = ( self.__class__.__name__, function_info, - ) + streams + ) + # if streams are provided, perform pre-processing step, validate, and add the + # resulting single stream to the identity structure + if len(streams) > 0: + processed_streams = self.pre_processing_step(*streams) + self.validate_inputs(*processed_streams) + id_struct += (processed_streams[0],) + return id_struct -class StoredPod(PodBase): - def __init__(self, pod: dp.Pod, label: str | None = None, **kwargs) -> None: - super().__init__(**kwargs) + +class WrappedPod(ActivatablePodBase): + """ + A wrapper for a pod that allows it to be used as a kernel. + This class is meant to serve as a base class for other pods that need to wrap existing pods. + """ + + def __init__( + self, + pod: dp.Pod, + fixed_input_streams: tuple[dp.Stream, ...] | None = None, + label: str | None = None, + **kwargs, + ) -> None: + super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self.pod = pod def computed_label(self) -> str | None: @@ -349,7 +355,19 @@ def identity_structure(self, *streams: dp.Stream) -> Any: return self.pod.identity_structure(*streams) def __repr__(self) -> str: - return f"StoredPod({self.pod!r})" + return f"WrappedPod({self.pod!r})" def __str__(self) -> str: - return f"StoredPod:{self.pod!s}" + return f"WrappedPod:{self.pod!s}" + + +class CachedPod(WrappedPod): + """ + A pod that caches the results of the wrapped pod. 
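# Illustrative sketch (not from this patch): the content-addressed caching idea behind
# CachedPod, where results are memoized under a deterministic hash of the input packet.
# ToyCache and the sha256-over-JSON hash are stand-ins; the real pod would key on
# Packet.content_hash() and store dp.Packet results.
import hashlib
import json


class ToyCache:
    def __init__(self):
        self._store = {}

    @staticmethod
    def content_hash(packet: dict) -> str:
        # deterministic, order-independent hash of the packet content
        return hashlib.sha256(json.dumps(packet, sort_keys=True).encode()).hexdigest()

    def get_or_compute(self, packet: dict, compute):
        key = self.content_hash(packet)
        if key not in self._store:
            self._store[key] = compute(packet)  # runs only on a cache miss
        return self._store[key]


cache = ToyCache()
print(cache.get_or_compute({"x": 2}, lambda p: {"y": p["x"] ** 2}))  # -> {'y': 4}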
+ This is useful for pods that are expensive to compute and can benefit from caching. + """ + + def __init__(self, pod: dp.Pod, cache_key: str, **kwargs): + super().__init__(pod, **kwargs) + self.cache_key = cache_key + self.cache: dict[str, dp.Packet] = {} diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 456711d..5ad2a55 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,11 +1,22 @@ -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict from abc import ABC, abstractmethod +from typing import Any, ContextManager, Generator +from contextlib import contextmanager class BasicTrackerManager: def __init__(self) -> None: self._active_trackers: list[dp.Tracker] = [] + self._active = True + + def set_active(self, active: bool = True) -> None: + """ + Set the active state of the tracker manager. + This is used to enable or disable the tracker manager. + """ + self._active = active def register_tracker(self, tracker: dp.Tracker) -> None: """ @@ -28,27 +39,43 @@ def get_active_trackers(self) -> list[dp.Tracker]: Get the list of active trackers. This is used to retrieve the currently active trackers in the system. """ + if not self._active: + return [] + # Filter out inactive trackers + # This is to ensure that we only return trackers that are currently active return [t for t in self._active_trackers if t.is_active()] def record_kernel_invocation( - self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, ) -> None: """ Record the output stream of a kernel invocation in the tracker. This is used to track the computational graph and the invocations of kernels. """ for tracker in self.get_active_trackers(): - tracker.record_kernel_invocation(kernel, upstreams) + tracker.record_kernel_invocation(kernel, upstreams, label=label) def record_pod_invocation( - self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None ) -> None: """ Record the output stream of a pod invocation in the tracker. This is used to track the computational graph and the invocations of pods. """ for tracker in self.get_active_trackers(): - tracker.record_pod_invocation(pod, upstreams) + tracker.record_pod_invocation(pod, upstreams, label=label) + + @contextmanager + def no_tracking(self) -> Generator[None, Any, None]: + original_state = self._active + self.set_active(False) + try: + yield + finally: + self.set_active(original_state) class AutoRegisteringContextBasedTracker(ABC): @@ -68,12 +95,15 @@ def is_active(self) -> bool: @abstractmethod def record_kernel_invocation( - self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...] + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, ) -> None: ... @abstractmethod def record_pod_invocation( - self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...] + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None ) -> None: ... 
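# Illustrative usage (standalone, not from this patch): the save-and-restore pattern
# behind BasicTrackerManager.no_tracking() above. ToyManager is a hypothetical
# stand-in; with orcapod itself the equivalent would be the exported `no_tracking`
# context manager.
from contextlib import contextmanager


class ToyManager:
    def __init__(self):
        self._active = True
        self.events = []

    def record(self, event):
        if self._active:
            self.events.append(event)

    @contextmanager
    def no_tracking(self):
        original = self._active
        self._active = False  # suspend recording inside the block
        try:
            yield
        finally:
            self._active = original  # restore prior state even on error


m = ToyManager()
m.record("tracked")
with m.no_tracking():
    m.record("ignored")
print(m.events)  # -> ['tracked']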
def __enter__(self): @@ -84,6 +114,44 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) +class Invocation: + def __init__( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + """ + Represents an invocation of a kernel with its upstream streams. + This is used to track the computational graph and the invocations of kernels. + """ + self.kernel = kernel + self.upstreams = upstreams + self._label = label + + def parents(self) -> tuple["Invocation", ...]: + parent_invoctions = [] + for stream in self.upstreams: + if stream.source is not None: + parent_invoctions.append(Invocation(stream.source, stream.upstreams)) + return tuple(parent_invoctions) + + @property + def label(self) -> str | None: + """ + Return the label of the kernel invocation. + This is used to identify the invocation in the tracker. + """ + return self._label or self.kernel.label or self.kernel.__class__.__name__ + + def identity_structure(self) -> Any: + """ + Return a structure that represents the identity of this invocation. + This is used to uniquely identify the invocation in the tracker. + """ + return self.kernel.identity_structure(*self.upstreams) + + class GraphTracker(AutoRegisteringContextBasedTracker): """ A tracker that records the invocations of operations and generates a graph @@ -92,11 +160,20 @@ class GraphTracker(AutoRegisteringContextBasedTracker): # Thread-local storage to track active trackers - def __init__(self, tracker_manager: dp.TrackerManager | None = None) -> None: + def __init__( + self, + tracker_manager: dp.TrackerManager | None = None, + object_hasher: hp.ObjectHasher | None = None, + ) -> None: super().__init__(tracker_manager=tracker_manager) - self.kernel_to_invoked_stream_lut: dict[dp.Kernel, list[dp.Stream]] = ( - defaultdict(list) - ) + if object_hasher is None: + object_hasher = get_default_object_hasher() + self.object_hasher = object_hasher + # Dictionary to map kernels to the streams they have invoked + # This is used to track the computational graph and the invocations of kernels + self.id_to_invocation_lut: dict[str, Invocation] = {} + self.id_to_label_lut: dict[str, list[str]] = defaultdict(list) + self.id_to_pod_lut: dict[str, dp.Pod] = {} def record(self, stream: dp.Stream) -> None: assert stream.source is not None, ( @@ -106,6 +183,43 @@ def record(self, stream: dp.Stream) -> None: if stream not in stream_list: stream_list.append(stream) + def _record_kernel_and_get_id( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> str: + invocation = Invocation(kernel, upstreams, label=label) + invocation_id = self.object_hasher.hash_to_hex(invocation) + if invocation_id not in self.id_to_invocation_lut: + self.id_to_invocation_lut[invocation_id] = invocation + label = label or kernel.label or kernel.__class__.__name__ + existing_labels = self.id_to_label_lut[invocation_id] + if label not in existing_labels: + existing_labels.append(label) + return invocation_id + + def record_kernel_invocation( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + """ + Record the output stream of a kernel invocation in the tracker. + This is used to track the computational graph and the invocations of kernels. 
+ """ + self._record_kernel_and_get_id(kernel, upstreams, label) + + def record_pod_invocation( + self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None + ) -> None: + """ + Record the output stream of a pod invocation in the tracker. + """ + invocation_id = self._record_kernel_and_get_id(pod, upstreams, label) + self.id_to_pod_lut[invocation_id] = pod + def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: """ Reset the tracker and return the recorded invocations. diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 476b0a0..790b49f 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -5,10 +5,11 @@ import json from uuid import UUID from pathlib import Path -from collections.abc import Mapping, Collection +from collections.abc import Mapping, Collection, Callable import hashlib import xxhash import zlib +import inspect logger = logging.getLogger(__name__) @@ -171,7 +172,7 @@ def process_structure( # handle data types if isinstance(obj, type): logger.debug(f"Processing class/type: {obj.__name__}") - return f"type:{obj.__class__.__module__}.{obj.__class__.__name__}" + return f"type:{obj.__name__}" # For other objects, attempt to create deterministic representation only if force_hash=True class_name = obj.__class__.__name__ @@ -310,3 +311,54 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: hasher.update(data) return hasher.digest() + + +def get_function_signature( + func: Callable, + name_override: str | None = None, + include_defaults: bool = True, + include_module: bool = True, + output_names: Collection[str] | None = None, +) -> str: + """ + Get a stable string representation of a function's signature. + + Args: + func: The function to process + include_defaults: Whether to include default values + include_module: Whether to include the module name + + Returns: + A string representation of the function signature + """ + sig = inspect.signature(func) + + # Build the signature string + parts = {} + + # Add module if requested + if include_module and hasattr(func, "__module__"): + parts["module"] = func.__module__ + + # Add function name + parts["name"] = name_override or func.__name__ + + # Add parameters + param_strs = [] + for name, param in sig.parameters.items(): + param_str = str(param) + if not include_defaults and "=" in param_str: + param_str = param_str.split("=")[0].strip() + param_strs.append(param_str) + + parts["params"] = f"({', '.join(param_strs)})" + + # Add return annotation if present + if sig.return_annotation is not inspect.Signature.empty: + parts["returns"] = sig.return_annotation + + # TODO: fix return handling + fn_string = f"{parts['module'] + '.' if 'module' in parts else ''}{parts['name']}{parts['params']}" + if "returns" in parts: + fn_string = fn_string + f"-> {str(parts['returns'])}" + return fn_string diff --git a/src/orcapod/hashing/legacy_core.py b/src/orcapod/hashing/legacy_core.py index e338a89..83d172b 100644 --- a/src/orcapod/hashing/legacy_core.py +++ b/src/orcapod/hashing/legacy_core.py @@ -884,6 +884,7 @@ def get_function_signature( if sig.return_annotation is not inspect.Signature.empty: parts["returns"] = sig.return_annotation + # TODO: fix return handling fn_string = f"{parts['module'] + '.' 
if 'module' in parts else ''}{parts['name']}{parts['params']}" if "returns" in parts: fn_string = fn_string + f"-> {str(parts['returns'])}" diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 24b6861..266797b 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1,4 +1,4 @@ -from typing import Protocol +from typing import Protocol, Any, ContextManager from orcapod.types import DataValue, TypeSpec from orcapod.protocols.hashing_protocols import ContentIdentifiable from collections.abc import Iterator, Collection @@ -602,6 +602,27 @@ def validate_inputs(self, *streams: Stream) -> None: """ ... + def identity_structure(self, *streams: Stream) -> Any: + """ + Generate a unique identity structure for this kernel and/or kernel invocation. + When invoked without streams, it should return a structure + that uniquely identifies the kernel itself (e.g., class name, parameters). + When invoked with streams, it should include the identity of the streams + to distinguish different invocations of the same kernel. + + This structure is used for: + - Caching and memoization + - Debugging and error reporting + - Tracking kernel invocations in computational graphs + + Args: + *streams: Optional input streams for this invocation + + Returns: + Any: Unique identity structure (e.g., tuple of class name and stream identities) + """ + ... + class Pod(Kernel, Protocol): """ @@ -768,7 +789,7 @@ def is_active(self) -> bool: ... def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...] + self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: """ Record a kernel invocation in the computational graph. @@ -786,7 +807,9 @@ def record_kernel_invocation( """ ... - def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + def record_pod_invocation( + self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None + ) -> None: """ Record a pod invocation in the computational graph. @@ -862,7 +885,7 @@ def deregister_tracker(self, tracker: Tracker) -> None: ... def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...] + self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: """ Record a stream in all active trackers. @@ -876,7 +899,9 @@ def record_kernel_invocation( """ ... - def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None: + def record_pod_invocation( + self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None + ) -> None: """ Record a stream in all active trackers. @@ -888,3 +913,5 @@ def record_pod_invocation(self, pod: Pod, upstreams: tuple[Stream, ...]) -> None stream: The stream to record in all active trackers """ ... + + def no_tracking(self) -> ContextManager[None]: ... From 2c9c33bbb66a378ef1424d85df04518b67eb5fc8 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 12 Jul 2025 02:18:45 +0000 Subject: [PATCH 105/224] feat: implement pure immutable datagram --- src/orcapod/data/datagrams.py | 271 ++++++++++++++++++++++++++++++++-- 1 file changed, 259 insertions(+), 12 deletions(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 717b928..139e22a 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -1,5 +1,5 @@ from orcapod.types.core import DataValue, StoreValue -from typing import TypeAlias, cast +from typing import TypeAlias, cast, Self from collections.abc import Callable, Mapping, Collection from orcapod.types import TypeSpec, default_registry from orcapod.protocols import data_protocols as dp, hashing_protocols as hp @@ -8,10 +8,45 @@ from orcapod.types import schemas from orcapod.types.typespec_utils import get_typespec_from_dict import pyarrow as pa +import logging from orcapod.hashing.defaults import get_default_arrow_hasher +# Constants used for source info keys +SOURCE_INFO_PREFIX = "_source_info_" + +# TODO: move this to a separate module +def hstack_tables(*tables: pa.Table) -> pa.Table: + if len(tables) == 0: + raise ValueError("At least one table is required for horizontal stacking.") + if len(tables) == 1: + return tables[0] + + N = len(tables[0]) + for table in tables[1:]: + if len(table) != N: + raise ValueError( + "All tables must have the same number of rows for horizontal stacking." + ) + + # create combined column names + all_column_names = [] + all_columns = [] + all_names = set() + for i, table in enumerate(tables): + if overlap := set(table.column_names).intersection(all_names): + raise ValueError( + f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" + ) + all_names.update(table.column_names) + all_column_names += table.column_names + all_columns += table.columns + + return pa.Table.from_arrays(all_columns, names=all_column_names) + + +logger = logging.getLogger(__name__) # A conveniece packet-like type that defines a value that can be # converted to a packet. 
It's broader than Packet and a simple mapping # from string keys to DataValue (e.g., int, float, str) can be regarded @@ -192,6 +227,124 @@ def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): ) +class ImmutableDict(Mapping[str, DataValue]): + def __init__(self, data: Mapping[str, DataValue]): + self._data = dict(data) + + def __getitem__(self, key: str) -> DataValue: + return self._data[key] + + def __iter__(self): + return iter(self._data) + + def __len__(self) -> int: + return len(self._data) + + def __repr__(self) -> str: + return self._data.__repr__() + + def __str__(self) -> str: + return self._data.__str__() + + +# TODO: Inherit from Mapping instead to provide immutable datagram +class DictDatagram(ImmutableDict): + def __init__( + self, + data: Mapping[str, DataValue], + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + # normalize the data content and remove any source info keys + super().__init__(data) + + # combine provided typespec info with inferred typespec from content + verified_typespec = {} + if typespec is not None: + verified_typespec = dict(typespec) + # TODO: enhance get_typespec_from_dict to also use info from supplied typespec dict + inferred_typespec = get_typespec_from_dict(self) + for key in self: + if key not in verified_typespec: + verified_typespec[key] = inferred_typespec[key] + self._python_schema = schemas.PythonSchema(verified_typespec) + + # create semantic converter + if semantic_converter is not None: + if semantic_converter.python_schema != self._python_schema: + raise ValueError( + "Incompatible Python schema between packet and semantic converter: " + + str(self._python_schema) + + " vs " + + str(semantic_converter.python_schema) + ) + else: + semantic_converter = SemanticConverter.from_typespec( + self._python_schema, + semantic_type_registry or default_registry, + ) + self.semantic_converter = semantic_converter + + if arrow_hasher is None: + arrow_hasher = get_default_arrow_hasher() + self.arrow_hasher = arrow_hasher + + self._cached_table: pa.Table | None = None + self._cached_content_hash: str | None = None + + def as_table( + self, + keep_columns: Collection[str] | None = None, + drop_columns: Collection[str] | None = None, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + if keep_columns is not None and drop_columns is not None: + logger.warning( + "It is not recommended to provide both keep_columns and drop_columns. The resulting behavior may not be as expected." 
+ ) + if self._cached_table is None: + self._cached_table = ( + self.semantic_converter.from_python_store_to_arrow_table(self.as_dict()) + ) + assert self._cached_table is not None, "Cached table should not be None" + processed_table = self._cached_table + if keep_columns is not None: + processed_table = processed_table.select(list(keep_columns)) + + if drop_columns is not None: + processed_table = processed_table.drop(list(drop_columns)) + + return processed_table + + def as_dict(self) -> dict[str, DataValue]: + return dict(self) + + def content_hash( + self, + ) -> str: + if self._cached_content_hash is None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self.as_table(), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # use keys() implementation from dict + + def types(self) -> schemas.PythonSchema: + return self._python_schema.copy() + + def copy(self) -> Self: + return self.__class__( + self, + typespec=self.types(), + semantic_converter=self.semantic_converter, + arrow_hasher=self.arrow_hasher, + ) + + class PythonDictTag(dict[str, DataValue]): def as_dict(self) -> dict[str, DataValue]: return dict(self) @@ -243,6 +396,99 @@ def __repr__(self) -> str: return f"{self.as_dict()}" +class PythonDictPacket2(DictDatagram): + def __init__( + self, + data: Mapping[str, DataValue], + source_info: Mapping[str, str | None] | None = None, + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + # normalize the data content and remove any source info keys + data_only = { + k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX) + } + contained_source_info = { + k.removeprefix(SOURCE_INFO_PREFIX): v + for k, v in data.items() + if k.startswith(SOURCE_INFO_PREFIX) + } + + super().__init__( + data_only, + typespec=typespec, + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + arrow_hasher=arrow_hasher, + ) + + self._source_info = {**contained_source_info, **(source_info or {})} + self._cached_source_info_table: pa.Table | None = None + + def as_table( + self, + keep_columns: Collection[str] | None = None, + drop_columns: Collection[str] | None = None, + include_source: bool = False, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + table = super().as_table(keep_columns=keep_columns, drop_columns=drop_columns) + if include_source: + if self._cached_source_info_table is None: + source_info_data = { + f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items() + } + source_info_schema = pa.schema( + {k: pa.large_string() for k in source_info_data} + ) + self._cached_source_info_table = pa.Table.from_pylist( + [source_info_data], schema=source_info_schema + ) + assert self._cached_source_info_table is not None, ( + "Cached source info table should not be None" + ) + # subselect the corresponding _source_info as the columns present in the data table + source_info_table = self._cached_source_info_table.select( + [f"{SOURCE_INFO_PREFIX}{k}" for k in table.column_names] + ) + table = hstack_tables(table, source_info_table) + return table + + def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + dict_copy = dict(self) + if include_source: + for key, value in self.source_info().items(): + dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value + return dict_copy + + def content_hash(self) -> str: + if self._cached_content_hash is 
None: + self._cached_content_hash = self.arrow_hasher.hash_table( + self.as_table(include_source=False), prefix_hasher_id=True + ) + return self._cached_content_hash + + # use keys() implementation from dict + + def types(self) -> schemas.PythonSchema: + return self._python_schema.copy() + + def source_info(self) -> dict[str, str | None]: + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> "PythonDictPacket2": + """Return a shallow copy of the packet.""" + new_packet = PythonDictPacket2(self, self.source_info()) + new_packet._cached_table = self._cached_table + new_packet._cached_content_hash = self._cached_content_hash + new_packet._python_schema = self._python_schema.copy() + new_packet.semantic_converter = self.semantic_converter + new_packet.arrow_hasher = self.arrow_hasher + return new_packet + + class PythonDictPacket(dict[str, DataValue]): @classmethod def create_from( @@ -281,11 +527,11 @@ def __init__( post_hash_callback: Callable[[str, str], None] | None = None, ) -> None: # normalize the data content and remove any source info keys - data = {k: v for k, v in data.items() if not k.startswith("_source_info_")} + data = {k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX)} contained_source_info = { - k.removeprefix("_source_info_"): v + k.removeprefix(SOURCE_INFO_PREFIX): v for k, v in data.items() - if k.startswith("_source_info_") + if k.startswith(SOURCE_INFO_PREFIX) } super().__init__(data) @@ -345,7 +591,7 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: dict_copy = self.copy() if include_source: for key, value in self.source_info().items(): - dict_copy[f"_source_info_{key}"] = value + dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value return dict_copy def content_hash(self) -> str: @@ -401,9 +647,9 @@ def process_table_with_source_info( existing_source_info = {} for i, name in enumerate(table.column_names): - if name.startswith("_source_info_"): + if name.startswith(SOURCE_INFO_PREFIX): # Extract the base column name - base_name = name.removeprefix("_source_info_") + base_name = name.removeprefix(SOURCE_INFO_PREFIX) existing_source_info[base_name] = table.column(i) else: regular_columns.append(table.column(i)) @@ -421,7 +667,7 @@ def process_table_with_source_info( num_rows = table.num_rows for col_name in regular_names: - source_info_col_name = f"_source_info_{col_name}" + source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" # if col_name is in source_info, use that value if col_name in source_info: @@ -501,12 +747,12 @@ def __init__( ) else: self._keys: tuple[str, ...] = tuple( - [c for c in table.column_names if not c.startswith("_source_info_")] + [c for c in table.column_names if not c.startswith(SOURCE_INFO_PREFIX)] ) for k in self._keys: - if f"_source_info_{k}" not in table.column_names: + if f"{SOURCE_INFO_PREFIX}{k}" not in table.column_names: raise ValueError( - f"Source info column '_source_info_{k}' is missing in the table." + f"Source info column '{SOURCE_INFO_PREFIX}{k}' is missing in the table." ) self._arrow_table = table @@ -571,7 +817,8 @@ def keys(self) -> tuple[str, ...]: def source_info(self) -> dict[str, str | None]: if self._cached_source_info is None: self._cached_source_info = { - k: self._arrow_table[f"_source_info_{k}"][0].as_py() for k in self._keys + k: self._arrow_table[f"{SOURCE_INFO_PREFIX}{k}"][0].as_py() + for k in self._keys } return self._cached_source_info.copy() From 83e1ab8c1c3fb1595854614f4d2e2b3a9ca89b62 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 12 Jul 2025 02:18:59 +0000 Subject: [PATCH 106/224] fix: preparation of output stream in pod --- src/orcapod/data/pods.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 6b1d730..3eb1346 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -108,6 +108,13 @@ def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: return tuple(combined_streams) + def prepare_output_stream( + self, *streams: dp.Stream, label: str | None = None + ) -> dp.LiveStream: + output_stream = self.forward(*streams) + output_stream.label = label + return output_stream + def track_invocation(self, *streams: dp.Stream) -> None: if not self._skip_tracking and self._tracker_manager is not None: self._tracker_manager.record_pod_invocation(self, streams) From 8dc0353b6537ef907b588efdef283ba1794286b2 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 12 Jul 2025 02:19:50 +0000 Subject: [PATCH 107/224] feat: add feature to include content hash in arrow table --- src/orcapod/data/streams.py | 40 +++++++++++++++++++++++-- src/orcapod/protocols/data_protocols.py | 36 +++++++++++++++------- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 2454f85..223011b 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -335,11 +335,20 @@ def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: self._packet_converter.python_schema.copy(), ) - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: """ Returns the underlying table representation of the stream. This is useful for converting the stream to a table format. """ + if not include_content_hash: + return self._table + hash_column_name = ( + "_content_hash" if include_content_hash is True else include_content_hash + ) + content_hashes = [packet.content_hash() for _, packet in self.iter_packets()] + self._table = self._table.append_column( + hash_column_name, pa.array(content_hashes, type=pa.large_string()) + ) return self._table def clear_cache(self) -> None: @@ -354,6 +363,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: Iterates over the packets in the stream. Each packet is represented as a tuple of (Tag, Packet). 
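# Illustrative PyArrow snippet (not from this patch): the append_column pattern used
# above for include_content_hash, with a sha256 stand-in instead of orcapod's packet
# hasher and "_content_hash" as the default column name.
import hashlib

import pyarrow as pa

table = pa.table({"a": [1, 2], "b": ["x", "y"]})
hashes = [
    hashlib.sha256(str(sorted(row.items())).encode()).hexdigest()
    for row in table.to_pylist()
]
with_hash = table.append_column(
    "_content_hash", pa.array(hashes, type=pa.large_string())
)
print(with_hash.column_names)  # -> ['a', 'b', '_content_hash']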
""" + # TODO: make it work with table batch stream if self._cached_elements is None: self._cached_elements = [] tags = self._table.select(self._tag_columns) @@ -395,6 +405,7 @@ def __init__( self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet]] = {} self._computation_complete: bool = False self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None @property def source(self) -> dp.Pod | None: @@ -427,6 +438,7 @@ def clear_cache(self) -> None: self._cached_output_packets = {} self._computation_complete = False self._cached_output_table = None + self._cached_content_hash_column = None def refresh(self, force: bool = False) -> bool: if not self.is_current or force: @@ -442,7 +454,8 @@ def invalidate(self) -> None: self.clear_cache() self._set_modified_time(invalidate=True) - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + # TODO: note that this is likely NOT multi-thread safe self.refresh() if self._cached_output_table is None: all_tags = [] @@ -450,7 +463,8 @@ def as_table(self) -> pa.Table: for tag, packet in self.iter_packets(): # TODO: evaluate handling efficiency here all_tags.append(tag.as_dict()) - all_packets.append(packet.as_dict()) + all_packets.append(packet.as_dict(include_source=True)) + all_tags: pa.Table = pa.Table.from_pylist(all_tags) all_packets: pa.Table = pa.Table.from_pylist(all_packets) # assert that column names do not overlap @@ -466,6 +480,26 @@ def as_table(self) -> pa.Table: names=all_tags.column_names + all_packets.column_names, ) + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + return self._cached_output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) return self._cached_output_table def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 266797b..767ea0e 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -66,6 +66,21 @@ def as_dict(self) -> dict[str, DataValue]: """ ... + def content_hash(self) -> str: + """ + Return a hash of the packet content for caching/comparison. + + This hash should be deterministic and based only on the packet content, + not on source information or metadata. Used for: + - Caching computation results + - Detecting data changes + - Deduplication operations + + Returns: + str: Deterministic hash of packet content + """ + ... + class Tag(Datagram, Protocol): """ @@ -134,18 +149,16 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: """ ... - def content_hash(self) -> str: + def as_datagram(self, include_source: bool = False) -> Datagram: """ - Return a hash of the packet content for caching/comparison. + Convert the packet to a Datagram. - This hash should be deterministic and based only on the packet content, - not on source information or metadata. 
Used for: - - Caching computation results - - Detecting data changes - - Deduplication operations + Args: + include_source: If True, source information is included in the datagram + for debugging and lineage tracking Returns: - str: Deterministic hash of packet content + Datagram: Datagram representation of packet data """ ... @@ -382,7 +395,7 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: """ ... - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: """ Convert the entire stream to a PyArrow Table. @@ -390,8 +403,9 @@ def as_table(self) -> pa.Table: analysis and processing. This operation may be expensive for large streams or live streams that need computation. - Tag fields are prefixed with "_tag_" to avoid naming conflicts - with packet fields. + If include_content_hash is True, an additional column called "_content_hash" + containing the content hash of each packet is included. If include_content_hash + is a string, it is used as the name of the content hash column. Returns: pa.Table: Complete stream data as a PyArrow Table From 47726a8a2602ac194ec621e0bb35d8df11097331 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 12 Jul 2025 02:47:43 +0000 Subject: [PATCH 108/224] doc: add comprehensive documentation to datagrams --- src/orcapod/data/datagrams.py | 360 +++++++++++++++++++++++++++++++++- 1 file changed, 359 insertions(+), 1 deletion(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 139e22a..f32ee89 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -1,3 +1,21 @@ +""" +Data structures and utilities for working with datagrams in OrcaPod. + +This module provides classes and functions for handling packet-like data structures +that can represent data in various formats (Python dicts, Arrow tables, etc.) while +maintaining type information, source metadata, and semantic type conversion capability. + +Key classes: +- SemanticConverter: Converts between different data representations. Intended for internal use. +- DictDatagram: Immutable dict-based data structure +- PythonDictPacket: Python dict-based packet with source info +- ArrowPacket: Arrow table-based packet implementation +- PythonDictTag/ArrowTag: Tag implementations for data identification + +The module also provides utilities for schema validation, table operations, +and type conversions between semantic stores, Python stores, and Arrow tables. +""" + from orcapod.types.core import DataValue, StoreValue from typing import TypeAlias, cast, Self from collections.abc import Callable, Mapping, Collection @@ -18,6 +36,21 @@ # TODO: move this to a separate module def hstack_tables(*tables: pa.Table) -> pa.Table: + """ + Horizontally stack multiple PyArrow tables by concatenating their columns. + + All input tables must have the same number of rows and unique column names. + + Args: + *tables: Variable number of PyArrow tables to stack horizontally + + Returns: + Combined PyArrow table with all columns from input tables + + Raises: + ValueError: If no tables provided, tables have different row counts, + or duplicate column names are found + """ if len(tables) == 0: raise ValueError("At least one table is required for horizontal stacking.") if len(tables) == 1: @@ -122,11 +155,29 @@ def check_arrow_schema_compatibility( class SemanticConverter: + """ + Converts data between different representations (Python, semantic stores, Arrow tables). 
+ + This class handles the conversion between Python data structures, semantic stores + (which use storage-optimized types), and Arrow tables while maintaining type + information and semantic type metadata. + """ + @staticmethod def prepare_handler( semantic_schema: schemas.SemanticSchema, semantic_type_registry: SemanticTypeRegistry, ) -> dict[str, TypeHandler]: + """ + Prepare type handlers for semantic type conversion. + + Args: + semantic_schema: Schema containing semantic type information + semantic_type_registry: Registry for looking up type handlers + + Returns: + Dictionary mapping field names to their type handlers + """ handler_lut = {} for key, (_, semantic_type) in semantic_schema.items(): if semantic_type is None: @@ -140,6 +191,16 @@ def prepare_handler( def from_typespec( cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry ) -> "SemanticConverter": + """ + Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). + + Args: + typespec: Type specification dictionary + semantic_type_registry: Registry for semantic type lookup + + Returns: + New SemanticConverter instance + """ semantic_schema = schemas.from_typespec_to_semantic_schema( typespec, semantic_type_registry ) @@ -151,6 +212,16 @@ def from_typespec( def from_arrow_schema( cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry ) -> "SemanticConverter": + """ + Create a SemanticConverter from an Arrow schema. + + Args: + arrow_schema: PyArrow schema with semantic type metadata + semantic_type_registry: Registry for semantic type lookup + + Returns: + New SemanticConverter instance + """ semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) python_schema = schemas.from_semantic_schema_to_python_schema( semantic_schema, semantic_type_registry=semantic_type_registry @@ -164,6 +235,15 @@ def __init__( semantic_schema: schemas.SemanticSchema, handler_lut: dict[str, TypeHandler] | None = None, ): + """ + Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. + Use class methods like `from_arrow_schema` or `from_typespec` instead. + + Args: + python_schema: Schema for Python data types + semantic_schema: Schema for semantic types + handler_lut: Optional dictionary of type handlers for conversion + """ self.python_schema = python_schema self.semantic_schema = semantic_schema self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( @@ -176,6 +256,15 @@ def __init__( def from_semantic_store_to_python_store( self, semantic_store: SemanticStore ) -> PythonStore: + """ + Convert a semantic store to a Python store. + + Args: + semantic_store: Store (dict) with data stored in semantic (storage-optimized) types + + Returns: + Store with Python native types + """ python_store = dict(semantic_store) for key, handler in self.handler_lut.items(): python_store[key] = handler.storage_to_python(semantic_store[key]) @@ -184,6 +273,15 @@ def from_semantic_store_to_python_store( def from_python_store_to_semantic_store( self, python_store: PythonStore ) -> SemanticStore: + """ + Convert a Python store to a semantic store. 
+ + Args: + python_store: Store with Python native types + + Returns: + Store with semantic (storage-optimized) types + """ semantic_store = dict(python_store) for key, handler in self.handler_lut.items(): semantic_store[key] = handler.python_to_storage(python_store[key]) @@ -210,13 +308,22 @@ def from_arrow_table_to_semantic_stores( def from_arrow_table_to_python_stores( self, arrow_table: pa.Table ) -> list[PythonStore]: - """Convert an Arrow table to a Python store.""" + """Convert an Arrow table to a list of Python stores.""" return [ self.from_semantic_store_to_python_store(semantic_store) for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) ] def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): + """ + Verify that an Arrow schema is compatible with the expected schema. + + Args: + arrow_schema: Schema to verify + + Raises: + ValueError: If schemas are incompatible + """ compatible, errors = check_arrow_schema_compatibility( arrow_schema, self.arrow_schema ) @@ -228,6 +335,17 @@ def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): class ImmutableDict(Mapping[str, DataValue]): + """ + An immutable dictionary-like container for DataValues. + + Provides a read-only view of a dictionary mapping strings to DataValues, + implementing the Mapping protocol for compatibility with dict-like operations. + + Initialize with data from a mapping. + Args: + data: Source mapping to copy data from + """ + def __init__(self, data: Mapping[str, DataValue]): self._data = dict(data) @@ -249,6 +367,24 @@ def __str__(self) -> str: # TODO: Inherit from Mapping instead to provide immutable datagram class DictDatagram(ImmutableDict): + """ + An immutable datagram implementation using a dictionary backend. + + Extends ImmutableDict to provide additional functionality for type handling, + semantic conversion, and Arrow table representation while maintaining + immutability of the underlying data. + + + Initialize DictDatagram with data and optional type information. + + Args: + data: Source data mapping + typespec: Optional type specification for fields + semantic_converter: Optional converter for semantic types + semantic_type_registry: Registry for semantic type lookup + arrow_hasher: Optional hasher for Arrow table content + """ + def __init__( self, data: Mapping[str, DataValue], @@ -319,11 +455,18 @@ def as_table( return processed_table def as_dict(self) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" return dict(self) def content_hash( self, ) -> str: + """ + Calculate and return content hash of the datagram. + + Returns: + Hash string of the datagram content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self.as_table(), @@ -334,9 +477,11 @@ def content_hash( # use keys() implementation from dict def types(self) -> schemas.PythonSchema: + """Return copy of the Python schema.""" return self._python_schema.copy() def copy(self) -> Self: + """Return a copy of the datagram.""" return self.__class__( self, typespec=self.types(), @@ -346,18 +491,47 @@ def copy(self) -> Self: class PythonDictTag(dict[str, DataValue]): + """ + A simple tag implementation using Python dictionary. + + Represents a tag (metadata) as a dictionary that can be converted + to different representations like Arrow tables. 
+ """ + def as_dict(self) -> dict[str, DataValue]: + """Return dictionary representation.""" return dict(self) def as_table(self) -> pa.Table: + """Convert to Arrow table representation.""" return pa.Table.from_pylist([self]) def types(self) -> schemas.PythonSchema: + """ + Return Python schema (basic implementation). + + Note: This is a simplified implementation that assumes all values are strings. + """ # TODO: provide correct implementation return schemas.PythonSchema({k: str for k in self.keys()}) class ArrowTag: + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. + + Args: + table: Single-row Arrow table representing the tag + + Raises: + ValueError: If table doesn't contain exactly one row + """ + def __init__(self, table: pa.Table) -> None: self.table = table if len(table) != 1: @@ -369,9 +543,16 @@ def __init__(self, table: pa.Table) -> None: self._cached_python_dict: dict[str, DataValue] | None = None def keys(self) -> tuple[str, ...]: + """Return column names as a tuple.""" return tuple(self.table.column_names) def types(self) -> schemas.PythonSchema: + """ + Return Python schema derived from Arrow schema. + + Returns: + TypeSpec information returned as PythonSchema. + """ if self._cached_python_schema is None: self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema( self.table.schema @@ -379,6 +560,12 @@ def types(self) -> schemas.PythonSchema: return self._cached_python_schema.copy() def as_dict(self) -> dict[str, DataValue]: + """ + Convert to Python dictionary representation. + + Returns: + Dictionary with tag data + """ if self._cached_python_dict is None: self._cached_python_dict = cast( dict[str, DataValue], self.table.to_pylist()[0] @@ -386,17 +573,38 @@ def as_dict(self) -> dict[str, DataValue]: return self._cached_python_dict def as_table(self) -> pa.Table: + """Return the underlying Arrow table.""" return self.table def clear_cache(self) -> None: + """Clear cached Python representations.""" self._cached_python_schema = None self._cached_python_dict = None def __repr__(self) -> str: + """Return string representation.""" return f"{self.as_dict()}" class PythonDictPacket2(DictDatagram): + """ + Enhanced packet implementation with source information support. + + Extends DictDatagram to include source information tracking and + enhanced table conversion capabilities that can include or exclude + source metadata. + + Initialize packet with data and optional source information. + + Args: + data: Primary data content + source_info: Optional mapping of field names to source information + typespec: Optional type specification + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types. Defaults to system default registry. + arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. + """ + def __init__( self, data: Mapping[str, DataValue], @@ -457,6 +665,15 @@ def as_table( return table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Return dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ dict_copy = dict(self) if include_source: for key, value in self.source_info().items(): @@ -464,6 +681,12 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: return dict_copy def content_hash(self) -> str: + """ + Calculate content hash excluding source information. + + Returns: + Hash string of the packet content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self.as_table(include_source=False), prefix_hasher_id=True @@ -473,9 +696,19 @@ def content_hash(self) -> str: # use keys() implementation from dict def types(self) -> schemas.PythonSchema: + """ + Returns: + Packet type information as PythonSchema (dict mapping field names to types). + """ return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Dictionary mapping field names to their source info + """ return {key: self._source_info.get(key, None) for key in self.keys()} def copy(self) -> "PythonDictPacket2": @@ -490,6 +723,27 @@ def copy(self) -> "PythonDictPacket2": class PythonDictPacket(dict[str, DataValue]): + """ + Dictionary-based Packet with source tracking and hashing. + + A dictionary-based packet that maintains source information, supports + type specifications, and provides content hashing with optional callbacks. + Includes comprehensive conversion capabilities to Arrow tables. + + Initialize packet with comprehensive configuration options. + + Args: + data: Primary packet data + source_info: Optional source information mapping + typespec: Optional type specification + finger_print: Optional fingerprint for tracking + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + + """ + @classmethod def create_from( cls, @@ -500,6 +754,20 @@ def create_from( arrow_hasher: hp.ArrowHasher | None = None, post_hash_callback: Callable[[str, str], None] | None = None, ) -> "PythonDictPacket": + """ + Create a PythonDictPacket from another packet object. + + Args: + object: Source packet to copy from + finger_print: Optional fingerprint identifier + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + + Returns: + New PythonDictPacket instance + """ if isinstance(object, PythonDictPacket): return object.copy() @@ -588,6 +856,15 @@ def as_table(self, include_source: bool = False) -> pa.Table: return self._cached_table.select(list(self.keys())) def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Return dictionary representation. + + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ dict_copy = self.copy() if include_source: for key, value in self.source_info().items(): @@ -595,6 +872,15 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: return dict_copy def content_hash(self) -> str: + """ + Calculate and return content hash. + + Computes hash of packet data content (thus excluding source info) and + optionally triggers post-hash callback if configured. 
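A sketch of the PythonDictPacket described above. Only its docstring and create_from appear in this hunk, so the constructor keywords below are assumed from that docstring, and the post-hash callback arguments are assumed from its Callable[[str, str], None] annotation:

from orcapod.data.datagrams import PythonDictPacket

def on_hashed(first: str, second: str) -> None:
    # the exact meaning of the two strings is an assumption (e.g. an identifier and the hash)
    print("content hashed:", first, second)

packet = PythonDictPacket(
    {"recording": "/data/session1.nwb"},
    source_info={"recording": "source:acquisition"},  # illustrative
    post_hash_callback=on_hashed,
)
digest = packet.content_hash()   # computed without source info; may invoke on_hashed

adapted = PythonDictPacket.create_from(packet)   # returns a copy when given a PythonDictPacket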
+ + Returns: + Hash string of the packet content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self.as_table(include_source=False), prefix_hasher_id=True @@ -606,9 +892,16 @@ def content_hash(self) -> str: # use keys() implementation from dict def types(self) -> schemas.PythonSchema: + """Return packet data type information as PythonSchema (dict mapping field names to types).""" return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Dictionary mapping field names to their source info + """ return {key: self._source_info.get(key, None) for key in self.keys()} def copy(self) -> "PythonDictPacket": @@ -697,6 +990,30 @@ def process_table_with_source_info( class ArrowPacket: + """ + Arrow table-based packet implementation with comprehensive features. + + A packet implementation that uses Arrow tables as the primary storage format, + providing efficient memory usage and columnar data operations while supporting + source information tracking and content hashing. + + + Initialize ArrowPacket with Arrow table and configuration. + + Args: + table: Single-row Arrow table representing the packet + source_info: Optional source information mapping + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + finger_print: Optional fingerprint for tracking + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + skip_source_info_extraction: Whether to skip source info processing + + Raises: + ValueError: If table doesn't contain exactly one row + """ + @classmethod def create_from( cls, @@ -707,6 +1024,20 @@ def create_from( arrow_hasher: hp.ArrowHasher | None = None, post_hash_callback: Callable[[str, str], None] | None = None, ) -> "ArrowPacket": + """ + Create an ArrowPacket from another packet object. + + Args: + object: Source packet to copy from + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + finger_print: Optional fingerprint identifier + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + + Returns: + New ArrowPacket instance + """ if isinstance(object, ArrowPacket): return object.copy() @@ -787,6 +1118,15 @@ def as_table(self, include_source: bool = False) -> pa.Table: return base_table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + """ + Convert to dictionary representation. + + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ if self._cached_python_packet is None: self._cached_python_packet = ( self.semantic_converter.from_arrow_table_to_python_stores( @@ -799,6 +1139,15 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: return {k: self._cached_python_packet[k] for k in self._keys} def content_hash(self) -> str: + """ + Calculate and return content hash. + + Computes hash of the Arrow table content and optionally + triggers post-hash callback if configured. 
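A sketch of constructing the ArrowPacket described above from a single-row Arrow table; per its docstring a table with more or fewer rows raises ValueError, and all keyword arguments are left at their assumed defaults:

import pyarrow as pa
from orcapod.data.datagrams import ArrowPacket

table = pa.Table.from_pylist([{"result_path": "/results/run_42.parquet"}])  # exactly one row
packet = ArrowPacket(table)

packet.as_dict()                        # values converted back to Python types
packet.content_hash()                   # hash of the underlying Arrow table
packet.as_table(include_source=True)    # keep source-info columns if present
copy = ArrowPacket.create_from(packet)  # returns a copy when given an ArrowPacket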
+ + Returns: + Hash string of the packet content + """ if self._cached_content_hash is None: self._cached_content_hash = self.arrow_hasher.hash_table( self._arrow_table, prefix_hasher_id=True @@ -808,6 +1157,7 @@ def content_hash(self) -> str: return self._cached_content_hash def types(self) -> schemas.PythonSchema: + """Return packet data type information as PythonSchema (dict mapping field names to types).""" return self.semantic_converter.python_schema.copy() def keys(self) -> tuple[str, ...]: @@ -815,6 +1165,12 @@ def keys(self) -> tuple[str, ...]: return tuple(self._keys) def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Copy of the dictionary mapping field names to their source info + """ if self._cached_source_info is None: self._cached_source_info = { k: self._arrow_table[f"{SOURCE_INFO_PREFIX}{k}"][0].as_py() @@ -846,8 +1202,10 @@ def copy(self) -> "ArrowPacket": return new_packet def __repr__(self) -> str: + """Return string representation.""" return f"{self.as_dict(include_source=False)}" # a batch is a tuple of a tag and a list of packets Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] +"""Type alias for a batch: a tuple containing a tag and collection of packets.""" From 251a685a874bcd4eafc2ee0579472a8ff0c277df Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:29:13 +0000 Subject: [PATCH 109/224] refactor: remove unused datagram base --- src/orcapod/data/base.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index 5082c9b..f8788e1 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -10,35 +10,6 @@ logger = logging.getLogger(__name__) -class DatagramBase(ABC): - """ - Base class for data packets that can be processed in a pipeline. - This class provides a common interface for data packets, allowing them to be processed - and transformed in a consistent manner. - """ - - @property - @abstractmethod - def typespec(self) -> TypeSpec: - """Return the type specification of the data packet.""" - pass - - @abstractmethod - def keys(self) -> tuple[str, ...]: - """Return the keys of the data packet.""" - pass - - @abstractmethod - def as_table(self) -> pa.Table: - """Convert the data packet to a PyArrow Table.""" - pass - - @abstractmethod - def as_dict(self) -> dict[str, Any]: - """Convert the data packet to a dictionary.""" - pass - - class LabeledContentIdentifiableBase: """ Base class for content-identifiable objects. From 77b6f2115cbc97fcbc0871461bde0687b8049a5c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:29:36 +0000 Subject: [PATCH 110/224] refactor: combine pre-foward step into one for simplicity --- src/orcapod/data/kernels.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 538cf11..e876916 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -39,10 +39,13 @@ def __init__( self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self.fixed_input_streams = fixed_input_streams - def resolve_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Resolve the input streams for the kernel. If the kernel has fixed input streams, - it returns those. Otherwise, it returns the provided streams. 
+ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing + on the input streams before the main computation. This is useful if you need to modify the input streams + or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the + pre-processing step will be tracked separately from the main computation in forward. + By default, it returns the input streams unchanged. """ if self.fixed_input_streams is not None: if len(streams) != 0: @@ -52,16 +55,6 @@ def resolve_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: return self.fixed_input_streams return streams - def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: - """ - Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing - on the input streams before the main computation. This is useful if you need to modify the input streams - or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the - pre-processing step will be tracked separately from the main computation in forward. - By default, it returns the input streams unchanged. - """ - return streams - @abstractmethod def validate_inputs(self, *streams: dp.Stream) -> None: ... @@ -86,8 +79,7 @@ def track_invocation(self, *streams: dp.Stream) -> None: def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - streams = self.resolve_input_streams(*streams) - processed_streams = self.pre_processing_step(*streams) + processed_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*processed_streams) output_stream = self.prepare_output_stream(*processed_streams, label=label) self.track_invocation(*processed_streams) From 57d59d77d0e310fd76649c556c2b6499a6737028 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:30:04 +0000 Subject: [PATCH 111/224] refactor: adopt the new method signature for pre-forward step --- src/orcapod/data/pods.py | 43 +++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 3eb1346..a6b0d0f 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -60,7 +60,7 @@ def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: Return the input and output typespecs for the pod. This is used to validate the input and output streams. """ - input_streams = self.pre_processing_step(*streams) + input_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*input_streams) tag_typespec, _ = input_streams[0].types() return tag_typespec, self.output_packet_types() @@ -92,21 +92,40 @@ def validate_inputs(self, *streams: dp.Stream) -> None: f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" ) - def pre_processing_step(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + @staticmethod + def _join_streams(*streams: dp.Stream) -> dp.Stream: + if not streams: + raise ValueError("No streams provided for joining") + # Join the streams using a suitable join strategy + if len(streams) == 1: + return streams[0] + + joined_stream = streams[0] + for next_stream in streams[1:]: + joined_stream = Join()(joined_stream, next_stream) + return joined_stream + + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Prepare the incoming streams for execution in the pod. 
This default implementation - joins all the input streams together. + Prepare the incoming streams for execution in the pod. If fixed_input_streams are present, + they will be used as the input streams and the newly provided streams would be used to + restrict (semijoin) the fixed streams. + Otherwise, the join of the provided streams will be returned. """ # if multiple streams are provided, join them # otherwise, return as is - combined_streams = list(streams) - if len(streams) > 1: - stream = streams[0] - for next_stream in streams[1:]: - stream = Join()(stream, next_stream) - combined_streams = [stream] - - return tuple(combined_streams) + if self.fixed_input_streams is not None and len(streams) > 0: + output_stream = self._join_streams(*self.fixed_input_streams) + if len(streams) > 0: + restrict_stream = self._join_streams(*streams) + # output_stream = SemiJoin()(output_stream, restrict_stream) + else: + if len(streams) == 0: + raise ValueError( + f"{self.__class__.__name__} expects at least one input stream" + ) + output_stream = self._join_streams(*streams) + return (output_stream,) def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None From eb08084c36ba8f03f7b5d0602eed2368f53d4fc9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 14 Jul 2025 21:30:28 +0000 Subject: [PATCH 112/224] feat: add non-zero input operator --- src/orcapod/data/operators.py | 155 +++++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 2 deletions(-) diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 3db4949..95667d2 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -14,7 +14,95 @@ class InputValidationError(Exception): """ -class BinaryOperator(TrackedKernelBase): +class Operator(TrackedKernelBase): + """ + Base class for all operators. + Operators are a special type of kernel that can be used to perform operations on streams. + They are defined as a callable that takes a (possibly empty) collection of streams as the input + and returns a new stream as output (note that output stream is always singular). + """ + + +class NonZeroInputOperator(Operator): + """ + Operators that work with at least one input stream. + This is useful for operators that can take a variable number of (but at least one ) input streams, + such as joins, unions, etc. + """ + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.verify_non_zero_input(*streams) + return self.op_validate_inputs(*streams) + + @abstractmethod + def op_validate_inputs(self, *streams: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... + + def verify_non_zero_input( + self, + *streams: dp.Stream, + ) -> None: + """ + Check that the inputs to the variable inputs operator are valid. + This method is called before the forward method to ensure that the inputs are valid. + """ + if len(streams) == 0: + raise ValueError( + f"Operator {self.__class__.__name__} requires at least one input stream." + ) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for variable inputs operators. + It expects at least one stream as input. 
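Stepping back to the pod-side pre-processing in the previous hunk: because _join_streams folds Join() pairwise over its inputs, invoking a pod on several streams is roughly equivalent to joining them up front. A hypothetical sketch (my_pod and the streams are placeholders, not names from this patch):

from orcapod.data.operators import Join

# passing multiple streams to a pod ...
result = my_pod(stream_a, stream_b, stream_c)

# ... is roughly the same as pre-joining them the way _join_streams does
joined = stream_a
for next_stream in (stream_b, stream_c):
    joined = Join()(joined, next_stream)
result_equivalent = my_pod(joined)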
+ """ + return self.op_forward(*streams) + + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + self.validate_inputs(*streams) + return self.op_output_types(*streams) + + def identity_structure(self, *streams: dp.Stream) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. + """ + if len(streams) > 0: + self.verify_non_zero_input(*streams) + return self.op_identity_structure(*streams) + + @abstractmethod + def op_forward(self, *streams: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the non-zero input operator. + It takes variable number of streams as input and returns a new stream as output. + """ + ... + + @abstractmethod + def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes at least one stream as input and returns a tuple of typespecs. + """ + ... + + @abstractmethod + def op_identity_structure(self, *streams: dp.Stream) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes zero or more streams as input and returns a tuple containing the operator name and a set of streams. + If zero, it should return identity of the operator itself. + If one or more, it should return a identity structure approrpiate for the operator invoked on the given streams. + """ + ... + + +class BinaryOperator(Operator): """ Base class for all operators. """ @@ -93,7 +181,7 @@ def op_identity_structure(self, *streams: dp.Stream) -> Any: ... -class Join(BinaryOperator): +class BinaryJoin(BinaryOperator): def op_identity_structure(self, *streams: dp.Stream) -> Any: # Join does not depend on the order of the streams -- convert it onto a set id_struct = (self.__class__.__name__,) @@ -154,3 +242,66 @@ def op_validate_inputs( def __repr__(self) -> str: return "Join()" + + +class Join(NonZeroInputOperator): + def op_identity_structure(self, *streams: dp.Stream) -> Any: + # Join does not depend on the order of the streams -- convert it onto a set + id_struct = (self.__class__.__name__,) + if len(streams) > 0: + id_struct += (set(streams),) + return id_struct + + def op_forward(self, *streams: dp.Stream) -> ImmutableTableStream: + """ + Joins two streams together based on their tags. + The resulting stream will contain all the tags from both streams. 
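To make the NonZeroInputOperator op_* template methods above concrete, a hypothetical pass-through operator that keeps only its first input stream; this is an illustrative sketch reusing the names already imported in operators.py, not an operator defined by this patch:

class KeepFirst(NonZeroInputOperator):
    """Hypothetical operator: forwards the first input stream unchanged."""

    def op_validate_inputs(self, *streams: dp.Stream) -> None:
        pass  # any non-empty collection of streams is acceptable

    def op_forward(self, *streams: dp.Stream) -> dp.Stream:
        return streams[0]

    def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]:
        return streams[0].types()

    def op_identity_structure(self, *streams: dp.Stream) -> Any:
        id_struct = (self.__class__.__name__,)
        if streams:
            id_struct += (streams,)
        return id_struct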
+ """ + + all_tag_typespecs = [] + all_packet_typespecs = [] + + for stream in streams: + tag_typespec, packet_typespec = stream.types() + all_tag_typespecs.append(tag_typespec) + all_packet_typespecs.append(packet_typespec) + + common_tag_keys = tuple(intersection_typespecs(*all_tag_typespecs).keys()) + joined_tag_keys = tuple(union_typespecs(*all_tag_typespecs).keys()) + + # performing a check to ensure that packets are compatible + union_typespecs(*all_packet_typespecs) + + joined_table = left_stream.as_table().join( + right_stream.as_table(), + keys=common_tag_keys, + join_type="inner", + ) + + return ImmutableTableStream( + joined_table, + tag_columns=tuple(joined_tag_keys), + source=self, + upstreams=streams, + ) + + def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + left_stream, right_stream = streams + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) + joined_packet_typespec = union_typespecs( + left_packet_typespec, right_packet_typespec + ) + return joined_tag_typespec, joined_packet_typespec + + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + try: + self.op_output_types(left_stream, right_stream) + except Exception as e: + raise InputValidationError(f"Input streams are not compatible: {e}") + + def __repr__(self) -> str: + return "Join()" From 65289bba9bf72ca8148c7be7124448fc55cacb64 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 18 Jul 2025 07:26:07 +0000 Subject: [PATCH 113/224] wip: major refactoring of package structure --- src/orcapod/core/pod.py | 2 +- src/orcapod/data/datagram_store.py | 890 +++++++++++ src/orcapod/data/datagrams.py | 1314 +++++++---------- src/orcapod/data/operators.py | 10 + src/orcapod/data/pods.py | 80 +- src/orcapod/data/streams.py | 77 +- src/orcapod/hashing/arrow_hashers.py | 13 +- src/orcapod/hashing/semantic_type_hashers.py | 22 +- src/orcapod/hashing/versioned_hashers.py | 31 +- src/orcapod/pipeline/nodes.py | 5 +- src/orcapod/protocols/hashing_protocols.py | 5 + src/orcapod/protocols/store_protocols.py | 34 + src/orcapod/stores/__init__.py | 8 +- src/orcapod/stores/delta_lake_stores.py | 861 +++++++++++ .../delta_table_arrow_data_store.py | 0 .../stores/{ => legacy}/dict_data_stores.py | 2 +- .../{ => legacy}/dict_transfer_data_store.py | 2 +- .../legacy_arrow_data_stores.py} | 0 .../{ => legacy}/safe_dir_data_store.py | 2 +- src/orcapod/stores/{ => legacy}/types.py | 0 src/orcapod/types/__init__.py | 20 +- src/orcapod/types/arrow_utils.py | 10 + src/orcapod/types/core.py | 68 +- src/orcapod/types/defaults.py | 51 + src/orcapod/types/{ => legacy}/packets.py | 2 +- .../{ => legacy}/semantic_type_handlers.py | 0 .../{ => legacy}/semantic_type_registry.py | 2 +- src/orcapod/types/schemas.py | 493 +++---- src/orcapod/types/semantic_converter.py | 86 ++ src/orcapod/types/semantic_types.py | 569 +++++++ src/orcapod/types/typespec_utils.py | 68 +- src/orcapod/utils/arrow_utils.py | 126 ++ src/orcapod/utils/object_spec.py | 41 +- tests/test_store/test_dir_data_store.py | 2 +- tests/test_store/test_integration.py | 2 +- tests/test_store/test_noop_data_store.py | 2 +- tests/test_store/test_transfer_data_store.py | 4 +- 37 files changed, 3647 insertions(+), 1257 deletions(-) create mode 100644 src/orcapod/data/datagram_store.py create mode 100644 src/orcapod/stores/delta_lake_stores.py rename src/orcapod/stores/{ 
=> legacy}/delta_table_arrow_data_store.py (100%) rename src/orcapod/stores/{ => legacy}/dict_data_stores.py (99%) rename src/orcapod/stores/{ => legacy}/dict_transfer_data_store.py (97%) rename src/orcapod/stores/{arrow_data_stores.py => legacy/legacy_arrow_data_stores.py} (100%) rename src/orcapod/stores/{ => legacy}/safe_dir_data_store.py (99%) rename src/orcapod/stores/{ => legacy}/types.py (100%) create mode 100644 src/orcapod/types/arrow_utils.py create mode 100644 src/orcapod/types/defaults.py rename src/orcapod/types/{ => legacy}/packets.py (99%) rename src/orcapod/types/{ => legacy}/semantic_type_handlers.py (100%) rename src/orcapod/types/{ => legacy}/semantic_type_registry.py (99%) create mode 100644 src/orcapod/types/semantic_converter.py create mode 100644 src/orcapod/types/semantic_types.py create mode 100644 src/orcapod/utils/arrow_utils.py diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py index 92d8568..3ca7d6b 100644 --- a/src/orcapod/core/pod.py +++ b/src/orcapod/core/pod.py @@ -12,7 +12,7 @@ extract_function_typespecs, check_typespec_compatibility, ) -from orcapod.types.packets import PacketConverter +from orcapod.types.legacy.packets import PacketConverter from orcapod.hashing import ( FunctionInfoExtractor, diff --git a/src/orcapod/data/datagram_store.py b/src/orcapod/data/datagram_store.py new file mode 100644 index 0000000..72d082c --- /dev/null +++ b/src/orcapod/data/datagram_store.py @@ -0,0 +1,890 @@ +# class DatagramStore(Protocol): +# def record_datagram( +# self, +# record_path: tuple[str, ...], +# datagram: dp.Datagram, +# ignore_duplicates: bool = False, +# ) -> str | None: ... + +# def record_stream( +# self, +# record_path: tuple[str, ...], +# stream: dp.Stream, +# ignore_duplicates: bool = False, +# ) -> None: ... + +# def get_recorded_datagram( +# self, +# record_path: tuple[str, ...], +# record_id: str, +# ) -> dp.Datagram | None: ... + +# def get_all_records(self, record_path: tuple[str, ...]) -> dp.Stream | None: +# """Retrieve all records for a given path as a stream.""" +# ... + +# def get_all_records_as_polars( +# self, record_path: tuple[str, ...] +# ) -> pl.DataFrame | None: +# """Retrieve all records for a given path as a Polars stream.""" +# ... + +# def get_records_by_ids( +# self, +# record_path: tuple[str, ...], +# entry_ids: Collection[str], +# add_entry_id_column: bool | str = False, +# preseve_input_order: bool = False, +# ) -> dp.Stream: ... + + +import pyarrow as pa +import pyarrow.compute as pc +import pyarrow.dataset as ds +import polars as pl +from pathlib import Path +from typing import Any +import logging +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError +from collections import defaultdict +from orcapod.data.datagrams import ArrowDatagram, SemanticTypeRegistry +from orcapod.data.streams import ImmutableTableStream +from orcapod.hashing import get_default_arrow_hasher +from orcapod.hashing.types import ArrowHasher +from orcapod.protocols import data_protocols as dp +from orcapod.types import default_registry + + +# Module-level logger +logger = logging.getLogger(__name__) + + +class DeltaTableArrowStore: + """ + Delta Table-based Arrow data store with flexible hierarchical path support and schema preservation. 
+ + Uses tuple-based source paths for robust parameter handling: + - ("source_name", "source_id") -> source_name/source_id/ + - ("org", "project", "dataset") -> org/project/dataset/ + - ("year", "month", "day", "experiment") -> year/month/day/experiment/ + """ + + def __init__( + self, + base_path: str | Path, + duplicate_entry_behavior: str = "error", + create_base_path: bool = True, + max_hierarchy_depth: int = 10, + batch_size: int = 100, + ): + """ + Initialize the DeltaTableArrowDataStore. + + Args: + base_path: Base directory path where Delta tables will be stored + duplicate_entry_behavior: How to handle duplicate entry_ids: + - 'error': Raise ValueError when entry_id already exists + - 'overwrite': Replace existing entry with new data + create_base_path: Whether to create the base path if it doesn't exist + max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) + batch_size: Number of records to batch before writing to Delta table + auto_flush_interval: Time in seconds to auto-flush pending batches (0 to disable) + """ + # Validate duplicate behavior + if duplicate_entry_behavior not in ["error", "overwrite"]: + raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") + + self.duplicate_entry_behavior = duplicate_entry_behavior + self.base_path = Path(base_path) + self.max_hierarchy_depth = max_hierarchy_depth + self.batch_size = batch_size + + if create_base_path: + self.base_path.mkdir(parents=True, exist_ok=True) + elif not self.base_path.exists(): + raise ValueError( + f"Base path {self.base_path} does not exist and create_base_path=False" + ) + + # Cache for Delta tables to avoid repeated initialization + self._delta_table_cache: dict[str, DeltaTable] = {} + + # Batch management + self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) + + logger.info( + f"Initialized DeltaTableArrowDataStore at {self.base_path} " + f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " + f"batch_size={batch_size}, as" + ) + + def flush(self) -> None: + """ + Flush all pending batches immediately. + + This method is called to ensure all pending data is written to the Delta tables. + """ + try: + self.flush_all_batches() + except Exception as e: + logger.error(f"Error during flush: {e}") + + def flush_batch(self, source_path: tuple[str, ...]) -> None: + """ + Flush pending batch for a specific source path. 
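A construction sketch for the DeltaTableArrowStore above; the base path is illustrative and the keyword values simply exercise the options documented in __init__:

from orcapod.data.datagram_store import DeltaTableArrowStore

store = DeltaTableArrowStore(
    "/tmp/orcapod_store",                  # illustrative base directory
    duplicate_entry_behavior="overwrite",  # replace records that reuse an entry id
    batch_size=50,                         # buffer this many records before writing
)

# a record path such as ("results", "pod_abc") maps to /tmp/orcapod_store/results/pod_abc/
record_path = ("results", "pod_abc")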
+ + Args: + source_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + source_key = self._get_source_key(source_path) + + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = {} + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() + + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_existing_delta_table(source_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + entry_ids = combined_table.column("__entry_id").to_pylist() + unique_entry_ids = list(set(entry_ids)) + + # Delete existing records with these IDs + if unique_entry_ids: + entry_ids_str = "', '".join(unique_entry_ids) + delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # otherwise, only insert if same entry_id does not exist yet + delta_table.merge( + source=combined_table, + predicate="target.__entry_id = source.__entry_id", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[source_key] = pending_tables + raise + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + source_keys = list(self._pending_batches.keys()) + + # TODO: capture and re-raise exceptions at the end + for source_key in source_keys: + source_path = tuple(source_key.split("/")) + try: + self.flush_batch(source_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + + def __del__(self): + """Cleanup when object is destroyed.""" + self.flush() + + def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly + """ + Validate source path components. 
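Continuing the illustrative store above, the flushing behaviour just described can also be driven explicitly; pending records are otherwise written once a path's batch reaches batch_size, or when the store is garbage collected (__del__ calls flush):

store.flush_batch(("results", "pod_abc"))  # write pending records for one record path
store.flush_all_batches()                  # write pending records for every path
store.flush()                              # calls flush_all_batches, logging failures instead of raising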
+ + Args: + source_path: Tuple of path components + + Raises: + ValueError: If path is invalid + """ + if not source_path: + raise ValueError("Source path cannot be empty") + + if len(source_path) > self.max_hierarchy_depth: + raise ValueError( + f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" + ) + + # Validate path components + for i, component in enumerate(source_path): + if not component or not isinstance(component, str): + raise ValueError( + f"Source path component {i} is invalid: {repr(component)}" + ) + + # Check for filesystem-unsafe characters + unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] + if any(char in component for char in unsafe_chars): + raise ValueError( + f"Source path component contains invalid characters: {repr(component)}" + ) + + def _get_source_key(self, source_path: tuple[str, ...]) -> str: + """Generate cache key for source storage.""" + return "/".join(source_path) + + def _get_table_path(self, source_path: tuple[str, ...]) -> Path: + """Get the filesystem path for a given source path.""" + path = self.base_path + for subpath in source_path: + path = path / subpath + return path + + def _get_existing_delta_table( + self, source_path: tuple[str, ...] + ) -> DeltaTable | None: + """ + Get or create a Delta table, handling schema initialization properly. + + Args: + source_path: Tuple of path components + + Returns: + DeltaTable instance or None if table doesn't exist + """ + source_key = self._get_source_key(source_path) + table_path = self._get_table_path(source_path) + + # Check cache first + if dt := self._delta_table_cache.get(source_key): + return dt + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + self._delta_table_cache[source_key] = delta_table + logger.debug(f"Loaded existing Delta table for {source_key}") + return delta_table + except TableNotFoundError: + # Table doesn't exist + return None + except Exception as e: + logger.error(f"Error loading Delta table for {source_key}: {e}") + # Try to clear any corrupted cache and retry once + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + return None + + def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: + """Ensure the table has an __entry_id column.""" + if "__entry_id" not in arrow_data.column_names: + # Add entry_id column at the beginning + key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, "__entry_id", key_array) + return arrow_data + + def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: + """Remove the __entry_id column if it exists.""" + if "__entry_id" in arrow_data.column_names: + column_names = arrow_data.column_names + indices_to_keep = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] + arrow_data = arrow_data.select(indices_to_keep) + return arrow_data + + def _handle_entry_id_column( + self, arrow_data: pa.Table, add_entry_id_column: bool | str = False + ) -> pa.Table: + """ + Handle entry_id column based on add_entry_id_column parameter. 
+ + Args: + arrow_data: Arrow table with __entry_id column + add_entry_id_column: Control entry ID column inclusion: + - False: Remove __entry_id column + - True: Keep __entry_id column as is + - str: Rename __entry_id column to custom name + """ + if add_entry_id_column is False: + # Remove the __entry_id column + return self._remove_entry_id_column(arrow_data) + elif isinstance(add_entry_id_column, str): + # Rename __entry_id to custom name + if "__entry_id" in arrow_data.column_names: + schema = arrow_data.schema + new_names = [ + add_entry_id_column if name == "__entry_id" else name + for name in schema.names + ] + return arrow_data.rename_columns(new_names) + # If add_entry_id_column is True, keep __entry_id as is + return arrow_data + + def _create_entry_id_filter(self, entry_id: str) -> list: + """ + Create a proper filter expression for Delta Lake. + + Args: + entry_id: The entry ID to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "=", entry_id)] + + def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: + """ + Create a proper filter expression for multiple entry IDs. + + Args: + entry_ids: List of entry IDs to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [("__entry_id", "in", entry_ids)] + + def _read_table_with_filter( + self, + delta_table: DeltaTable, + filters: list | None = None, + ) -> pa.Table: + """ + Read table using to_pyarrow_dataset with original schema preservation. + + Args: + delta_table: The Delta table to read from + filters: Optional filters to apply + + Returns: + Arrow table with preserved schema + """ + # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading + dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + if filters: + # Apply filters at dataset level for better performance + import pyarrow.compute as pc + + filter_expr = None + for filt in filters: + if len(filt) == 3: + col, op, val = filt + if op == "=": + expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore + elif op == "in": + expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore + else: + logger.warning( + f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." + ) + # Fallback to table-level filtering + return dataset.to_table()(filters=filters) + + if filter_expr is None: + filter_expr = expr + else: + filter_expr = pc.and_(filter_expr, expr) # type: ignore + + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) + + return dataset.to_table() + + def record_data( + self, + record_path: tuple[str, ...], + entry_id: str, + data: pa.Table, + force_flush: bool = False, + error_on_duplicate: bool | None = None, + ) -> pa.Table: + self._validate_source_path(record_path) + source_key = self._get_source_key(record_path) + + # Check for existing entry + if error_on_duplicate is None: + error_on_duplicate = self.duplicate_entry_behavior == "error" + if error_on_duplicate: + pending_table = self._pending_batches[source_key].get(entry_id, None) + if pending_table is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in pending batch for {source_key}. " + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + existing_record = self.get_recorded_data(record_path, entry_id, flush=False) + if existing_record is not None: + raise ValueError( + f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. 
" + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + + # Add entry_id column to the data + data_with_entry_id = self._ensure_entry_id_column(data, entry_id) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table - save original schema first + write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + else: + if self.duplicate_entry_behavior == "overwrite": + try: + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + logger.debug( + f"Deleted existing record {entry_id} from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing record to delete for {entry_id}: {e}" + ) + + write_deltalake( + table_path, + data_with_entry_id, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + else: + # Add to the batch for later flushing + self._pending_batches[source_key][entry_id] = data_with_entry_id + batch_size = len(self._pending_batches[source_key]) + + # Check if we need to flush + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added record {entry_id} to {source_key}") + return data + + def get_recorded_data( + self, + record_path: tuple[str, ...], + entry_id: str, + flush: bool = False, + ) -> pa.Table | None: + """ + Get a specific record by entry_id with schema preservation. + + Args: + source_path: Tuple of path components + entry_id: Unique identifier for the record + + Returns: + Arrow table for the record or None if not found + """ + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + # check if entry_id is found in pending batches + source_key = self._get_source_key(record_path) + if entry_id in self._pending_batches[source_key]: + # Return the pending record directly + return self._pending_batches[source_key][entry_id] + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read + filter_expr = self._create_entry_id_filter(entry_id) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Remove the __entry_id column before returning + return self._remove_entry_id_column(result) + + except Exception as e: + logger.error( + f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" + ) + raise e + + def get_all_records( + self, + record_path: tuple[str, ...], + add_entry_id_column: bool | str = False, + retrieve_pending: bool = True, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve all records for a given source path as a single table with schema preservation. + + Args: + source_path: Tuple of path components + add_entry_id_column: Control entry ID column inclusion: + - False: Don't include entry ID column (default) + - True: Include entry ID column as "__entry_id" + - str: Include entry ID column with custom name + + Returns: + Arrow table containing all records with original schema, or None if no records found + """ + # TODO: this currently reads everything into memory and then return. 
Consider implementation that performs everything lazily + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + collected_tables = [] + if retrieve_pending: + # Check if there are pending records in the batch + for entry_id, arrow_table in self._pending_batches[ + self._get_source_key(record_path) + ].items(): + collected_tables.append( + self._ensure_entry_id_column(arrow_table, entry_id) + ) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is not None: + try: + # Use filter-based read + result = self._read_table_with_filter(delta_table) + + if len(result) != 0: + collected_tables.append(result) + + except Exception as e: + logger.error( + f"Error getting all records from {'/'.join(record_path)}: {e}" + ) + if collected_tables: + total_table = pa.concat_tables(collected_tables) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(total_table, add_entry_id_column) + + return None + + # def get_all_records_as_polars( + # self, source_path: tuple[str, ...], flush: bool = True + # ) -> pl.LazyFrame | None: + # """ + # Retrieve all records for a given source path as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + + # Returns: + # Polars LazyFrame containing all records, or None if no records found + # """ + # all_records = self.get_all_records(source_path, flush=flush) + # if all_records is None: + # return None + # # TODO: take care of converting semantics to Python objects + # return pl.LazyFrame(all_records.as_table()) + + def get_records_by_ids( + self, + source_path: tuple[str, ...], + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve records by entry IDs as a single table with schema preservation. 
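A record/read round-trip sketch using the methods above; the entry id and payload are illustrative, and force_flush=True is used so the subsequent reads do not depend on the pending batch:

import pyarrow as pa
from orcapod.data.datagram_store import DeltaTableArrowStore

store = DeltaTableArrowStore("/tmp/orcapod_store")   # illustrative location
record_path = ("results", "pod_abc")

store.record_data(
    record_path,
    "run_001",
    pa.Table.from_pylist([{"metric": "accuracy", "value": 0.93}]),
    force_flush=True,
)

store.get_recorded_data(record_path, "run_001")                   # table without __entry_id
store.get_all_records(record_path, add_entry_id_column="run_id")  # all rows, ids exposed as "run_id"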
+ + Args: + source_path: Tuple of path components + entry_ids: Entry IDs to retrieve + add_entry_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Arrow table containing all found records with original schema, or None if no records found + """ + + if flush: + self.flush_batch(source_path) + + self._validate_source_path(source_path) + + # Convert input to list of strings for consistency + if isinstance(entry_ids, list): + if not entry_ids: + return None + entry_ids_list = entry_ids + elif isinstance(entry_ids, pl.Series): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_list() + elif isinstance(entry_ids, pa.Array): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_pylist() + else: + raise TypeError( + f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + ) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read with filters + filter_expr = self._create_entry_ids_filter(entry_ids_list) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + if preserve_input_order: + raise NotImplementedError("Preserve input order is not yet implemented") + # Need to reorder results and add nulls for missing entries + import pandas as pd + + df = result.to_pandas() + df = df.set_index("__entry_id") + + # Create a DataFrame with the desired order, filling missing with NaN + ordered_df = df.reindex(entry_ids_list) + + # Convert back to Arrow + result = pa.Table.from_pandas(ordered_df.reset_index()) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(result, add_entry_id_column) + + except Exception as e: + logger.error( + f"Error getting records by IDs from {'/'.join(source_path)}: {e}" + ) + return None + + # def get_records_by_ids_as_polars( + # self, + # source_path: tuple[str, ...], + # entry_ids: list[str] | pl.Series | pa.Array, + # add_entry_id_column: bool | str = False, + # preserve_input_order: bool = False, + # flush: bool = False, + # ) -> pl.LazyFrame | None: + # """ + # Retrieve records by entry IDs as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + # entry_ids: Entry IDs to retrieve + # add_entry_id_column: Control entry ID column inclusion + # preserve_input_order: If True, return results in input order with nulls for missing + + # Returns: + # Polars LazyFrame containing all found records, or None if no records found + # """ + # arrow_result = self.get_records_by_ids( + # source_path, + # entry_ids, + # add_entry_id_column, + # preserve_input_order, + # flush=flush, + # ) + + # if arrow_result is None: + # return None + + # # Convert to Polars LazyFrame + # return pl.LazyFrame(arrow_result) + + # Additional utility methods + def list_sources(self) -> list[tuple[str, ...]]: + """ + List all available source paths. 
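Continuing the same sketch, batched lookup by id accepts a plain list, a polars Series, or an Arrow array, as handled above (note that preserve_input_order is not implemented in this patch):

import polars as pl

wanted = pl.Series(["run_001", "run_007"])   # illustrative entry ids
subset = store.get_records_by_ids(
    record_path,
    wanted,
    add_entry_id_column=True,   # keep the __entry_id column in the result
)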
+ + Returns: + List of source path tuples + """ + sources = [] + + def _scan_directory(current_path: Path, path_components: tuple[str, ...]): + """Recursively scan for Delta tables.""" + for item in current_path.iterdir(): + if not item.is_dir(): + continue + + new_path_components = path_components + (item.name,) + + # Check if this directory contains a Delta table + try: + DeltaTable(str(item)) + sources.append(new_path_components) + except TableNotFoundError: + # Not a Delta table, continue scanning subdirectories + if len(new_path_components) < self.max_hierarchy_depth: + _scan_directory(item, new_path_components) + + _scan_directory(self.base_path, ()) + return sources + + def delete_source(self, source_path: tuple[str, ...]) -> bool: + """ + Delete an entire source (all records for a source path). + + Args: + source_path: Tuple of path components + + Returns: + True if source was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + table_path = self._get_table_path(source_path) + source_key = self._get_source_key(source_path) + + if not table_path.exists(): + return False + + try: + # Remove from caches + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + + # Remove directory + import shutil + + shutil.rmtree(table_path) + + logger.info(f"Deleted source {source_key}") + return True + + except Exception as e: + logger.error(f"Error deleting source {source_key}: {e}") + return False + + def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: + """ + Delete a specific record. + + Args: + source_path: Tuple of path components + entry_id: ID of the record to delete + + Returns: + True if record was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return False + + try: + # Check if record exists using proper filter + filter_expr = self._create_entry_id_filter(entry_id) + existing = self._read_table_with_filter(delta_table, filters=filter_expr) + if len(existing) == 0: + return False + + # Delete the record using SQL-style predicate (this is correct for delete operations) + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + + # Update cache + source_key = self._get_source_key(source_path) + self._delta_table_cache[source_key] = delta_table + + logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") + return True + + except Exception as e: + logger.error( + f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" + ) + return False + + def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: + """ + Get metadata information about a Delta table. 
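Finally, a sketch of the maintenance helpers above, again on the illustrative store:

for source_path in store.list_sources():
    info = store.get_table_info(source_path)   # version, file count, pending records, ...
    print(source_path, info["version"] if info else "missing")

store.delete_record(("results", "pod_abc"), "run_001")  # drop a single entry
store.delete_source(("results", "pod_abc"))             # remove the whole table directory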
+ + Args: + source_path: Tuple of path components + + Returns: + Dictionary with table metadata, or None if table doesn't exist + """ + self._validate_source_path(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Get basic info + schema = delta_table.schema() + history = delta_table.history() + source_key = self._get_source_key(source_path) + + # Add pending batch info + pending_info = self.get_pending_batch_info() + pending_count = pending_info.get(source_key, 0) + + return { + "path": str(self._get_table_path(source_path)), + "source_path": source_path, + "schema": schema, + "version": delta_table.version(), + "num_files": len(delta_table.files()), + "history_length": len(history), + "latest_commit": history[0] if history else None, + "pending_records": pending_count, + } + + except Exception as e: + logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") + return None diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index f32ee89..5bab7ba 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -16,69 +16,24 @@ and type conversions between semantic stores, Python stores, and Arrow tables. """ -from orcapod.types.core import DataValue, StoreValue -from typing import TypeAlias, cast, Self -from collections.abc import Callable, Mapping, Collection -from orcapod.types import TypeSpec, default_registry +from orcapod.types.core import DataValue +from typing import TypeAlias, Self +from collections.abc import Mapping, Collection +from orcapod.types import TypeSpec +from orcapod.types.semantic_converter import SemanticConverter from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types.semantic_type_registry import SemanticTypeRegistry -from orcapod.types.core import TypeHandler +from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import schemas -from orcapod.types.typespec_utils import get_typespec_from_dict +from orcapod.types import typespec_utils as tsutils import pyarrow as pa import logging +from orcapod.utils import arrow_utils -from orcapod.hashing.defaults import get_default_arrow_hasher # Constants used for source info keys SOURCE_INFO_PREFIX = "_source_info_" -# TODO: move this to a separate module -def hstack_tables(*tables: pa.Table) -> pa.Table: - """ - Horizontally stack multiple PyArrow tables by concatenating their columns. - - All input tables must have the same number of rows and unique column names. - - Args: - *tables: Variable number of PyArrow tables to stack horizontally - - Returns: - Combined PyArrow table with all columns from input tables - - Raises: - ValueError: If no tables provided, tables have different row counts, - or duplicate column names are found - """ - if len(tables) == 0: - raise ValueError("At least one table is required for horizontal stacking.") - if len(tables) == 1: - return tables[0] - - N = len(tables[0]) - for table in tables[1:]: - if len(table) != N: - raise ValueError( - "All tables must have the same number of rows for horizontal stacking." 
- ) - - # create combined column names - all_column_names = [] - all_columns = [] - all_names = set() - for i, table in enumerate(tables): - if overlap := set(table.column_names).intersection(all_names): - raise ValueError( - f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" - ) - all_names.update(table.column_names) - all_column_names += table.column_names - all_columns += table.columns - - return pa.Table.from_arrays(all_columns, names=all_column_names) - - logger = logging.getLogger(__name__) # A conveniece packet-like type that defines a value that can be # converted to a packet. It's broader than Packet and a simple mapping @@ -90,248 +45,202 @@ def hstack_tables(*tables: pa.Table) -> pa.Table: # enforce the typespec or source_info, which are important for packet integrity. PacketLike: TypeAlias = Mapping[str, DataValue] -SemanticStore: TypeAlias = Mapping[str, StoreValue] PythonStore: TypeAlias = Mapping[str, DataValue] -def check_arrow_schema_compatibility( - incoming_schema: pa.Schema, current_schema: pa.Schema -) -> tuple[bool, list[str]]: - """ - Check if incoming schema is compatible with current schema. - - Args: - incoming_schema: Schema to validate - current_schema: Expected schema to match against - - Returns: - Tuple of (is_compatible, list_of_errors) - """ - errors = [] - - # Create lookup dictionaries for efficient access - incoming_fields = {field.name: field for field in incoming_schema} - current_fields = {field.name: field for field in current_schema} - - # Check each field in current_schema - for field_name, current_field in current_fields.items(): - if field_name not in incoming_fields: - errors.append(f"Missing field '{field_name}' in incoming schema") - continue - - incoming_field = incoming_fields[field_name] - - # Check data type compatibility - if not current_field.type.equals(incoming_field.type): - errors.append( - f"Type mismatch for field '{field_name}': " - f"expected {current_field.type}, got {incoming_field.type}" - ) - - # Check semantic_type metadata if present in current schema - current_metadata = current_field.metadata or {} - incoming_metadata = incoming_field.metadata or {} - - if b"semantic_type" in current_metadata: - expected_semantic_type = current_metadata[b"semantic_type"] - - if b"semantic_type" not in incoming_metadata: - errors.append( - f"Missing 'semantic_type' metadata for field '{field_name}'" - ) - elif incoming_metadata[b"semantic_type"] != expected_semantic_type: - errors.append( - f"Semantic type mismatch for field '{field_name}': " - f"expected {expected_semantic_type.decode()}, " - f"got {incoming_metadata[b'semantic_type'].decode()}" - ) - elif b"semantic_type" in incoming_metadata: - errors.append( - f"Unexpected 'semantic_type' metadata for field '{field_name}': " - f"{incoming_metadata[b'semantic_type'].decode()}" - ) - - return len(errors) == 0, errors - - -class SemanticConverter: - """ - Converts data between different representations (Python, semantic stores, Arrow tables). - - This class handles the conversion between Python data structures, semantic stores - (which use storage-optimized types), and Arrow tables while maintaining type - information and semantic type metadata. - """ - - @staticmethod - def prepare_handler( - semantic_schema: schemas.SemanticSchema, - semantic_type_registry: SemanticTypeRegistry, - ) -> dict[str, TypeHandler]: - """ - Prepare type handlers for semantic type conversion. 
- - Args: - semantic_schema: Schema containing semantic type information - semantic_type_registry: Registry for looking up type handlers - - Returns: - Dictionary mapping field names to their type handlers - """ - handler_lut = {} - for key, (_, semantic_type) in semantic_schema.items(): - if semantic_type is None: - continue # Skip keys without semantic type - handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( - semantic_type - ) - return handler_lut - - @classmethod - def from_typespec( - cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry - ) -> "SemanticConverter": - """ - Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). - - Args: - typespec: Type specification dictionary - semantic_type_registry: Registry for semantic type lookup - - Returns: - New SemanticConverter instance - """ - semantic_schema = schemas.from_typespec_to_semantic_schema( - typespec, semantic_type_registry - ) - python_schema = schemas.PythonSchema(typespec) - handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) - return cls(python_schema, semantic_schema, handler_lut) - - @classmethod - def from_arrow_schema( - cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry - ) -> "SemanticConverter": - """ - Create a SemanticConverter from an Arrow schema. - - Args: - arrow_schema: PyArrow schema with semantic type metadata - semantic_type_registry: Registry for semantic type lookup - - Returns: - New SemanticConverter instance - """ - semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) - python_schema = schemas.from_semantic_schema_to_python_schema( - semantic_schema, semantic_type_registry=semantic_type_registry - ) - handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) - return cls(python_schema, semantic_schema, handler_lut) - - def __init__( - self, - python_schema: schemas.PythonSchema, - semantic_schema: schemas.SemanticSchema, - handler_lut: dict[str, TypeHandler] | None = None, - ): - """ - Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. - Use class methods like `from_arrow_schema` or `from_typespec` instead. - - Args: - python_schema: Schema for Python data types - semantic_schema: Schema for semantic types - handler_lut: Optional dictionary of type handlers for conversion - """ - self.python_schema = python_schema - self.semantic_schema = semantic_schema - self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( - semantic_schema, include_source_info=False - ) - if handler_lut is None: - handler_lut = {} - self.handler_lut = handler_lut - - def from_semantic_store_to_python_store( - self, semantic_store: SemanticStore - ) -> PythonStore: - """ - Convert a semantic store to a Python store. - - Args: - semantic_store: Store (dict) with data stored in semantic (storage-optimized) types - - Returns: - Store with Python native types - """ - python_store = dict(semantic_store) - for key, handler in self.handler_lut.items(): - python_store[key] = handler.storage_to_python(semantic_store[key]) - return python_store - - def from_python_store_to_semantic_store( - self, python_store: PythonStore - ) -> SemanticStore: - """ - Convert a Python store to a semantic store. 
- - Args: - python_store: Store with Python native types - - Returns: - Store with semantic (storage-optimized) types - """ - semantic_store = dict(python_store) - for key, handler in self.handler_lut.items(): - semantic_store[key] = handler.python_to_storage(python_store[key]) - return semantic_store # type: ignore[return-value] - - def from_semantic_store_to_arrow_table( - self, semantic_store: SemanticStore - ) -> pa.Table: - """Convert a semantic store to an Arrow table.""" - return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) - - def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: - """Convert a Python store to an Arrow table.""" - semantic_store = self.from_python_store_to_semantic_store(python_store) - return self.from_semantic_store_to_arrow_table(semantic_store) - - def from_arrow_table_to_semantic_stores( - self, arrow_table: pa.Table - ) -> list[SemanticStore]: - """Convert an Arrow table to a list of semantic stores.""" - self.verify_compatible_arrow_schema(arrow_table.schema) - return arrow_table.to_pylist() # Ensure the table is materialized - - def from_arrow_table_to_python_stores( - self, arrow_table: pa.Table - ) -> list[PythonStore]: - """Convert an Arrow table to a list of Python stores.""" - return [ - self.from_semantic_store_to_python_store(semantic_store) - for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) - ] - - def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): - """ - Verify that an Arrow schema is compatible with the expected schema. - - Args: - arrow_schema: Schema to verify - - Raises: - ValueError: If schemas are incompatible - """ - compatible, errors = check_arrow_schema_compatibility( - arrow_schema, self.arrow_schema - ) - if not compatible: - raise ValueError( - "Arrow table schema is not compatible with the expected schema: " - + ", ".join(errors) - ) +# class SemanticConverter: +# """ +# Converts data between different representations (Python, semantic stores, Arrow tables). + +# SemanticConverter only tracks the semantic columns to be converted and does not +# enforce any type checking on other columns. Consequently, two completely different +# schemas could share a semantic converter if the have same named fields with identical +# semantic types. Furthermore, semantic types are defined by the association of semantic +# type name with a specific TypeHandler. + +# """ + +# @staticmethod +# def prepare_handler( +# semantic_schema: schemas.SemanticSchema, +# semantic_type_registry: SemanticTypeRegistry, +# ) -> dict[str, TypeHandler]: +# """ +# Prepare type handlers for semantic type conversion. + +# Args: +# semantic_schema: Schema containing semantic type information +# semantic_type_registry: Registry for looking up type handlers + +# Returns: +# Dictionary mapping field names to their type handlers +# """ +# handler_lut = {} +# for key, (_, semantic_type) in semantic_schema.items(): +# if semantic_type is None: +# continue # Skip keys without semantic type +# handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( +# semantic_type +# ) +# return handler_lut + +# @classmethod +# def from_typespec( +# cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry +# ) -> "SemanticConverter": +# """ +# Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). 
+ +# Args: +# typespec: Type specification dictionary +# semantic_type_registry: Registry for semantic type lookup + +# Returns: +# New SemanticConverter instance +# """ +# semantic_schema = schemas.from_typespec_to_semantic_schema( +# typespec, semantic_type_registry +# ) +# python_schema = schemas.PythonSchema(typespec) +# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) +# return cls(python_schema, semantic_schema, handler_lut) + +# @classmethod +# def from_arrow_schema( +# cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry +# ) -> "SemanticConverter": +# """ +# Create a SemanticConverter from an Arrow schema. + +# Args: +# arrow_schema: PyArrow schema with semantic type metadata +# semantic_type_registry: Registry for semantic type lookup + +# Returns: +# New SemanticConverter instance +# """ +# semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) +# python_schema = schemas.from_semantic_schema_to_python_schema( +# semantic_schema, semantic_type_registry=semantic_type_registry +# ) +# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) +# return cls(python_schema, semantic_schema, handler_lut) + +# def __init__( +# self, +# handler_lut: dict[str, tuple[str, TypeHandler]] | None = None, +# ): +# """ +# Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. +# Use class methods like `from_arrow_schema` or `from_typespec` instead. + +# Args: +# python_schema: Schema for Python data types +# semantic_schema: Schema for semantic types +# handler_lut: Optional dictionary of type handlers for conversion +# """ +# if handler_lut is None: +# handler_lut = {} +# self.handler_lut = handler_lut + +# def convert_from_semantic_to_python( +# self, semantic_value: Any, semantic_type: SemanticType +# ) -> Any: +# """ +# Convert a semantic value to a Python value. + +# Args: +# semantic_value: Value in semantic (storage-optimized) format +# semantic_type: Corresponding semantic type + +# Returns: +# Value in Python native format +# """ +# handler = self.handler_lut.get(semantic_type) +# if handler: +# return handler.to_canonical(semantic_value) +# return semantic_value + +# def from_semantic_store_to_python_store( +# self, semantic_store: SemanticStore +# ) -> dict[str, DataValue]: +# """ +# Convert a semantic store to a Python store. + +# Args: +# semantic_store: Store (dict) with data stored in semantic (storage-optimized) types + +# Returns: +# Store with Python native types +# """ +# python_store = dict(semantic_store) +# for key, handler in self.handler_lut.items(): +# python_store[key] = handler.storage_to_python(semantic_store[key]) +# # TODO: come up with a more robust handling/conversion +# return cast(dict[str, DataValue], python_store) + +# def from_python_store_to_semantic_store( +# self, python_store: PythonStore +# ) -> SemanticStore: +# """ +# Convert a Python store to a semantic store. 
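The commented-out converter above assumes type handlers that expose python_to_storage and storage_to_python. A minimal, hypothetical sketch of that contract for a handler mapping pathlib.Path values to plain strings (the PathHandler name and the Path-to-string mapping are assumptions for illustration, not orcapod's actual handler):

from pathlib import Path

class PathHandler:
    def python_to_storage(self, value: Path) -> str:
        # store paths as plain strings
        return str(value)

    def storage_to_python(self, value: str) -> Path:
        # restore the Python-native type
        return Path(value)

handler = PathHandler()
stored = handler.python_to_storage(Path("/tmp/data.bin"))
assert handler.storage_to_python(stored) == Path("/tmp/data.bin")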
+ +# Args: +# python_store: Store with Python native types + +# Returns: +# Store with semantic (storage-optimized) types +# """ +# semantic_store = dict(python_store) +# for key, handler in self.handler_lut.items(): +# semantic_store[key] = handler.python_to_storage(python_store[key]) +# return semantic_store # type: ignore[return-value] + +# def from_semantic_store_to_arrow_table( +# self, semantic_store: SemanticStore +# ) -> pa.Table: +# """Convert a semantic store to an Arrow table.""" +# return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) + +# def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: +# """Convert a Python store to an Arrow table.""" +# semantic_store = self.from_python_store_to_semantic_store(python_store) +# return self.from_semantic_store_to_arrow_table(semantic_store) + +# def from_arrow_table_to_semantic_stores( +# self, arrow_table: pa.Table +# ) -> list[SemanticStore]: +# """Convert an Arrow table to a list of semantic stores.""" +# self.verify_compatible_arrow_schema(arrow_table.schema) +# return arrow_table.to_pylist() # Ensure the table is materialized + +# def from_arrow_table_to_python_stores( +# self, arrow_table: pa.Table +# ) -> list[dict[str, DataValue]]: +# """Convert an Arrow table to a list of Python stores.""" +# return [ +# self.from_semantic_store_to_python_store(semantic_store) +# for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) +# ] + +# def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): +# """ +# Verify that an Arrow schema is compatible with the expected schema. + +# Args: +# arrow_schema: Schema to verify + +# Raises: +# ValueError: If schemas are incompatible +# """ +# compatible, errors = check_arrow_schema_compatibility( +# arrow_schema, self.arrow_schema +# ) +# if not compatible: +# raise ValueError( +# "Arrow table schema is not compatible with the expected schema: " +# + ", ".join(errors) +# ) class ImmutableDict(Mapping[str, DataValue]): @@ -401,58 +310,37 @@ def __init__( if typespec is not None: verified_typespec = dict(typespec) # TODO: enhance get_typespec_from_dict to also use info from supplied typespec dict - inferred_typespec = get_typespec_from_dict(self) + inferred_typespec = tsutils.get_typespec_from_dict(self) for key in self: if key not in verified_typespec: verified_typespec[key] = inferred_typespec[key] self._python_schema = schemas.PythonSchema(verified_typespec) # create semantic converter - if semantic_converter is not None: - if semantic_converter.python_schema != self._python_schema: - raise ValueError( - "Incompatible Python schema between packet and semantic converter: " - + str(self._python_schema) - + " vs " - + str(semantic_converter.python_schema) - ) - else: - semantic_converter = SemanticConverter.from_typespec( - self._python_schema, - semantic_type_registry or default_registry, + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + self._python_schema.to_semantic_schema( + semantic_type_registry=semantic_type_registry + ), ) self.semantic_converter = semantic_converter - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = arrow_hasher + self._arrow_hasher = arrow_hasher self._cached_table: pa.Table | None = None self._cached_content_hash: str | None = None def as_table( self, - keep_columns: Collection[str] | None = None, - drop_columns: Collection[str] | None = None, ) -> pa.Table: """Convert the packet to an Arrow table.""" - if 
keep_columns is not None and drop_columns is not None: - logger.warning( - "It is not recommended to provide both keep_columns and drop_columns. The resulting behavior may not be as expected." - ) + if self._cached_table is None: - self._cached_table = ( - self.semantic_converter.from_python_store_to_arrow_table(self.as_dict()) + self._cached_table = self.semantic_converter.from_python_to_arrow( + self, self.types() ) assert self._cached_table is not None, "Cached table should not be None" - processed_table = self._cached_table - if keep_columns is not None: - processed_table = processed_table.select(list(keep_columns)) - - if drop_columns is not None: - processed_table = processed_table.drop(list(drop_columns)) - - return processed_table + return self._cached_table def as_dict(self) -> dict[str, DataValue]: """Return dictionary representation of the datagram.""" @@ -468,7 +356,11 @@ def content_hash( Hash string of the datagram content """ if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( + if self._arrow_hasher is None: + raise ValueError( + "Arrow hasher must be provided to calculate content hash." + ) + self._cached_content_hash = self._arrow_hasher.hash_table( self.as_table(), prefix_hasher_id=True, ) @@ -480,17 +372,38 @@ def types(self) -> schemas.PythonSchema: """Return copy of the Python schema.""" return self._python_schema.copy() + @classmethod + def _from_copy( + cls, + data: Mapping[str, DataValue], + python_schema: schemas.PythonSchema, + semantic_converter: SemanticConverter, + arrow_hasher: hp.ArrowHasher | None, + ) -> Self: + """Create a new instance from copy without full initialization.""" + instance = cls.__new__(cls) + ImmutableDict.__init__(instance, data) + + # Set attributes directly + instance._python_schema = python_schema + instance.semantic_converter = semantic_converter + instance._arrow_hasher = arrow_hasher + instance._cached_table = None + instance._cached_content_hash = None + + return instance + def copy(self) -> Self: """Return a copy of the datagram.""" - return self.__class__( + return self._from_copy( self, - typespec=self.types(), - semantic_converter=self.semantic_converter, - arrow_hasher=self.arrow_hasher, + self._python_schema.copy(), + self.semantic_converter, + self._arrow_hasher, ) -class PythonDictTag(dict[str, DataValue]): +class DictTag(DictDatagram): """ A simple tag implementation using Python dictionary. @@ -498,95 +411,8 @@ class PythonDictTag(dict[str, DataValue]): to different representations like Arrow tables. """ - def as_dict(self) -> dict[str, DataValue]: - """Return dictionary representation.""" - return dict(self) - - def as_table(self) -> pa.Table: - """Convert to Arrow table representation.""" - return pa.Table.from_pylist([self]) - - def types(self) -> schemas.PythonSchema: - """ - Return Python schema (basic implementation). - - Note: This is a simplified implementation that assumes all values are strings. - """ - # TODO: provide correct implementation - return schemas.PythonSchema({k: str for k in self.keys()}) - - -class ArrowTag: - """ - A tag implementation using Arrow table backend. - - Represents a single-row Arrow table that can be converted to Python - dictionary representation while caching computed values for efficiency. - - Initialize with an Arrow table. 
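The _from_copy helper introduced above uses the copy-without-reinitialization idiom: allocate with __new__ and assign already-validated attributes directly, so copies skip re-running schema inference. A generic sketch of the idiom (illustrative only, not taken from the patch):

class Example:
    def __init__(self, data: dict):
        # imagine expensive validation / schema inference here
        self.data = dict(data)
        self._cache = None

    @classmethod
    def _from_copy(cls, data: dict) -> "Example":
        instance = cls.__new__(cls)  # bypass __init__ entirely
        instance.data = data
        instance._cache = None
        return instance

    def copy(self) -> "Example":
        return self._from_copy(dict(self.data))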
- - Args: - table: Single-row Arrow table representing the tag - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__(self, table: pa.Table) -> None: - self.table = table - if len(table) != 1: - raise ValueError( - "ArrowTag should only contain a single row, " - "as it represents a single tag." - ) - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_python_dict: dict[str, DataValue] | None = None - - def keys(self) -> tuple[str, ...]: - """Return column names as a tuple.""" - return tuple(self.table.column_names) - - def types(self) -> schemas.PythonSchema: - """ - Return Python schema derived from Arrow schema. - - Returns: - TypeSpec information returned as PythonSchema. - """ - if self._cached_python_schema is None: - self._cached_python_schema = schemas.from_arrow_schema_to_semantic_schema( - self.table.schema - ).storage_schema - return self._cached_python_schema.copy() - - def as_dict(self) -> dict[str, DataValue]: - """ - Convert to Python dictionary representation. - - Returns: - Dictionary with tag data - """ - if self._cached_python_dict is None: - self._cached_python_dict = cast( - dict[str, DataValue], self.table.to_pylist()[0] - ) - return self._cached_python_dict - - def as_table(self) -> pa.Table: - """Return the underlying Arrow table.""" - return self.table - - def clear_cache(self) -> None: - """Clear cached Python representations.""" - self._cached_python_schema = None - self._cached_python_dict = None - - def __repr__(self) -> str: - """Return string representation.""" - return f"{self.as_dict()}" - - -class PythonDictPacket2(DictDatagram): +class DictPacket(DictDatagram): """ Enhanced packet implementation with source information support. @@ -637,12 +463,10 @@ def __init__( def as_table( self, - keep_columns: Collection[str] | None = None, - drop_columns: Collection[str] | None = None, include_source: bool = False, ) -> pa.Table: """Convert the packet to an Arrow table.""" - table = super().as_table(keep_columns=keep_columns, drop_columns=drop_columns) + table = super().as_table() if include_source: if self._cached_source_info_table is None: source_info_data = { @@ -661,7 +485,7 @@ def as_table( source_info_table = self._cached_source_info_table.select( [f"{SOURCE_INFO_PREFIX}{k}" for k in table.column_names] ) - table = hstack_tables(table, source_info_table) + table = arrow_utils.hstack_tables(table, source_info_table) return table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: @@ -680,219 +504,50 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value return dict_copy - def content_hash(self) -> str: - """ - Calculate content hash excluding source information. - - Returns: - Hash string of the packet content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( - self.as_table(include_source=False), prefix_hasher_id=True - ) - return self._cached_content_hash - - # use keys() implementation from dict - - def types(self) -> schemas.PythonSchema: - """ - Returns: - Packet type information as PythonSchema (dict mapping field names to types). - """ - return self._python_schema.copy() - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. 
- - Returns: - Dictionary mapping field names to their source info + def as_datagram(self, include_source: bool = False) -> DictDatagram: """ - return {key: self._source_info.get(key, None) for key in self.keys()} - - def copy(self) -> "PythonDictPacket2": - """Return a shallow copy of the packet.""" - new_packet = PythonDictPacket2(self, self.source_info()) - new_packet._cached_table = self._cached_table - new_packet._cached_content_hash = self._cached_content_hash - new_packet._python_schema = self._python_schema.copy() - new_packet.semantic_converter = self.semantic_converter - new_packet.arrow_hasher = self.arrow_hasher - return new_packet - - -class PythonDictPacket(dict[str, DataValue]): - """ - Dictionary-based Packet with source tracking and hashing. - - A dictionary-based packet that maintains source information, supports - type specifications, and provides content hashing with optional callbacks. - Includes comprehensive conversion capabilities to Arrow tables. - - Initialize packet with comprehensive configuration options. - - Args: - data: Primary packet data - source_info: Optional source information mapping - typespec: Optional type specification - finger_print: Optional fingerprint for tracking - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - - """ - - @classmethod - def create_from( - cls, - object: dp.Packet, - finger_print: str | None = None, - semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, - ) -> "PythonDictPacket": - """ - Create a PythonDictPacket from another packet object. 
- - Args: - object: Source packet to copy from - finger_print: Optional fingerprint identifier - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - - Returns: - New PythonDictPacket instance - """ - if isinstance(object, PythonDictPacket): - return object.copy() - - new_packet = PythonDictPacket( - object.as_dict(include_source=False), - object.source_info(), - dict(object.types()), - finger_print=finger_print, - semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, - post_hash_callback=post_hash_callback, - ) - return new_packet - - def __init__( - self, - data: dict[str, DataValue], - source_info: dict[str, str | None] | None = None, - typespec: TypeSpec | None = None, - finger_print: str | None = None, - semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, - ) -> None: - # normalize the data content and remove any source info keys - data = {k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX)} - contained_source_info = { - k.removeprefix(SOURCE_INFO_PREFIX): v - for k, v in data.items() - if k.startswith(SOURCE_INFO_PREFIX) - } - super().__init__(data) - - self._source_info = {**contained_source_info, **(source_info or {})} - - verified_typespec = {} - if typespec is not None: - verified_typespec = dict(typespec) - inferred_typespec = get_typespec_from_dict(self) - for key in self: - if key not in verified_typespec: - verified_typespec[key] = inferred_typespec[key] - self._typespec = verified_typespec - - self._python_schema = schemas.PythonSchema(self._typespec) - - if semantic_converter is not None: - if semantic_converter.python_schema != self._python_schema.with_source_info: - raise ValueError( - "Incompatible Python schema between packet and semantic converter: " - + str(self._python_schema.with_source_info) - + " vs " - + str(semantic_converter.python_schema) - ) - else: - semantic_converter = SemanticConverter.from_typespec( - self._python_schema.with_source_info, - semantic_type_registry or default_registry, - ) - self.semantic_converter = semantic_converter - - self._finger_print = finger_print - self._post_hash_callback = post_hash_callback - self._cached_table: pa.Table | None = None - self._cached_content_hash: str | None = None - - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = arrow_hasher - - def as_table(self, include_source: bool = False) -> pa.Table: - """Convert the packet to an Arrow table.""" - if self._cached_table is None: - self._cached_table = ( - self.semantic_converter.from_python_store_to_arrow_table( - self.as_dict(include_source=True) - ) - ) - assert self._cached_table is not None, "Cached table should not be None" - if include_source: - return self._cached_table - else: - # drop source info columns if not needed - return self._cached_table.select(list(self.keys())) - - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: - """ - Return dictionary representation. + Convert the packet to a DictDatagram. 
Args: include_source: Whether to include source info fields Returns: - Dictionary representation of the packet + DictDatagram representation of the packet """ - dict_copy = self.copy() + data = self.as_dict(include_source=include_source) + typespec = self.types() + # append source info to typespec if requested if include_source: - for key, value in self.source_info().items(): - dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value - return dict_copy - - def content_hash(self) -> str: - """ - Calculate and return content hash. + for key in self.keys(): + typespec[f"{SOURCE_INFO_PREFIX}{key}"] = str + return DictDatagram( + data, + typespec=typespec, + semantic_converter=self.semantic_converter, + arrow_hasher=self._arrow_hasher, + ) - Computes hash of packet data content (thus excluding source info) and - optionally triggers post-hash callback if configured. + # def content_hash2(self) -> str: + # """ + # Calculate content hash excluding source information. - Returns: - Hash string of the packet content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( - self.as_table(include_source=False), prefix_hasher_id=True - ) - if self._post_hash_callback is not None and self._finger_print is not None: - self._post_hash_callback(self._finger_print, self._cached_content_hash) - return self._cached_content_hash + # Returns: + # Hash string of the packet content + # """ + # # TODO: check if this is identical to DictDatagram.content_hash + # if self._cached_content_hash is None: + # self._cached_content_hash = self._arrow_hasher.hash_table( + # self.as_table(include_source=False), prefix_hasher_id=True + # ) + # return self._cached_content_hash # use keys() implementation from dict def types(self) -> schemas.PythonSchema: - """Return packet data type information as PythonSchema (dict mapping field names to types).""" + """ + Returns: + Packet type information as PythonSchema (dict mapping field names to types). + """ return self._python_schema.copy() def source_info(self) -> dict[str, str | None]: @@ -904,22 +559,17 @@ def source_info(self) -> dict[str, str | None]: """ return {key: self._source_info.get(key, None) for key in self.keys()} - def copy(self) -> "PythonDictPacket": + def copy(self) -> Self: """Return a shallow copy of the packet.""" - new_packet = PythonDictPacket(self, self.source_info()) - new_packet._finger_print = self._finger_print - new_packet._cached_table = self._cached_table - new_packet._cached_content_hash = self._cached_content_hash - new_packet._python_schema = self._python_schema.copy() - new_packet.semantic_converter = self.semantic_converter - new_packet.arrow_hasher = self.arrow_hasher - new_packet._post_hash_callback = self._post_hash_callback - return new_packet + instance = super().copy() + instance._source_info = self._source_info.copy() + instance._cached_source_info_table = self._cached_source_info_table + return instance -def process_table_with_source_info( +def prepare_data_and_source_tables( table: pa.Table, source_info: dict[str, str | None] | None = None -) -> tuple[tuple[str, ...], pa.Table]: +) -> tuple[pa.Table, pa.Table]: """ Process a table to ensure proper source_info columns. @@ -929,14 +579,14 @@ def process_table_with_source_info( it will take precedence over existing source_info columns in the table. 
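The split described here relies on the module's prefix convention: a data column may carry a companion provenance column whose name is the data column's name with SOURCE_INFO_PREFIX prepended. A small illustrative sketch using a placeholder prefix value and made-up column names:

import pyarrow as pa

SOURCE_INFO_PREFIX = "_source_info_"  # placeholder; the real constant is defined elsewhere in the module

table = pa.table({
    "path": ["/tmp/a.bin"],
    f"{SOURCE_INFO_PREFIX}path": ["upstream_pod:abc"],
})

data_names = [n for n in table.column_names if not n.startswith(SOURCE_INFO_PREFIX)]
source_names = [n for n in table.column_names if n.startswith(SOURCE_INFO_PREFIX)]

data_table = table.select(data_names)      # columns: ['path']
source_table = table.select(source_names)  # columns: ['_source_info_path']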
Returns: - Processed table with source_info columns + tuple of table without any source info and another table only containing source info columns (with prefix) """ if source_info is None: source_info = {} # Step 1: Separate source_info columns from regular columns - regular_columns = [] - regular_names = [] + data_columns = [] + data_column_names = [] existing_source_info = {} for i, name in enumerate(table.column_names): @@ -945,21 +595,19 @@ def process_table_with_source_info( base_name = name.removeprefix(SOURCE_INFO_PREFIX) existing_source_info[base_name] = table.column(i) else: - regular_columns.append(table.column(i)) - regular_names.append(name) + data_columns.append(table.column(i)) + data_column_names.append(name) # Step 2: Create source_info columns for each regular column - final_columns = [] - final_names = [] + source_info_columns = [] + source_info_column_names = [] # Add all regular columns first - final_columns.extend(regular_columns) - final_names.extend(regular_names) # Create source_info columns for each regular column num_rows = table.num_rows - for col_name in regular_names: + for col_name in data_column_names: source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" # if col_name is in source_info, use that value @@ -981,15 +629,192 @@ def process_table_with_source_info( # Use null values source_values = pa.array([None] * num_rows, type=pa.large_string()) - final_columns.append(source_values) - final_names.append(source_info_col_name) + source_info_columns.append(source_values) + source_info_column_names.append(source_info_col_name) # Step 3: Create the final table - result: pa.Table = pa.Table.from_arrays(final_columns, names=final_names) - return tuple(regular_names), result + data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) + source_info_table: pa.Table = pa.Table.from_arrays( + source_info_columns, names=source_info_column_names + ) + return data_table, source_info_table + + +class ArrowDatagram: + """ + An immutable datagram implementation using a PyArrow Table backend. + TODO: handle RecordBatch in addition to table + + This basic datagram provides functionality for type handling, + semantic conversion, and dict-based content representation while maintaining + immutability of the underlying data. + + Initialize ArrowDatagram with a PyArrow table. -class ArrowPacket: + Args: + data: Source data mapping + typespec: Optional type specification for fields + semantic_converter: Optional converter for semantic types + semantic_type_registry: Registry for semantic type lookup + arrow_hasher: Optional hasher for Arrow table content + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + # normalize the table to ensure it contains proper source columns + if len(table) != 1: + raise ValueError( + "Table must contain exactly one row to be a valid datagram." + ) + + # TODO: add check for compatible types, especially of str being pa.large_string + self._table = table + + # create semantic converter + # TODO: consider some validation of passed semantic_converter + if semantic_converter is None: + if semantic_type_registry is None: + raise ValueError( + "Semantic type registry must be provided if semantic converter is not specified." 
+ ) + semantic_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + self._table.schema, + semantic_type_registry, + ) + ) + self._semantic_converter = semantic_converter + self._arrow_hasher = arrow_hasher + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_dict: dict[str, DataValue] | None = None + self._cached_content_hash: str | None = None + + def as_table( + self, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + return self._table + + def as_dict(self) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" + if self._cached_python_dict is None: + self._cached_python_dict = self._semantic_converter.from_arrow_to_python( + self._table + )[0] + assert self._cached_python_dict is not None, "Cached dict should not be None" + return dict(self._cached_python_dict) + + def content_hash( + self, + ) -> str: + """ + Calculate and return content hash of the datagram. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + if self._arrow_hasher is None: + raise ValueError( + "Arrow hasher must be provided to calculate content hash." + ) + self._cached_content_hash = self._arrow_hasher.hash_table( + self.as_table(), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + def keys(self) -> tuple[str, ...]: + return tuple(self._table.column_names) + + def types(self) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + if self._cached_python_schema is None: + self._cached_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema(self._table.schema) + ) + return self._cached_python_schema.copy() + + @classmethod + def _from_copy( + cls, + table: pa.Table, + python_schema: schemas.PythonSchema, + semantic_converter: SemanticConverter, + hash_keys: tuple[str, ...], + arrow_hasher: hp.ArrowHasher, + ) -> Self: + """Create a new instance from copy without full initialization.""" + instance = cls.__new__(cls) + instance._table = table + instance._semantic_converter = semantic_converter + instance._arrow_hasher = arrow_hasher + + # Set attributes directly + instance._cached_content_hash = None + + return instance + + def copy(self) -> Self: + """Return a copy of the datagram.""" + new_datagram = self.__class__( + self._table, + semantic_converter=self._semantic_converter, + arrow_hasher=self._arrow_hasher, + ) + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_python_dict = self._cached_python_dict + return new_datagram + + def __repr__(self) -> str: + """Return string representation.""" + return f"{self.as_dict()}" + + +class ArrowTag(ArrowDatagram): + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. + + Args: + table: Single-row Arrow table representing the tag + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowTag should only contain a single row, " + "as it represents a single tag." 
+ ) + super().__init__( + table=table, + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + arrow_hasher=arrow_hasher, + ) + + +class ArrowPacket(ArrowDatagram): """ Arrow table-based packet implementation with comprehensive features. @@ -1014,56 +839,16 @@ class ArrowPacket: ValueError: If table doesn't contain exactly one row """ - @classmethod - def create_from( - cls, - object: dp.Packet, - semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - finger_print: str | None = None, - arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, - ) -> "ArrowPacket": - """ - Create an ArrowPacket from another packet object. - - Args: - object: Source packet to copy from - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - finger_print: Optional fingerprint identifier - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - - Returns: - New ArrowPacket instance - """ - if isinstance(object, ArrowPacket): - return object.copy() - - new_packet = ArrowPacket( - object.as_table(include_source=True), - semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - finger_print=finger_print, - arrow_hasher=arrow_hasher, - post_hash_callback=post_hash_callback, - skip_source_info_extraction=True, - ) - return new_packet - def __init__( self, - table: pa.Table, + data: pa.Table, source_info: dict[str, str | None] | None = None, semantic_converter: SemanticConverter | None = None, semantic_type_registry: SemanticTypeRegistry | None = None, - finger_print: str | None = None, arrow_hasher: hp.ArrowHasher | None = None, - post_hash_callback: Callable[[str, str], None] | None = None, skip_source_info_extraction: bool = False, ) -> None: - if len(table) != 1: + if len(data) != 1: raise ValueError( "ArrowPacket should only contain a single row, " "as it represents a single packet." @@ -1073,49 +858,43 @@ def __init__( if not skip_source_info_extraction: # normalize the table to ensure it has the expected source_info columns - self._keys, self._arrow_table = process_table_with_source_info( - table, source_info - ) - else: - self._keys: tuple[str, ...] = tuple( - [c for c in table.column_names if not c.startswith(SOURCE_INFO_PREFIX)] - ) - for k in self._keys: - if f"{SOURCE_INFO_PREFIX}{k}" not in table.column_names: - raise ValueError( - f"Source info column '{SOURCE_INFO_PREFIX}{k}' is missing in the table." - ) - self._arrow_table = table - - self._finger_print = finger_print - self._post_hash_callback = post_hash_callback - - if semantic_converter is not None: - check_arrow_schema_compatibility( - semantic_converter.arrow_schema, self._arrow_table.schema + data_table, self._source_info_table = prepare_data_and_source_tables( + data, source_info ) else: - semantic_converter = SemanticConverter.from_arrow_schema( - self._arrow_table.schema, semantic_type_registry or default_registry + data_columns: tuple[str, ...] 
= tuple( + [c for c in data.column_names if not c.startswith(SOURCE_INFO_PREFIX)] ) - self.semantic_converter = semantic_converter + source_columns = [f"{SOURCE_INFO_PREFIX}{c}" for c in data_columns] + # Add conversion to large_string type + data_table = data.select(data_columns) + self._source_info_table = data.select(source_columns) - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self.arrow_hasher = arrow_hasher + super().__init__( + data_table, + semantic_converter=semantic_converter, + semantic_type_registry=semantic_type_registry, + arrow_hasher=arrow_hasher, + ) - self._cached_python_packet: PythonStore | None = None - self._cached_content_hash: str | None = None - self._cached_python_schema: schemas.PythonSchema | None = None self._cached_source_info: dict[str, str | None] | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None - def as_table(self, include_source: bool = False) -> pa.Table: - """Return the Arrow table representation of the packet.""" - base_table = self._arrow_table - if not include_source: - # Select only the keys that are not source info - base_table = base_table.select(self._keys) - return base_table + def as_table( + self, + include_source: bool = False, + ) -> pa.Table: + table = super().as_table() + if include_source: + # add source_info only for existing data columns + table = arrow_utils.hstack_tables( + table, + self._source_info_table.select( + [f"{SOURCE_INFO_PREFIX}{c}" for c in table.column_names] + ), + ) + return table def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: """ @@ -1127,42 +906,20 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: Returns: Dictionary representation of the packet """ - if self._cached_python_packet is None: - self._cached_python_packet = ( - self.semantic_converter.from_arrow_table_to_python_stores( - self._arrow_table - )[0] - ) + return_dict = super().as_dict() if include_source: - return dict(self._cached_python_packet) - - return {k: self._cached_python_packet[k] for k in self._keys} - - def content_hash(self) -> str: - """ - Calculate and return content hash. - - Computes hash of the Arrow table content and optionally - triggers post-hash callback if configured. 
- - Returns: - Hash string of the packet content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self.arrow_hasher.hash_table( - self._arrow_table, prefix_hasher_id=True + return_dict.update( + {f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items()} ) - if self._post_hash_callback is not None and self._finger_print is not None: - self._post_hash_callback(self._finger_print, self._cached_content_hash) - return self._cached_content_hash - - def types(self) -> schemas.PythonSchema: - """Return packet data type information as PythonSchema (dict mapping field names to types).""" - return self.semantic_converter.python_schema.copy() - - def keys(self) -> tuple[str, ...]: - """Return the keys of the packet.""" - return tuple(self._keys) + return return_dict + + def as_datagram(self, include_source: bool = False) -> ArrowDatagram: + table = self.as_table(include_source=include_source) + return ArrowDatagram( + table, + semantic_converter=self._semantic_converter, + arrow_hasher=self._arrow_hasher, + ) def source_info(self) -> dict[str, str | None]: """ @@ -1173,37 +930,26 @@ def source_info(self) -> dict[str, str | None]: """ if self._cached_source_info is None: self._cached_source_info = { - k: self._arrow_table[f"{SOURCE_INFO_PREFIX}{k}"][0].as_py() - for k in self._keys + k.removeprefix(SOURCE_INFO_PREFIX): v + for k, v in self._source_info_table.to_pylist()[0].items() } return self._cached_source_info.copy() - def copy(self) -> "ArrowPacket": - """Return a shallow copy of the packet.""" - new_packet = ArrowPacket( - self._arrow_table, - semantic_converter=self.semantic_converter, - finger_print=self._finger_print, - arrow_hasher=self.arrow_hasher, - post_hash_callback=self._post_hash_callback, + def copy(self) -> Self: + # TODO: restructure copy to allow for better inheritance and expansion + new_packet = self.__class__( + self.as_table(), + self.source_info(), + semantic_converter=self._semantic_converter, + arrow_hasher=self._arrow_hasher, skip_source_info_extraction=True, ) + new_packet._cached_source_info = self._cached_source_info + new_packet._cached_python_dict = self._cached_python_dict + new_packet._cached_python_schema = self._cached_python_schema new_packet._cached_content_hash = self._cached_content_hash - new_packet._cached_source_info = ( - self._cached_source_info.copy() - if self._cached_source_info is not None - else None - ) - new_packet._cached_python_packet = ( - dict(self._cached_python_packet) - if self._cached_python_packet is not None - else None - ) - return new_packet - def __repr__(self) -> str: - """Return string representation.""" - return f"{self.as_dict(include_source=False)}" + return new_packet # a batch is a tuple of a tag and a list of packets diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 95667d2..a276e23 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -261,6 +261,16 @@ def op_forward(self, *streams: dp.Stream) -> ImmutableTableStream: all_tag_typespecs = [] all_packet_typespecs = [] + joined_stream = streams[0] + for stream in streams[1:]: + joined_tag_typespec, joined_packet_typespec = joined_stream.types() + stream_tag_typespec, stream_packet_typespec = stream.types() + joined_table = joined_stream.as_table().join( + stream.as_table(), + keys=intersection_typespecs(joined_tag_typespec, stream_tag_typespec), + join_type="inner", + ) + for stream in streams: tag_typespec, packet_typespec = stream.types() all_tag_typespecs.append(tag_typespec) diff 
--git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index a6b0d0f..e6c2c96 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -4,14 +4,19 @@ from collections.abc import Callable, Collection, Iterable, Sequence from typing import Any, Literal, cast -from orcapod.data.datagrams import PythonDictPacket +from orcapod.data.datagrams import DictPacket, DictTag from orcapod.data.kernels import TrackedKernelBase from orcapod.data.operators import Join from orcapod.data.streams import PodStream +from orcapod.hashing import get_default_arrow_hasher from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp -from orcapod.types import SemanticTypeRegistry, TypeSpec, default_registry +from orcapod.protocols.store_protocols import ArrowDataStore +from orcapod.types import TypeSpec, default_registry +from orcapod.types.schemas import PythonSchema +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import typespec_utils as tsutils logger = logging.getLogger(__name__) @@ -209,6 +214,7 @@ def __init__( output_typespec: TypeSpec | Sequence[type] | None = None, label: str | None = None, semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: hp.ArrowHasher | None = None, function_info_extractor: hp.FunctionInfoExtractor | None = None, **kwargs, ) -> None: @@ -228,36 +234,37 @@ def __init__( self.function_name = function_name super().__init__(label=label or self.function_name, **kwargs) - if semantic_type_registry is None: - # TODO: reconsider the use of default registry here - semantic_type_registry = default_registry - - self.semantic_type_registry = semantic_type_registry - self.function_info_extractor = function_info_extractor - # extract input and output types from the function signature - self._input_packet_types, self._output_packet_types = ( - tsutils.extract_function_typespecs( - self.function, - self.output_keys, - input_typespec=input_typespec, - output_typespec=output_typespec, - ) + input_packet_types, output_packet_types = tsutils.extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_typespec, + output_typespec=output_typespec, ) + self._input_packet_schema = PythonSchema(input_packet_types) + self._output_packet_schema = PythonSchema(output_packet_types) - def input_packet_types(self) -> TypeSpec: + semantic_type_registry = semantic_type_registry or default_registry + self._output_semantic_converter = SemanticConverter.from_semantic_schema( + self._output_packet_schema.to_semantic_schema(semantic_type_registry) + ) + + self.arrow_hasher = arrow_hasher or get_default_arrow_hasher() + self.function_info_extractor = function_info_extractor + + def input_packet_types(self) -> PythonSchema: """ Return the input typespec for the function pod. This is used to validate the input streams. """ - return self._input_packet_types + return self._input_packet_schema.copy() - def output_packet_types(self) -> TypeSpec: + def output_packet_types(self) -> PythonSchema: """ Return the output typespec for the function pod. This is used to validate the output streams. 
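As a rough illustration of the signature-based typespec extraction that FunctionPod performs, here is a standalone sketch (not the actual tsutils.extract_function_typespecs implementation): parameter annotations become the input typespec, and the return annotation is assigned to each declared output key.

from typing import get_type_hints

def example_fn(path: str, threshold: float) -> int:
    return 0

def sketch_extract_typespecs(fn, output_keys):
    hints = get_type_hints(fn)
    return_type = hints.pop("return", None)
    input_types = dict(hints)                                  # {'path': str, 'threshold': float}
    output_types = {key: return_type for key in output_keys}   # {'count': int}
    return input_types, output_types

print(sketch_extract_typespecs(example_fn, ["count"]))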
""" - return self._output_packet_types + return self._output_packet_schema.copy() def __repr__(self) -> str: return f"FunctionPod:{self.function_name}" @@ -271,9 +278,7 @@ def __str__(self) -> str: ) return f"FunctionPod:{func_sig}" - def call( - self, tag: dp.Tag, packet: dp.Packet - ) -> tuple[dp.Tag, PythonDictPacket | None]: + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | None]: if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -302,8 +307,11 @@ def call( ) # TODO: add source info based on this function call - output_packet = PythonDictPacket( - {k: v for k, v in zip(self.output_keys, output_values)} + output_packet = DictPacket( + {k: v for k, v in zip(self.output_keys, output_values)}, + typespec=self.output_packet_types(), + semantic_converter=self._output_semantic_converter, + arrow_hasher=self.arrow_hasher, ) return tag, output_packet @@ -325,7 +333,6 @@ def identity_structure(self, *streams: dp.Stream) -> Any: "input_packet_types": self.input_packet_types(), "output_packet_types": self.output_packet_types(), } - function_info["output_keys"] = tuple(self.output_keys) id_struct = ( self.__class__.__name__, @@ -334,7 +341,8 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure if len(streams) > 0: - processed_streams = self.pre_processing_step(*streams) + # TODO: extract the common handling of input streams + processed_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*processed_streams) id_struct += (processed_streams[0],) @@ -393,7 +401,19 @@ class CachedPod(WrappedPod): This is useful for pods that are expensive to compute and can benefit from caching. """ - def __init__(self, pod: dp.Pod, cache_key: str, **kwargs): + def __init__( + self, + pod: dp.Pod, + result_store: ArrowDataStore, + lineage_store: ArrowDataStore | None, + record_path_prefix: tuple[str, ...] = (), + **kwargs, + ): super().__init__(pod, **kwargs) - self.cache_key = cache_key - self.cache: dict[str, dp.Packet] = {} + self.record_path_prefix = record_path_prefix + self.result_store = result_store + self.lineage_store = lineage_store + + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[DictTag, DictPacket | None]: ... diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 223011b..bce6585 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,6 +1,8 @@ +from orcapod.hashing.types import ArrowHasher from orcapod.protocols import data_protocols as dp -from orcapod.types import SemanticTypeRegistry, default_registry, schemas, TypeSpec -from orcapod.data.datagrams import ArrowPacket, ArrowTag, SemanticConverter +from orcapod.types import schemas, TypeSpec +from orcapod.types.semantic_types import SemanticTypeRegistry +from orcapod.data.datagrams import ArrowPacket, ArrowTag, DictTag, SemanticConverter from orcapod.data.base import LabeledContentIdentifiableBase import pyarrow as pa from collections.abc import Iterator, Collection @@ -9,6 +11,7 @@ from typing import Any, Literal import logging import warnings +from itertools import repeat # TODO: consider using this instead of making copy of dicts # from types import MappingProxyType @@ -118,7 +121,7 @@ def iter_packets( ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... @abstractmethod - def as_table(self) -> pa.Table: ... 
+ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: ... def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ @@ -252,12 +255,12 @@ def last_modified(self) -> datetime | None: return None return self._cached_stream.last_modified - def as_table(self) -> pa.Table: + def as_table(self, include_content_hash: bool | str = False) -> pa.Table: self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) - return self._cached_stream.as_table() + return self._cached_stream.as_table(include_content_hash=include_content_hash) def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() @@ -289,6 +292,7 @@ def __init__( source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), semantic_type_registry: SemanticTypeRegistry | None = None, + arrow_hasher: ArrowHasher | None = None, **kwargs, ) -> None: super().__init__(source=source, upstreams=upstreams, **kwargs) @@ -299,21 +303,31 @@ def __init__( self._packet_columns = tuple( c for c in table.column_names if c not in tag_columns ) + if len(self._packet_columns) == 0: + raise ValueError( + "No packet columns found in the table. At least one packet column is required." + ) - semantic_type_registry = semantic_type_registry or default_registry tag_schema = pa.schema( f for f in self._table.schema if f.name in self._tag_columns ) packet_schema = pa.schema( f for f in self._table.schema if f.name in self._packet_columns ) - self._tag_converter = SemanticConverter.from_arrow_schema( - tag_schema, semantic_type_registry + + self._tag_schema = tag_schema + self._packet_schema = packet_schema + self._tag_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema(tag_schema, semantic_type_registry) ) - self._packet_converter = SemanticConverter.from_arrow_schema( - packet_schema, semantic_type_registry + self._packet_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + packet_schema, semantic_type_registry + ) ) + self._arrow_hasher = arrow_hasher + self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None self._set_modified_time() # set modified time to now @@ -331,8 +345,12 @@ def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: """ # TODO: consider using MappingProxyType to avoid copying the dicts return ( - self._tag_converter.python_schema.copy(), - self._packet_converter.python_schema.copy(), + schemas.PythonSchema.from_arrow_schema( + self._tag_schema, converters=self._tag_converter.as_dict() + ), + schemas.PythonSchema.from_arrow_schema( + self._packet_schema, converters=self._packet_converter.as_dict() + ), ) def as_table(self, include_content_hash: bool | str = False) -> pa.Table: @@ -346,10 +364,10 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: "_content_hash" if include_content_hash is True else include_content_hash ) content_hashes = [packet.content_hash() for _, packet in self.iter_packets()] - self._table = self._table.append_column( + table_with_hash = self._table.append_column( hash_column_name, pa.array(content_hashes, type=pa.large_string()) ) - return self._table + return table_with_hash def clear_cache(self) -> None: """ @@ -366,14 +384,35 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: # TODO: make it work with table batch stream if self._cached_elements is None: self._cached_elements = [] - tags = self._table.select(self._tag_columns) + tag_present = len(self._tag_columns) > 0 + if 
tag_present: + tags = self._table.select(self._tag_columns) + tag_batches = tags.to_batches() + else: + tag_batches = repeat(DictTag({})) + + # TODO: come back and clean up this logic + packets = self._table.select(self._packet_columns) - for tag_batch, packet_batch in zip(tags.to_batches(), packets.to_batches()): - for i in range(len(tag_batch)): + for tag_batch, packet_batch in zip(tag_batches, packets.to_batches()): + for i in range(len(packet_batch)): + if tag_present: + tag = ArrowTag( + tag_batch.slice(i, 1), # type: ignore + semantic_converter=self._tag_converter, + arrow_hasher=self._arrow_hasher, + ) + + else: + tag = tag_batch self._cached_elements.append( ( - ArrowTag(tag_batch.slice(i, 1)), - ArrowPacket(packet_batch.slice(i, 1)), + tag, + ArrowPacket( + packet_batch.slice(i, 1), + semantic_converter=self._packet_converter, + arrow_hasher=self._arrow_hasher, + ), ) ) yield from self._cached_elements diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 2b66b52..a89ab4e 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -56,6 +56,7 @@ def __init__( handle_missing: str = "error", semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, serialization_method: str = "logical", + # TODO: consider passing options for serialization method ): """ Initialize SemanticArrowHasher. @@ -111,7 +112,11 @@ def _get_semantic_type(self, field: pa.Field) -> str | None: return None def _create_hash_column( - self, original_column: pa.Array, hash_bytes: bytes, original_field: pa.Field + self, + original_column: pa.Array, + hash_algorithm: str, + hash_bytes: bytes, + original_field: pa.Field, ) -> tuple[pa.Array, pa.Field]: """Create a new column containing the hash bytes.""" # Create array of hash bytes (one hash value repeated for each row) @@ -124,11 +129,11 @@ def _create_hash_column( "semantic_type", "unknown" ) new_metadata["semantic_type"] = "hash" - new_metadata["hash_algorithm"] = "sha256" + new_metadata["hash_algorithm"] = hash_algorithm_id new_field = pa.field( original_field.name, - pa.string(), # Hash stored as string + pa.large_string(), # Hash stored as large string nullable=original_field.nullable, metadata=new_metadata, ) @@ -152,7 +157,7 @@ def _process_table_columns(self, table: pa.Table) -> pa.Table: # Replace column with hash hash_column, hash_field = self._create_hash_column( - column, hash_bytes, field + column, hasher.hasher_id, hash_bytes, field ) new_columns.append(hash_column) new_fields.append(hash_field) diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py index 4508f95..bcd489f 100644 --- a/src/orcapod/hashing/semantic_type_hashers.py +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -30,31 +30,31 @@ def __init__( self.cacher = string_cacher self.cache_key_prefix = cache_key_prefix - def _hash_file_content(self, file_path: str) -> str: - """Hash the content of a single file and return hex string.""" + def _hash_file_content(self, file_path: str) -> bytes: + """Hash the content of a single file""" import os # if cacher exists, check if the hash is cached if self.cacher: cache_key = f"{self.cache_key_prefix}:{file_path}" - cached_hash = self.cacher.get_cached(cache_key) - if cached_hash is not None: - return cached_hash + cached_hash_hex = self.cacher.get_cached(cache_key) + if cached_hash_hex is not None: + return bytes.fromhex(cached_hash_hex) try: if not os.path.exists(file_path): if self.handle_missing == "error": 
raise FileNotFoundError(f"File not found: {file_path}") elif self.handle_missing == "skip": - return hashlib.sha256(b"").hexdigest() + return hashlib.sha256(b"").digest() elif self.handle_missing == "null_hash": - return hashlib.sha256(b"").hexdigest() + return hashlib.sha256(b"").digest() - hashed_value = self.file_hasher.hash_file(file_path).hex() + hashed_value = self.file_hasher.hash_file(file_path) if self.cacher: - # Cache the computed hash + # Cache the computed hash hex self.cacher.set_cached( - f"{self.cache_key_prefix}:{file_path}", hashed_value + f"{self.cache_key_prefix}:{file_path}", hashed_value.hex() ) return hashed_value @@ -63,7 +63,7 @@ def _hash_file_content(self, file_path: str) -> str: raise IOError(f"Cannot read file {file_path}: {e}") else: # skip or null_hash error_msg = f"" - return hashlib.sha256(error_msg.encode("utf-8")).hexdigest() + return hashlib.sha256(error_msg.encode("utf-8")).digest() def hash_column(self, column: pa.Array) -> pa.Array: """ diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index c5c1919..0cd0722 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -1,7 +1,7 @@ # A collection of versioned hashers that provide a "default" implementation of hashers. from .arrow_hashers import SemanticArrowHasher +from orcapod.utils.object_spec import parse_objectspec from orcapod.protocols.hashing_protocols import ObjectHasher -import importlib from typing import Any CURRENT_VERSION = "v0.1" @@ -9,7 +9,7 @@ versioned_semantic_arrow_hashers = { "v0.1": { "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", - "config": { + "_config": { "hasher_id": "arrow_v0.1", "hash_algorithm": "sha256", "chunk_size": 8192, @@ -17,10 +17,10 @@ "semantic_type_hashers": { "path": { "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", - "config": { + "_config": { "file_hasher": { "_class": "orcapod.hashing.file_hashers.BasicFileHasher", - "config": { + "_config": { "algorithm": "sha256", }, } @@ -34,36 +34,17 @@ versioned_object_hashers = { "v0.1": { "_class": "orcapod.hashing.object_hashers.BasicObjectHasher", - "config": { + "_config": { "hasher_id": "object_v0.1", "function_info_extractor": { "_class": "orcapod.hashing.function_info_extractors.FunctionSignatureExtractor", - "config": {"include_module": True, "include_defaults": True}, + "_config": {"include_module": True, "include_defaults": True}, }, }, } } -def parse_objectspec(obj_spec: dict) -> Any: - if "_class" in obj_spec: - # if _class is specified, treat the dict as an object specification - module_name, class_name = obj_spec["_class"].rsplit(".", 1) - module = importlib.import_module(module_name) - cls = getattr(module, class_name) - configs = parse_objectspec(obj_spec.get("config", {})) - return cls(**configs) - else: - # otherwise, parse through the dictionary recursively - parsed_object = obj_spec - for k, v in obj_spec.items(): - if isinstance(v, dict): - parsed_object[k] = parse_objectspec(v) - else: - parsed_object[k] = v - return parsed_object - - def get_versioned_semantic_arrow_hasher( version: str | None = None, ) -> SemanticArrowHasher: diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 405714f..9470c1e 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -4,9 +4,10 @@ from orcapod.core.streams import EmptyStream from orcapod.stores import ArrowDataStore from orcapod.types import Tag, Packet, PacketLike, TypeSpec, default_registry 
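The versioned hasher tables above describe objects with the _class/_config convention, and the parser now lives in orcapod.utils.object_spec. A simplified standalone sketch of how such a spec resolves to an instance (the hashlib.sha256 spec below is a stand-in example, not one of orcapod's own specs):

import importlib

def parse_objectspec_sketch(spec):
    if isinstance(spec, dict) and "_class" in spec:
        # instantiate the named class, recursively resolving its _config
        module_name, class_name = spec["_class"].rsplit(".", 1)
        cls = getattr(importlib.import_module(module_name), class_name)
        config = {k: parse_objectspec_sketch(v) for k, v in spec.get("_config", {}).items()}
        return cls(**config)
    if isinstance(spec, dict):
        return {k: parse_objectspec_sketch(v) for k, v in spec.items()}
    return spec

hasher = parse_objectspec_sketch({"_class": "hashlib.sha256", "_config": {}})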
+from orcapod.types.legacy import packets
 from orcapod.types.typespec_utils import union_typespecs
-from orcapod.types.semantic_type_registry import SemanticTypeRegistry
-from orcapod.types import packets, schemas
+from orcapod.types.legacy.semantic_type_registry import SemanticTypeRegistry
+from orcapod.types import schemas
 from orcapod.hashing import ObjectHasher, ArrowHasher
 from orcapod.hashing.defaults import get_default_object_hasher, get_default_arrow_hasher
 from typing import Any, Literal
diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py
index 7c14e2e..1767509 100644
--- a/src/orcapod/protocols/hashing_protocols.py
+++ b/src/orcapod/protocols/hashing_protocols.py
@@ -127,6 +127,11 @@ def extract_function_info(
 class SemanticTypeHasher(Protocol):
     """Abstract base class for semantic type-specific hashers."""
 
+    @property
+    def hasher_id(self) -> str:
+        """Unique identifier for this semantic type hasher."""
+        ...
+
     def hash_column(
         self,
         column: pa.Array,
diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py
index e69de29..d5ca902 100644
--- a/src/orcapod/protocols/store_protocols.py
+++ b/src/orcapod/protocols/store_protocols.py
@@ -0,0 +1,34 @@
+from typing import Collection, Protocol, TYPE_CHECKING
+from orcapod.protocols import data_protocols as dp
+import pyarrow as pa
+
+if TYPE_CHECKING:
+    import polars as pl
+
+
+class ArrowDataStore(Protocol):
+    def record_data(
+        self,
+        record_path: tuple[str, ...],
+        record_id: str,
+        data: pa.Table,
+        ignore_duplicates: bool = False,
+    ) -> str | None: ...
+
+    def get_recorded_data(
+        self,
+        record_path: tuple[str, ...],
+        record_id: str,
+    ) -> pa.Table | None: ...
+
+    def get_all_records(self, record_path: tuple[str, ...]) -> pa.Table | None:
+        """Retrieve all records for a given path as a single Arrow table."""
+        ...
+
+    def get_records_by_ids(
+        self,
+        record_path: tuple[str, ...],
+        record_ids: Collection[str],
+        add_entry_id_column: bool | str = False,
+        preserve_input_order: bool = False,
+    ) -> pa.Table: ...
diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/stores/__init__.py
index 1114c11..573a316 100644
--- a/src/orcapod/stores/__init__.py
+++ b/src/orcapod/stores/__init__.py
@@ -1,7 +1,7 @@
-from .types import DataStore, ArrowDataStore
-from .arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore
-from .dict_data_stores import DirDataStore, NoOpDataStore
-from .safe_dir_data_store import SafeDirDataStore
+from .legacy.types import DataStore, ArrowDataStore
+from .legacy.legacy_arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore
+from .legacy.dict_data_stores import DirDataStore, NoOpDataStore
+from .legacy.safe_dir_data_store import SafeDirDataStore
 
 __all__ = [
     "DataStore",
diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py
new file mode 100644
index 0000000..f8f0451
--- /dev/null
+++ b/src/orcapod/stores/delta_lake_stores.py
@@ -0,0 +1,861 @@
+import pyarrow as pa
+import pyarrow.dataset as ds
+import polars as pl
+from pathlib import Path
+from typing import Any
+import logging
+from deltalake import DeltaTable, write_deltalake
+from deltalake.exceptions import TableNotFoundError
+from collections import defaultdict
+
+
+# Module-level logger
+logger = logging.getLogger(__name__)
+
+
+class BasicDeltaTableArrowStore:
+    """
+    A basic Delta Table-based Arrow data store with flexible hierarchical path support.
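+
+    Minimal usage sketch (illustrative only; the base path, record path, entry id,
+    and data below are hypothetical; see the notes below on batching and duplicate
+    handling):
+
+        >>> import pyarrow as pa
+        >>> store = BasicDeltaTableArrowStore("/tmp/orca_store", batch_size=10)
+        >>> table = pa.table({"x": [1, 2, 3]})
+        >>> _ = store.record_data(("project", "run_1"), "entry_0", table)
+        >>> store.flush_batch(("project", "run_1"))
+        >>> store.get_recorded_data(("project", "run_1"), "entry_0").num_rows
+        3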
+
+    This store does NOT implement lazy loading or streaming capabilities, therefore
+    being "basic" in that sense. It is designed for simple use cases where data is written
+    in batches and read back as complete tables. It is worth noting that the Delta table
+    structure created by this store IS compatible with more advanced Delta Table-based
+    data stores (to be implemented) that will support lazy loading and streaming.
+
+    Uses tuple-based source paths for robust parameter handling:
+    - ("source_name", "source_id") -> source_name/source_id/
+    - ("org", "project", "dataset") -> org/project/dataset/
+    - ("year", "month", "day", "experiment") -> year/month/day/experiment/
+    """
+
+    def __init__(
+        self,
+        base_path: str | Path,
+        duplicate_entry_behavior: str = "error",
+        create_base_path: bool = True,
+        max_hierarchy_depth: int = 10,
+        batch_size: int = 100,
+    ):
+        """
+        Initialize the BasicDeltaTableArrowStore.
+
+        Args:
+            base_path: Base directory path where Delta tables will be stored
+            duplicate_entry_behavior: How to handle duplicate entry_ids:
+                - 'error': Raise ValueError when entry_id already exists
+                - 'overwrite': Replace existing entry with new data
+            create_base_path: Whether to create the base path if it doesn't exist
+            max_hierarchy_depth: Maximum allowed depth for source paths (safety limit)
+            batch_size: Number of records to batch before writing to Delta table
+        """
+        # Validate duplicate behavior
+        if duplicate_entry_behavior not in ["error", "overwrite"]:
+            raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'")
+
+        self.duplicate_entry_behavior = duplicate_entry_behavior
+        self.base_path = Path(base_path)
+        self.max_hierarchy_depth = max_hierarchy_depth
+        self.batch_size = batch_size
+
+        if create_base_path:
+            self.base_path.mkdir(parents=True, exist_ok=True)
+        elif not self.base_path.exists():
+            raise ValueError(
+                f"Base path {self.base_path} does not exist and create_base_path=False"
+            )
+
+        # Cache for Delta tables to avoid repeated initialization
+        self._delta_table_cache: dict[str, DeltaTable] = {}
+
+        # Batch management
+        self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict)
+
+        logger.info(
+            f"Initialized BasicDeltaTableArrowStore at {self.base_path} "
+            f"with duplicate_entry_behavior='{duplicate_entry_behavior}', "
+            f"batch_size={batch_size}"
+        )
+
+    def flush(self) -> None:
+        """
+        Flush all pending batches immediately.
+
+        This method is called to ensure all pending data is written to the Delta tables.
+        """
+        try:
+            self.flush_all_batches()
+        except Exception as e:
+            logger.error(f"Error during flush: {e}")
+
+    def flush_batch(self, source_path: tuple[str, ...]) -> None:
+        """
+        Flush pending batch for a specific source path.
+ + Args: + source_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + source_key = self._get_source_key(source_path) + + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = {} + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() + + table_path = self._get_table_path(source_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_existing_delta_table(source_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + entry_ids = combined_table.column("__entry_id").to_pylist() + unique_entry_ids = list(set(entry_ids)) + + # Delete existing records with these IDs + if unique_entry_ids: + entry_ids_str = "', '".join(unique_entry_ids) + delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # otherwise, only insert if same entry_id does not exist yet + delta_table.merge( + source=combined_table, + predicate="target.__entry_id = source.__entry_id", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[source_key] = pending_tables + raise + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + source_keys = list(self._pending_batches.keys()) + + # TODO: capture and re-raise exceptions at the end + for source_key in source_keys: + source_path = tuple(source_key.split("/")) + try: + self.flush_batch(source_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + + def __del__(self): + """Cleanup when object is destroyed.""" + self.flush() + + def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly + """ + Validate source path components. 
+ + Args: + source_path: Tuple of path components + + Raises: + ValueError: If path is invalid + """ + if not source_path: + raise ValueError("Source path cannot be empty") + + if len(source_path) > self.max_hierarchy_depth: + raise ValueError( + f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" + ) + + # Validate path components + for i, component in enumerate(source_path): + if not component or not isinstance(component, str): + raise ValueError( + f"Source path component {i} is invalid: {repr(component)}" + ) + + # Check for filesystem-unsafe characters + unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] + if any(char in component for char in unsafe_chars): + raise ValueError( + f"Source path component contains invalid characters: {repr(component)}" + ) + + def _get_source_key(self, source_path: tuple[str, ...]) -> str: + """Generate cache key for source storage.""" + return "/".join(source_path) + + def _get_table_path(self, source_path: tuple[str, ...]) -> Path: + """Get the filesystem path for a given source path.""" + path = self.base_path + for subpath in source_path: + path = path / subpath + return path + + def _get_existing_delta_table( + self, source_path: tuple[str, ...] + ) -> DeltaTable | None: + """ + Get or create a Delta table, handling schema initialization properly. + + Args: + source_path: Tuple of path components + + Returns: + DeltaTable instance or None if table doesn't exist + """ + source_key = self._get_source_key(source_path) + table_path = self._get_table_path(source_path) + + # Check cache first + if dt := self._delta_table_cache.get(source_key): + return dt + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + self._delta_table_cache[source_key] = delta_table + logger.debug(f"Loaded existing Delta table for {source_key}") + return delta_table + except TableNotFoundError: + # Table doesn't exist + return None + except Exception as e: + logger.error(f"Error loading Delta table for {source_key}: {e}") + # Try to clear any corrupted cache and retry once + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + return None + + def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: + """Ensure the table has an __entry_id column.""" + if "__entry_id" not in arrow_data.column_names: + # Add entry_id column at the beginning + key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, "__entry_id", key_array) + return arrow_data + + def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: + """Remove the __entry_id column if it exists.""" + if "__entry_id" in arrow_data.column_names: + column_names = arrow_data.column_names + indices_to_keep = [ + i for i, name in enumerate(column_names) if name != "__entry_id" + ] + arrow_data = arrow_data.select(indices_to_keep) + return arrow_data + + def _handle_entry_id_column( + self, arrow_data: pa.Table, add_entry_id_column: bool | str = False + ) -> pa.Table: + """ + Handle entry_id column based on add_entry_id_column parameter. 
+
+        Args:
+            arrow_data: Arrow table with __entry_id column
+            add_entry_id_column: Control entry ID column inclusion:
+                - False: Remove __entry_id column
+                - True: Keep __entry_id column as is
+                - str: Rename __entry_id column to custom name
+        """
+        if add_entry_id_column is False:
+            # Remove the __entry_id column
+            return self._remove_entry_id_column(arrow_data)
+        elif isinstance(add_entry_id_column, str):
+            # Rename __entry_id to custom name
+            if "__entry_id" in arrow_data.column_names:
+                schema = arrow_data.schema
+                new_names = [
+                    add_entry_id_column if name == "__entry_id" else name
+                    for name in schema.names
+                ]
+                return arrow_data.rename_columns(new_names)
+        # If add_entry_id_column is True, keep __entry_id as is
+        return arrow_data
+
+    def _create_entry_id_filter(self, entry_id: str) -> list:
+        """
+        Create a proper filter expression for Delta Lake.
+
+        Args:
+            entry_id: The entry ID to filter by
+
+        Returns:
+            List containing the filter expression for Delta Lake
+        """
+        return [("__entry_id", "=", entry_id)]
+
+    def _create_entry_ids_filter(self, entry_ids: list[str]) -> list:
+        """
+        Create a proper filter expression for multiple entry IDs.
+
+        Args:
+            entry_ids: List of entry IDs to filter by
+
+        Returns:
+            List containing the filter expression for Delta Lake
+        """
+        return [("__entry_id", "in", entry_ids)]
+
+    def _read_table_with_filter(
+        self,
+        delta_table: DeltaTable,
+        filters: list | None = None,
+    ) -> pa.Table:
+        """
+        Read table using to_pyarrow_dataset with original schema preservation.
+
+        Args:
+            delta_table: The Delta table to read from
+            filters: Optional filters to apply
+
+        Returns:
+            Arrow table with preserved schema
+        """
+        # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading
+        dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True)
+        if filters:
+            # Apply filters at dataset level for better performance
+            import pyarrow.compute as pc
+
+            filter_expr = None
+            for filt in filters:
+                if len(filt) == 3:
+                    col, op, val = filt
+                    if op == "=":
+                        expr = pc.equal(pc.field(col), pa.scalar(val))  # type: ignore
+                    elif op == "in":
+                        expr = pc.is_in(pc.field(col), pa.array(val))  # type: ignore
+                    else:
+                        logger.warning(
+                            f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient."
+                        )
+                        # Fallback: let deltalake apply the DNF filters at the table level
+                        return delta_table.to_pyarrow_table(filters=filters)
+
+                    if filter_expr is None:
+                        filter_expr = expr
+                    else:
+                        filter_expr = pc.and_(filter_expr, expr)  # type: ignore
+
+            if filter_expr is not None:
+                return dataset.to_table(filter=filter_expr)
+
+        return dataset.to_table()
+
+    def record_data(
+        self,
+        record_path: tuple[str, ...],
+        entry_id: str,
+        data: pa.Table,
+        force_flush: bool = False,
+        error_on_duplicate: bool | None = None,
+    ) -> pa.Table:
+        self._validate_source_path(record_path)
+        source_key = self._get_source_key(record_path)
+
+        # Check for existing entry
+        if error_on_duplicate is None:
+            error_on_duplicate = self.duplicate_entry_behavior == "error"
+        if error_on_duplicate:
+            pending_table = self._pending_batches[source_key].get(entry_id, None)
+            if pending_table is not None:
+                raise ValueError(
+                    f"Entry '{entry_id}' already exists in pending batch for {source_key}. "
+                    f"Use duplicate_entry_behavior='overwrite' to allow updates."
+                )
+            existing_record = self.get_recorded_data(record_path, entry_id, flush=False)
+            if existing_record is not None:
+                raise ValueError(
+                    f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. 
" + f"Use duplicate_entry_behavior='overwrite' to allow updates." + ) + + # Add entry_id column to the data + data_with_entry_id = self._ensure_entry_id_column(data, entry_id) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table - save original schema first + write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + else: + if self.duplicate_entry_behavior == "overwrite": + try: + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + logger.debug( + f"Deleted existing record {entry_id} from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing record to delete for {entry_id}: {e}" + ) + + write_deltalake( + table_path, + data_with_entry_id, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + else: + # Add to the batch for later flushing + self._pending_batches[source_key][entry_id] = data_with_entry_id + batch_size = len(self._pending_batches[source_key]) + + # Check if we need to flush + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added record {entry_id} to {source_key}") + return data + + def get_recorded_data( + self, + record_path: tuple[str, ...], + entry_id: str, + flush: bool = False, + ) -> pa.Table | None: + """ + Get a specific record by entry_id with schema preservation. + + Args: + source_path: Tuple of path components + entry_id: Unique identifier for the record + + Returns: + Arrow table for the record or None if not found + """ + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + # check if entry_id is found in pending batches + source_key = self._get_source_key(record_path) + if entry_id in self._pending_batches[source_key]: + # Return the pending record directly + return self._pending_batches[source_key][entry_id] + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read + filter_expr = self._create_entry_id_filter(entry_id) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Remove the __entry_id column before returning + return self._remove_entry_id_column(result) + + except Exception as e: + logger.error( + f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" + ) + raise e + + def get_all_records( + self, + record_path: tuple[str, ...], + add_entry_id_column: bool | str = False, + retrieve_pending: bool = True, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve all records for a given source path as a single table with schema preservation. + + Args: + source_path: Tuple of path components + add_entry_id_column: Control entry ID column inclusion: + - False: Don't include entry ID column (default) + - True: Include entry ID column as "__entry_id" + - str: Include entry ID column with custom name + + Returns: + Arrow table containing all records with original schema, or None if no records found + """ + # TODO: this currently reads everything into memory and then return. 
Consider implementation that performs everything lazily + + if flush: + self.flush_batch(record_path) + self._validate_source_path(record_path) + + collected_tables = [] + if retrieve_pending: + # Check if there are pending records in the batch + for entry_id, arrow_table in self._pending_batches[ + self._get_source_key(record_path) + ].items(): + collected_tables.append( + self._ensure_entry_id_column(arrow_table, entry_id) + ) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is not None: + try: + # Use filter-based read + result = self._read_table_with_filter(delta_table) + + if len(result) != 0: + collected_tables.append(result) + + except Exception as e: + logger.error( + f"Error getting all records from {'/'.join(record_path)}: {e}" + ) + if collected_tables: + total_table = pa.concat_tables(collected_tables) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(total_table, add_entry_id_column) + + return None + + # def get_all_records_as_polars( + # self, source_path: tuple[str, ...], flush: bool = True + # ) -> pl.LazyFrame | None: + # """ + # Retrieve all records for a given source path as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + + # Returns: + # Polars LazyFrame containing all records, or None if no records found + # """ + # all_records = self.get_all_records(source_path, flush=flush) + # if all_records is None: + # return None + # # TODO: take care of converting semantics to Python objects + # return pl.LazyFrame(all_records.as_table()) + + def get_records_by_ids( + self, + source_path: tuple[str, ...], + entry_ids: list[str] | pl.Series | pa.Array, + add_entry_id_column: bool | str = False, + preserve_input_order: bool = False, + flush: bool = False, + ) -> pa.Table | None: + """ + Retrieve records by entry IDs as a single table with schema preservation. 
+ + Args: + source_path: Tuple of path components + entry_ids: Entry IDs to retrieve + add_entry_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Arrow table containing all found records with original schema, or None if no records found + """ + + if flush: + self.flush_batch(source_path) + + self._validate_source_path(source_path) + + # Convert input to list of strings for consistency + if isinstance(entry_ids, list): + if not entry_ids: + return None + entry_ids_list = entry_ids + elif isinstance(entry_ids, pl.Series): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_list() + elif isinstance(entry_ids, pa.Array): + if len(entry_ids) == 0: + return None + entry_ids_list = entry_ids.to_pylist() + else: + raise TypeError( + f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + ) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read with filters + filter_expr = self._create_entry_ids_filter(entry_ids_list) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + if preserve_input_order: + raise NotImplementedError("Preserve input order is not yet implemented") + # Need to reorder results and add nulls for missing entries + import pandas as pd + + df = result.to_pandas() + df = df.set_index("__entry_id") + + # Create a DataFrame with the desired order, filling missing with NaN + ordered_df = df.reindex(entry_ids_list) + + # Convert back to Arrow + result = pa.Table.from_pandas(ordered_df.reset_index()) + + # Handle entry_id column based on parameter + return self._handle_entry_id_column(result, add_entry_id_column) + + except Exception as e: + logger.error( + f"Error getting records by IDs from {'/'.join(source_path)}: {e}" + ) + return None + + # def get_records_by_ids_as_polars( + # self, + # source_path: tuple[str, ...], + # entry_ids: list[str] | pl.Series | pa.Array, + # add_entry_id_column: bool | str = False, + # preserve_input_order: bool = False, + # flush: bool = False, + # ) -> pl.LazyFrame | None: + # """ + # Retrieve records by entry IDs as a single Polars LazyFrame. + + # Args: + # source_path: Tuple of path components + # entry_ids: Entry IDs to retrieve + # add_entry_id_column: Control entry ID column inclusion + # preserve_input_order: If True, return results in input order with nulls for missing + + # Returns: + # Polars LazyFrame containing all found records, or None if no records found + # """ + # arrow_result = self.get_records_by_ids( + # source_path, + # entry_ids, + # add_entry_id_column, + # preserve_input_order, + # flush=flush, + # ) + + # if arrow_result is None: + # return None + + # # Convert to Polars LazyFrame + # return pl.LazyFrame(arrow_result) + + # Additional utility methods + + def get_pending_batch_info(self) -> dict[str, int]: + """ + Get information about pending batches. + + Returns: + Dictionary mapping source keys to number of pending records + """ + return { + source_key: len(tables) + for source_key, tables in self._pending_batches.items() + if tables + } + + def list_sources(self) -> list[tuple[str, ...]]: + """ + List all available source paths. 
+ + Returns: + List of source path tuples + """ + sources = [] + + def _scan_directory(current_path: Path, path_components: tuple[str, ...]): + """Recursively scan for Delta tables.""" + for item in current_path.iterdir(): + if not item.is_dir(): + continue + + new_path_components = path_components + (item.name,) + + # Check if this directory contains a Delta table + try: + DeltaTable(str(item)) + sources.append(new_path_components) + except TableNotFoundError: + # Not a Delta table, continue scanning subdirectories + if len(new_path_components) < self.max_hierarchy_depth: + _scan_directory(item, new_path_components) + + _scan_directory(self.base_path, ()) + return sources + + def delete_source(self, source_path: tuple[str, ...]) -> bool: + """ + Delete an entire source (all records for a source path). + + Args: + source_path: Tuple of path components + + Returns: + True if source was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + table_path = self._get_table_path(source_path) + source_key = self._get_source_key(source_path) + + if not table_path.exists(): + return False + + try: + # Remove from caches + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + + # Remove directory + import shutil + + shutil.rmtree(table_path) + + logger.info(f"Deleted source {source_key}") + return True + + except Exception as e: + logger.error(f"Error deleting source {source_key}: {e}") + return False + + def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: + """ + Delete a specific record. + + Args: + source_path: Tuple of path components + entry_id: ID of the record to delete + + Returns: + True if record was deleted, False if it didn't exist + """ + self._validate_source_path(source_path) + + # Flush any pending batches first + self.flush_batch(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return False + + try: + # Check if record exists using proper filter + filter_expr = self._create_entry_id_filter(entry_id) + existing = self._read_table_with_filter(delta_table, filters=filter_expr) + if len(existing) == 0: + return False + + # Delete the record using SQL-style predicate (this is correct for delete operations) + delta_table.delete( + f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + ) + + # Update cache + source_key = self._get_source_key(source_path) + self._delta_table_cache[source_key] = delta_table + + logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") + return True + + except Exception as e: + logger.error( + f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" + ) + return False + + def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: + """ + Get metadata information about a Delta table. 
+ + Args: + source_path: Tuple of path components + + Returns: + Dictionary with table metadata, or None if table doesn't exist + """ + self._validate_source_path(source_path) + + delta_table = self._get_existing_delta_table(source_path) + if delta_table is None: + return None + + try: + # Get basic info + schema = delta_table.schema() + history = delta_table.history() + source_key = self._get_source_key(source_path) + + # Add pending batch info + pending_info = self.get_pending_batch_info() + pending_count = pending_info.get(source_key, 0) + + return { + "path": str(self._get_table_path(source_path)), + "source_path": source_path, + "schema": schema, + "version": delta_table.version(), + "num_files": len(delta_table.files()), + "history_length": len(history), + "latest_commit": history[0] if history else None, + "pending_records": pending_count, + } + + except Exception as e: + logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") + return None diff --git a/src/orcapod/stores/delta_table_arrow_data_store.py b/src/orcapod/stores/legacy/delta_table_arrow_data_store.py similarity index 100% rename from src/orcapod/stores/delta_table_arrow_data_store.py rename to src/orcapod/stores/legacy/delta_table_arrow_data_store.py diff --git a/src/orcapod/stores/dict_data_stores.py b/src/orcapod/stores/legacy/dict_data_stores.py similarity index 99% rename from src/orcapod/stores/dict_data_stores.py rename to src/orcapod/stores/legacy/dict_data_stores.py index c4eff60..718fef0 100644 --- a/src/orcapod/stores/dict_data_stores.py +++ b/src/orcapod/stores/legacy/dict_data_stores.py @@ -7,7 +7,7 @@ from orcapod.hashing.legacy_core import hash_packet from orcapod.hashing.types import LegacyPacketHasher from orcapod.hashing.defaults import get_default_composite_file_hasher -from orcapod.stores.types import DataStore +from orcapod.stores.legacy.types import DataStore from orcapod.types import Packet, PacketLike logger = logging.getLogger(__name__) diff --git a/src/orcapod/stores/dict_transfer_data_store.py b/src/orcapod/stores/legacy/dict_transfer_data_store.py similarity index 97% rename from src/orcapod/stores/dict_transfer_data_store.py rename to src/orcapod/stores/legacy/dict_transfer_data_store.py index 7e8762f..fe7a52a 100644 --- a/src/orcapod/stores/dict_transfer_data_store.py +++ b/src/orcapod/stores/legacy/dict_transfer_data_store.py @@ -1,6 +1,6 @@ # Implements transfer data store that lets you transfer memoized packets between data stores. 
-from orcapod.stores.types import DataStore +from orcapod.stores.legacy.types import DataStore from orcapod.types import PacketLike diff --git a/src/orcapod/stores/arrow_data_stores.py b/src/orcapod/stores/legacy/legacy_arrow_data_stores.py similarity index 100% rename from src/orcapod/stores/arrow_data_stores.py rename to src/orcapod/stores/legacy/legacy_arrow_data_stores.py diff --git a/src/orcapod/stores/safe_dir_data_store.py b/src/orcapod/stores/legacy/safe_dir_data_store.py similarity index 99% rename from src/orcapod/stores/safe_dir_data_store.py rename to src/orcapod/stores/legacy/safe_dir_data_store.py index e02e9cc..72f8ef0 100644 --- a/src/orcapod/stores/safe_dir_data_store.py +++ b/src/orcapod/stores/legacy/safe_dir_data_store.py @@ -10,7 +10,7 @@ from pathlib import Path from typing import Optional, Union -from .file_utils import atomic_copy, atomic_write +from ..file_utils import atomic_copy, atomic_write logger = logging.getLogger(__name__) diff --git a/src/orcapod/stores/types.py b/src/orcapod/stores/legacy/types.py similarity index 100% rename from src/orcapod/stores/types.py rename to src/orcapod/stores/legacy/types.py diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index 179a253..ca29627 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,30 +1,16 @@ -from .core import Tag, PathLike, PathSet, TypeSpec, DataValue, StoreValue -from .semantic_type_registry import SemanticTypeRegistry -from .semantic_type_handlers import PathHandler, UUIDHandler, DateTimeHandler -from . import semantic_type_handlers +from .core import PathLike, PathSet, TypeSpec, DataValue from . import typespec_utils +from .defaults import DEFAULT_REGISTRY as default_registry Packet = dict[str, str] PacketLike = Packet -# Create default registry and register handlers -default_registry = SemanticTypeRegistry() - -# Register with semantic names - registry extracts supported types automatically -default_registry.register("path", PathHandler()) -default_registry.register("uuid", UUIDHandler()) -default_registry.register( - "datetime", DateTimeHandler() -) # Registers for datetime, date, time __all__ = [ - "default_registry", - "Tag", "TypeSpec", "PathLike", "PathSet", - "semantic_type_handlers", "typespec_utils", "DataValue", - "StoreValue", + "default_registry", ] diff --git a/src/orcapod/types/arrow_utils.py b/src/orcapod/types/arrow_utils.py new file mode 100644 index 0000000..c446901 --- /dev/null +++ b/src/orcapod/types/arrow_utils.py @@ -0,0 +1,10 @@ +import pyarrow as pa + + +def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: + """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, + no field names should collide.""" + merged_fields = [] + for schema in schemas: + merged_fields.extend(schema) + return pa.schema(merged_fields) diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py index 98b49b8..b43d21a 100644 --- a/src/orcapod/types/core.py +++ b/src/orcapod/types/core.py @@ -4,6 +4,7 @@ import logging +logger = logging.getLogger(__name__) DataType: TypeAlias = type @@ -11,18 +12,6 @@ str, DataType ] # Mapping of parameter names to their types -logger = logging.getLogger(__name__) - - -# class TypeSpec(dict[str, DataType]): -# def __init__(self, *args, **kwargs): -# """ -# TypeSpec is a mapping of parameter names to their types. -# It can be used to define the expected types of parameters in a function or a pod. 
-# """ -# super().__init__(*args, **kwargs) - - # Convenience alias for anything pathlike PathLike = str | os.PathLike @@ -30,10 +19,6 @@ # Note that TagValue can be nested, allowing for an arbitrary depth of nested lists TagValue: TypeAlias = int | str | None | Collection["TagValue"] -# the top level tag is a mapping from string keys to values that can be a string or -# an arbitrary depth of nested list of strings or None -Tag: TypeAlias = Mapping[str, TagValue] - # a pathset is a path or an arbitrary depth of nested list of paths PathSet: TypeAlias = PathLike | Collection[PathLike | None] @@ -46,55 +31,4 @@ # Either the original PathSet or one of our supported simple data types DataValue: TypeAlias = ExtendedSupportedPythonData | Collection["DataValue"] | None -StoreValue: TypeAlias = SupportedNativePythonData | Collection["StoreValue"] | None - PacketLike: TypeAlias = Mapping[str, DataValue] - - -# class PodFunction(Protocol): -# """ -# A function suitable to be used in a FunctionPod. -# It takes one or more named arguments, each corresponding to either: -# - A path to a file or directory (PathSet) - for backward compatibility -# - A simple data value (str, int, float, bool, bytes, Path) -# and returns either None, a single value, or a list of values -# """ - -# def __call__(self, **kwargs: DataValue) -> None | DataValue | list[DataValue]: ... - - -class TypeHandler(Protocol): - """Protocol for handling conversion between Python type and Arrow - data types used for storage. - - The handler itself IS the definition of a semantic type. The semantic type - name/identifier is provided by the registerer when registering the handler. - - TypeHandlers should clearly communicate what Python types they can handle, - and focus purely on conversion logic. - """ - - def python_type(self) -> type: - """Return the Python type(s) this handler can process. - - Returns: - Python type the handler supports - - Examples: - - PathHandler: return Path - - NumericHandler: return (int, float) - - CollectionHandler: return (list, tuple, set) - """ - ... - - def storage_type(self) -> type: - """Return the Arrow DataType instance for schema definition.""" - ... - - def python_to_storage(self, value: Any) -> Any: - """Convert Python value to Arrow-compatible storage representation.""" - ... - - def storage_to_python(self, value: Any) -> Any: - """Convert storage representation back to Python object.""" - ... diff --git a/src/orcapod/types/defaults.py b/src/orcapod/types/defaults.py new file mode 100644 index 0000000..f7b5773 --- /dev/null +++ b/src/orcapod/types/defaults.py @@ -0,0 +1,51 @@ +# A collection of versioned hashers that provide a "default" implementation of hashers. 
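+#
+# The versioned dictionaries below are plain "objectspec" data: "_class" names the
+# class to instantiate and "_config" holds its constructor arguments, intended to
+# be resolved recursively by parse_objectspec (imported below). Illustrative
+# sketch, assuming the v0.1 spec defined further down in this module:
+#
+#     registry = parse_objectspec(semantic_registry_objectspec["v0.1"])
+#     # comparable to constructing DEFAULT_REGISTRY directly from SemanticType
+#     # and SemanticTypeRegistry as done at the bottom of this file.
+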
+from orcapod.utils.object_spec import parse_objectspec + + +from orcapod.types.semantic_types import ( + SemanticTypeRegistry, + SemanticType, + CanonicalPath, + PathlibPathConverter, + ArrowStringPathConverter, +) + +CURRENT_VERSION = "v0.1" + + +semantic_path_objectspec = { + "v0.1": { + "_class": "orcapod.types.semantic_types.SemanticType", + "_config": { + "name": "path", + "description": "File system path representation", + "python_converters": [ + { + "_class": "orcapod.types.semantic_types.PathlibPathConverter", + } + ], + "arrow_converters": [ + { + "_class": "orcapod.types.semantic_types.ArrowStringPathConverter", + } + ], + }, + } +} + +semantic_registry_objectspec = { + "v0.1": { + "_class": "orcapod.types.semantic_types.SemanticTypeRegistry", + "_config": {"semantic_types": [semantic_path_objectspec["v0.1"]]}, + } +} + + +SEMANTIC_PATH = SemanticType[CanonicalPath]( + "path", + "File system path representation", + python_converters=[PathlibPathConverter()], + arrow_converters=[ArrowStringPathConverter()], +) + +DEFAULT_REGISTRY = SemanticTypeRegistry([SEMANTIC_PATH]) diff --git a/src/orcapod/types/packets.py b/src/orcapod/types/legacy/packets.py similarity index 99% rename from src/orcapod/types/packets.py rename to src/orcapod/types/legacy/packets.py index a5836b1..7950d5b 100644 --- a/src/orcapod/types/packets.py +++ b/src/orcapod/types/legacy/packets.py @@ -2,7 +2,7 @@ from typing import TypeAlias, Any from collections.abc import Mapping, Collection from orcapod.types.core import TypeSpec, Tag, TypeHandler -from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types.legacy.semantic_type_registry import SemanticTypeRegistry from orcapod.types import schemas from orcapod.types.typespec_utils import get_typespec_from_dict import pyarrow as pa diff --git a/src/orcapod/types/semantic_type_handlers.py b/src/orcapod/types/legacy/semantic_type_handlers.py similarity index 100% rename from src/orcapod/types/semantic_type_handlers.py rename to src/orcapod/types/legacy/semantic_type_handlers.py diff --git a/src/orcapod/types/semantic_type_registry.py b/src/orcapod/types/legacy/semantic_type_registry.py similarity index 99% rename from src/orcapod/types/semantic_type_registry.py rename to src/orcapod/types/legacy/semantic_type_registry.py index 2091904..6934bae 100644 --- a/src/orcapod/types/semantic_type_registry.py +++ b/src/orcapod/types/legacy/semantic_type_registry.py @@ -1,6 +1,6 @@ import logging import pyarrow as pa -from .core import TypeHandler +from ..core import TypeHandler from dataclasses import dataclass # This mapping is expected to be stable diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py index 31e56d5..57f0551 100644 --- a/src/orcapod/types/schemas.py +++ b/src/orcapod/types/schemas.py @@ -1,5 +1,10 @@ +from typing import Self from orcapod.types.core import DataType, TypeSpec -from orcapod.types.semantic_type_registry import SemanticTypeRegistry +from orcapod.types.semantic_types import ( + SemanticType, + SemanticTypeRegistry, + PythonArrowConverter, +) import pyarrow as pa import datetime @@ -58,23 +63,140 @@ class PythonSchema(dict[str, DataType]): {'name': , 'age': } """ - @property - def with_source_info(self) -> dict[str, type]: + def copy(self) -> "PythonSchema": + return PythonSchema(self) + + def to_semantic_schema( + self, semantic_type_registry: SemanticTypeRegistry + ) -> "SemanticSchema": """ - Get the schema with source info fields included. 
+ Convert the Python schema to a semantic schema using the provided semantic type registry. + + Parameters + ---------- + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. Returns ------- - dict[str, type|None] - A new schema including source info fields. + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + + Examples + -------- + >>> python_schema = PythonSchema(name=str, age=int) + >>> semantic_schema = python_schema.to_semantic_schema(registry) + >>> print(semantic_schema) + {'name': (str, None), 'age': (int, None)} """ - return {**self, **{f"_source_info_{k}": str for k in self.keys()}} + return SemanticSchema.from_typespec(self, semantic_type_registry) - def copy(self) -> "PythonSchema": - return PythonSchema(self) + def to_arrow_schema( + self, + semantic_type_registry: SemanticTypeRegistry | None = None, + converters: dict[str, PythonArrowConverter] | None = None, + ) -> pa.Schema: + """ + Convert the Python schema to an Arrow schema. + If converters are provided, they are used to convert the schema. Note that + no validation is performed on the converters, so they must be compatible with the schema. + """ + if converters is not None: + # If converters are provided, use them to convert the schema + fields = [] + for field_name, python_type in self.items(): + if field_name in converters: + converter = converters[field_name] + arrow_type = converter.arrow_type + metadata = None + if converter.semantic_type_name is not None: + metadata = { + b"semantic_type": converter.semantic_type_name.encode( + "utf-8" + ) + } + else: + arrow_type = python_to_arrow_type(python_type) + metadata = None + fields.append(pa.field(field_name, arrow_type, metadata=metadata)) + return pa.schema(fields) + + if semantic_type_registry is None: + raise ValueError( + "semantic_type_registry must be provided if converters are not" + ) + # Otherwise, convert using the semantic type registry + return self.to_semantic_schema(semantic_type_registry).to_arrow_schema() + + @classmethod + def from_semantic_schema(cls, semantic_schema: "SemanticSchema") -> Self: + """ + Create a PythonSchema from a SemanticSchema. + + Parameters + ---------- + semantic_schema : SemanticSchema + The semantic schema to convert. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + return cls(semantic_schema.get_python_types()) + + @classmethod + def from_arrow_schema( + cls, + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry | None = None, + converters: dict[str, PythonArrowConverter] | None = None, + ) -> Self: + """ + Create a PythonSchema from an Arrow schema. + + Parameters + ---------- + arrow_schema : pa.Schema + The Arrow schema to convert. + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. + skip_system_columns : bool, optional + Whether to skip system columns (default is True). + converters : dict[str, PythonArrowConverter], optional + A dictionary of converters to use for converting the schema. If provided, the schema will be + converted using the converters. If not provided, the schema will be converted using the semantic type + registry. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. 
+ """ + if converters is not None: + # If converters are provided, use them to convert the schema + python_types = {} + for field in arrow_schema: + # TODO: consider performing validation of semantic type + if field.name in converters: + converter = converters[field.name] + python_types[field.name] = converter.python_type + else: + python_types[field.name] = arrow_to_python_type(field.type) + return cls(python_types) + + if semantic_type_registry is None: + raise ValueError( + "semantic_type_registry must be provided if converters are not" + ) + semantic_schema = SemanticSchema.from_arrow_schema( + arrow_schema, + semantic_type_registry, + ) + return cls(semantic_schema.get_python_types()) -class SemanticSchema(dict[str, tuple[type, str | None]]): +class SemanticSchema(dict[str, type | SemanticType]): """ A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. @@ -84,275 +206,152 @@ class SemanticSchema(dict[str, tuple[type, str | None]]): ---------- keys : str The keys of the schema. - values : tuple[type, str|None] - The types and optional semantic type corresponding to each key. + values : type | SemanticType + Either type for simple fields or SemanticType for semantic fields. Examples -------- - >>> schema = SemanticSchema(image=(str, 'path'), age=(int, None)) + >>> schema = SemanticSchema(image=SemanticType('path'), age=int) >>> print(schema) - {'image': (, 'path'), 'age': (, None)} + {"image": SemanticType(name='path'), "age": })} """ - def get_store_type(self, key: str) -> type | None: + def get_semantic_fields(self) -> dict[str, SemanticType]: """ - Get the storage type for a given key in the schema. - - Parameters - ---------- - key : str - The key for which to retrieve the storage type. + Get a dictionary of semantic fields in the schema. Returns ------- - type | None - The storage type associated with the key, or None if not found. + dict[str, SemanticType] + A dictionary mapping keys to their corresponding SemanticType. """ - return self.get(key, (None, None))[0] + return {k: v for k, v in self.items() if isinstance(v, SemanticType)} - def get_semantic_type(self, key: str) -> str | None: + def get_python_types(self) -> dict[str, type]: """ - Get the semantic type for a given key in the schema. - - Parameters - ---------- - key : str - The key for which to retrieve the semantic type. + Get the Python types for all keys in the schema. Returns ------- - str | None - The semantic type associated with the key, or None if not found. + dict[str, type] + A dictionary mapping keys to their corresponding Python types. """ - return self.get(key, (None, None))[1] + return { + k: v.get_default_python_type() if isinstance(v, SemanticType) else v + for k, v in self.items() + } - @property - def storage_schema(self) -> PythonSchema: + def get_arrow_types(self) -> dict[str, tuple[pa.DataType, str | None]]: """ - Get the storage schema, which is a PythonSchema representation of the semantic schema. + Get the Arrow types for all keys in the schema. Returns ------- - PythonSchema - A new schema mapping keys to Python types. + dict[str, tuple[pa.DataType, str|None]] + A dictionary mapping keys to tuples of Arrow types. If the field has a semantic type, + the second element of the tuple is the semantic type name; otherwise, it is None. 
""" - return PythonSchema({k: v[0] for k, v in self.items()}) - - @property - def storage_schema_with_source_info(self) -> dict[str, type]: + return { + k: (v.get_default_arrow_type(), v.name) + if isinstance(v, SemanticType) + else (python_to_arrow_type(v), None) + for k, v in self.items() + } + + def to_arrow_schema(self) -> pa.Schema: """ - Get the storage schema with source info fields included. + Get the Arrow schema, which is a PythonSchema representation of the semantic schema. Returns ------- - dict[str, type] - A new schema including source info fields. - - Examples - -------- - >>> semantic_schema = SemanticSchema(name=(str, 'name'), age=(int, None)) - >>> storage_schema = semantic_schema.storage_schema_with_source_info - >>> print(storage_schema) - {'name': , 'age': , '_source_info_name': , '_source_info_age': } + PythonSchema + A new schema mapping keys to Python types. """ - return self.storage_schema.with_source_info - - -def from_typespec_to_semantic_schema( - typespec: TypeSpec, - semantic_type_registry: SemanticTypeRegistry, -) -> SemanticSchema: - """ - Convert a Python schema to a semantic schema using the provided semantic type registry. - - Parameters - ---------- - typespec : TypeSpec - The typespec to convert, mapping keys to Python types. - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - - Examples - -------- - >>> typespec: TypeSpec = dict(name=str, age=int) - >>> semantic_schema = from_typespec_to_semanticn_schema(typespec, registry) - >>> print(semantic_schema) - {'name': (, None), 'age': (, None)} - """ - semantic_schema = {} - for key, python_type in typespec.items(): - if python_type in semantic_type_registry: - type_info = semantic_type_registry.get_type_info(python_type) - assert type_info is not None, ( - f"Type {python_type} should be found in the registry as `in` returned True" - ) - semantic_schema[key] = (type_info.storage_type, type_info.semantic_type) - else: - semantic_schema[key] = (python_type, None) - return SemanticSchema(semantic_schema) - - -def from_semantic_schema_to_python_schema( - semantic_schema: SemanticSchema, - semantic_type_registry: SemanticTypeRegistry, -) -> PythonSchema: - """ - Convert a semantic schema to a Python schema using the provided semantic type registry. - - Parameters - ---------- - semantic_schema : SemanticSchema - The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. 
- - Examples - -------- - >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) - >>> python_schema = from_semantic_schema_to_python_schema(semantic_schema, registry) - >>> print(python_schema) - {'name': , 'age': } - """ - python_schema_content = {} - for key, (python_type, semantic_type) in semantic_schema.items(): - if semantic_type is not None: - # If the semantic type is registered, use the corresponding Python type - python_type = semantic_type_registry.get_python_type(semantic_type) - python_schema_content[key] = python_type - return PythonSchema(python_schema_content) - - -def from_semantic_schema_to_arrow_schema( - semantic_schema: SemanticSchema, - include_source_info: bool = True, -) -> pa.Schema: - """ - Convert a semantic schema to an Arrow schema. - - Parameters - ---------- - semantic_schema : SemanticSchema - The schema to convert, mapping keys to tuples of Python types and optional semantic type identifiers. - - Returns - ------- - dict[str, type] - A new schema mapping keys to Arrow-compatible types. - - Examples - -------- - >>> semantic_schema = SemanticSchema(name=(str, None), age=(int, None)) - >>> arrow_schema = from_semantic_schema_to_arrow_schema(semantic_schema) - >>> print(arrow_schema) - {'name': str, 'age': int} - """ - fields = [] - for field_name, (python_type, semantic_type) in semantic_schema.items(): - arrow_type = DEFAULT_ARROW_TYPE_LUT[python_type] - field_metadata = ( - {b"semantic_type": semantic_type.encode("utf-8")} if semantic_type else {} - ) - fields.append(pa.field(field_name, arrow_type, metadata=field_metadata)) - - if include_source_info: - for field in semantic_schema: - field_metadata = {b"field_type": b"source_info"} - fields.append( - pa.field( - f"_source_info_{field}", pa.large_string(), metadata=field_metadata + fields = [] + for k, (arrow_type, semantic_type_name) in self.get_arrow_types().items(): + if semantic_type_name is not None: + field = pa.field( + k, + arrow_type, + metadata={b"semantic_type": semantic_type_name.encode("utf-8")}, ) - ) - - return pa.schema(fields) + else: + field = pa.field(k, arrow_type) + fields.append(field) + return pa.schema(fields) -def from_arrow_schema_to_semantic_schema( - arrow_schema: pa.Schema, -) -> SemanticSchema: - """ - Convert an Arrow schema to a semantic schema. - - Parameters - ---------- - arrow_schema : pa.Schema - The schema to convert, containing fields with metadata. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + def to_python_schema(self) -> PythonSchema: + """ + Get the Python schema, which is a PythonSchema representation of the semantic schema. - Examples - -------- - >>> arrow_schema = pa.schema([pa.field('name', pa.string(), metadata={'semantic_type': 'name'}), - ... 
pa.field('age', pa.int64(), metadata={'semantic_type': 'age'})]) - >>> semantic_schema = from_arrow_schema_to_semantic_schema(arrow_schema) - >>> print(semantic_schema) - {'name': (str, 'name'), 'age': (int, 'age')} - """ - semantic_schema = {} - for field in arrow_schema: - if field.name.startswith("_source_info_") or ( - field.metadata and field.metadata.get(b"field_type", b"") == b"source_info" - ): - # Skip source info fields - continue - - semantic_type = None - if field.metadata is not None: - semantic_type = field.metadata.get(b"semantic_type", None) - semantic_type = semantic_type.decode() if semantic_type else None - python_type = arrow_to_python_type(field.type) - semantic_schema[field.name] = (python_type, semantic_type) - return SemanticSchema(semantic_schema) - - -def from_typespec_to_arrow_schema( - typespec: TypeSpec, - semantic_type_registry: SemanticTypeRegistry, - include_source_info: bool = True, -) -> pa.Schema: - semantic_schema = from_typespec_to_semantic_schema(typespec, semantic_type_registry) - return from_semantic_schema_to_arrow_schema( - semantic_schema, include_source_info=include_source_info - ) - - -def from_arrow_schema_to_python_schema( - arrow_schema: pa.Schema, - semantic_type_registry: SemanticTypeRegistry, -) -> PythonSchema: - """ - Convert an Arrow schema to a Python schema. + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + return PythonSchema.from_semantic_schema(self) + + @classmethod + def from_arrow_schema( + cls, + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry, + ) -> Self: + """ + Create a SemanticSchema from an Arrow schema. - Parameters - ---------- - arrow_schema : pa.Schema - The schema to convert, containing fields with metadata. + Parameters + ---------- + arrow_schema : pa.Schema + The Arrow schema to convert. - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. + Returns + ------- + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. 
+ """ - Examples - -------- - >>> arrow_schema = pa.schema([pa.field('name', pa.string()), pa.field('age', pa.int64())]) - >>> python_schema = from_arrow_schema_to_python_schema(arrow_schema) - >>> print(python_schema) - {'name': , 'age': } - """ - semantic_schema = from_arrow_schema_to_semantic_schema(arrow_schema) - return from_semantic_schema_to_python_schema( - semantic_schema, semantic_type_registry - ) + semantic_schema = {} + for field in arrow_schema: + field_type = None + if field.metadata is not None: + semantic_type_name = field.metadata.get(b"semantic_type", b"").decode() + if semantic_type_name: + semantic_type = semantic_type_registry.get_semantic_type( + semantic_type_name + ) + if semantic_type is None: + raise ValueError( + f"Semantic type '{semantic_type_name}' not found in registry" + ) + if not semantic_type.supports_arrow_type(field.type): + raise ValueError( + f"Semantic type '{semantic_type.name}' does not support Arrow field of type '{field.type}'" + ) + field_type = semantic_type + + if ( + field_type is None + ): # was not set to semantic type, so fallback to simple conversion + field_type = arrow_to_python_type(field.type) + + semantic_schema[field.name] = field_type + return cls(semantic_schema) + + @classmethod + def from_typespec( + cls, + typespec: TypeSpec, + semantic_type_registry: SemanticTypeRegistry, + ) -> Self: + semantic_schema = {} + for key, python_type in typespec.items(): + semantic_type = semantic_type_registry.get_semantic_type_for_python_type( + python_type + ) + if semantic_type is not None: + semantic_schema[key] = semantic_type + else: + semantic_schema[key] = python_type + return cls(semantic_schema) diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py new file mode 100644 index 0000000..8dc0df1 --- /dev/null +++ b/src/orcapod/types/semantic_converter.py @@ -0,0 +1,86 @@ +from orcapod.types.semantic_types import PythonArrowConverter +from orcapod.types.schemas import PythonSchema, SemanticSchema +from orcapod.types import typespec_utils as tsutils + +from typing import Any, Mapping, Self +import pyarrow as pa +import logging + +logger = logging.getLogger(__name__) + + +class SemanticConverter: + @classmethod + def from_semantic_schema(cls, semantic_schema: SemanticSchema) -> Self: + converter_lut = {} + for ( + field, + semantic_type, + ) in semantic_schema.get_semantic_fields().items(): + converter_lut[field] = PythonArrowConverter.from_semantic_type( + semantic_type + ) + return cls(converter_lut) + + def __init__( + self, + converter_lut: dict[str, PythonArrowConverter], + ): + self._converter_lut = converter_lut + + def from_python_to_arrow_schema(self, python_schema: PythonSchema) -> pa.Schema: + """Convert a Python schema to an Arrow schema""" + return python_schema.to_arrow_schema(converters=self._converter_lut) + + def from_arrow_to_python_schema(self, arrow_schema: pa.Schema) -> PythonSchema: + """Convert an Arrow schema to a Python schema""" + return PythonSchema.from_arrow_schema( + arrow_schema, converters=self._converter_lut + ) + + def from_python_to_arrow( + self, python_data: Mapping[str, Any], python_schema: PythonSchema | None = None + ) -> pa.Table: + """Convert a dictionary of Python values to Arrow arrays""" + if python_schema is None: + # infer schema from data + python_schema = PythonSchema(tsutils.get_typespec_from_dict(python_data)) + logger.warning( + f"Inferred schema {python_schema} from Python data {python_data}. Note that this may not behave as expected." 
+ ) + + arrow_schema = self.from_python_to_arrow_schema(python_schema) + + arrow_data = {} + for field, value in python_data.items(): + if field in self._converter_lut: + converter = self._converter_lut[field] + arrow_data[field] = converter.from_python_to_arrow(value) + else: + arrow_data[field] = [value] + return pa.Table.from_pydict(arrow_data, schema=arrow_schema) + + def from_arrow_to_python(self, arrow_data: pa.Table) -> list[dict[str, Any]]: + """Convert a dictionary of Arrow arrays to Python values""" + + values = [] + for column_name in arrow_data.column_names: + column = arrow_data[column_name] + if column_name not in self._converter_lut: + values.append(column.to_pylist()) + else: + converter = self._converter_lut[column_name] + values.append(converter.from_arrow_to_python(column)) + all_entries = [] + + for entry in zip(*values): + assert len(entry) == len(arrow_data.column_names), ( + "Mismatch in number of columns and values" + ) + all_entries.append(dict(zip(arrow_data.column_names, entry))) + + return all_entries + + def as_dict(self) -> dict[str, PythonArrowConverter]: + """Return the converter lookup table as a dictionary.""" + return self._converter_lut.copy() diff --git a/src/orcapod/types/semantic_types.py b/src/orcapod/types/semantic_types.py new file mode 100644 index 0000000..169da69 --- /dev/null +++ b/src/orcapod/types/semantic_types.py @@ -0,0 +1,569 @@ +from typing import Any, Self, cast +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path +import pyarrow as pa +from collections.abc import Collection + + +# Converter interfaces using modern generics with ABC +class PythonConverter[T, R](ABC): + """ + Abstract base class for converters between canonical and Python representation types. + T: canonical type, R: Python representation type + """ + + def __init__(self): + # Automatically infer types from inheritance + self._python_type = self._infer_python_type() + + def _infer_python_type(self) -> type[R]: + """Infer the Python type from __orig_bases__""" + for base in getattr(self.__class__, "__orig_bases__", []): + if hasattr(base, "__origin__") and issubclass( + base.__origin__, PythonConverter + ): + # Get the R type parameter (second argument) + args = getattr(base, "__args__", ()) + if len(args) >= 2: + return args[1] # R is the second type parameter + raise RuntimeError(f"Could not infer Python type for {self.__class__.__name__}") + + @abstractmethod + def to_canonical(self, value: R) -> T: + """Convert from Python representation to canonical form""" + pass + + @abstractmethod + def from_canonical(self, value: T) -> R: + """Convert from canonical to Python representation form""" + pass + + @abstractmethod + def can_handle(self, python_type: type) -> bool: ... + + def get_python_type(self) -> type[R]: + """Get the Python type this converter converts into (auto-inferred)""" + return self._python_type + + +class ArrowConverter[T](ABC): + """ + Abstract base class for converters between canonical and Arrow representation types. + T: canonical type + """ + + @abstractmethod + def to_canonical(self, value: pa.Array) -> list[T]: + """Convert from Arrow representation to canonical form""" + pass + + @abstractmethod + def from_canonical(self, value: T | Collection[T]) -> pa.Array: + """Convert from canonical to Arrow representation""" + pass + + @abstractmethod + def can_handle(self, arrow_type: pa.DataType) -> bool: ... 
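A minimal usage sketch of the SemanticConverter defined in semantic_converter.py above, assuming the orcapod.types modules introduced in this patch are importable and that the default registry maps pathlib.Path to a path semantic type; the field names and file path below are hypothetical:

    from pathlib import Path

    from orcapod.types import default_registry
    from orcapod.types.schemas import PythonSchema
    from orcapod.types.semantic_converter import SemanticConverter

    # Build a converter from the Python-level schema of a record.
    python_schema = PythonSchema({"sample_id": str, "raw_file": Path})
    converter = SemanticConverter.from_semantic_schema(
        python_schema.to_semantic_schema(semantic_type_registry=default_registry)
    )

    # Round-trip one record through Arrow: the Path field goes through its
    # registered semantic converter, the plain str field passes through as-is.
    table = converter.from_python_to_arrow(
        {"sample_id": "s001", "raw_file": Path("data/s001.bin")}, python_schema
    )
    assert converter.from_arrow_to_python(table)[0]["raw_file"] == Path("data/s001.bin")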
+ + @abstractmethod + def get_arrow_type(self) -> pa.DataType: + """Get the Arrow DataType this converter handles""" + pass + + +# Canonical types with explicit definitions +@dataclass(frozen=True) +class CanonicalPath: + """Canonical representation of a file system path""" + + path_str: str + is_absolute: bool = False + + def __str__(self) -> str: + return self.path_str + + def __post_init__(self) -> None: + if not self.path_str: + raise ValueError("Path string cannot be empty") + + +@dataclass(frozen=True) +class CanonicalTimestamp: + """Canonical representation of a timestamp""" + + timestamp: int + timezone: str = "UTC" + + def __post_init__(self) -> None: + if self.timestamp < 0: + raise ValueError("Timestamp cannot be negative") + + +@dataclass(frozen=True) +class CanonicalURL: + """Canonical representation of a URL""" + + url: str + scheme: str + host: str + + def __post_init__(self) -> None: + if not self.url.startswith(f"{self.scheme}://"): + raise ValueError(f"URL must start with {self.scheme}://") + + +# Python converters for Path +class PathlibPathConverter(PythonConverter[CanonicalPath, Path]): + """Converter for pathlib.Path objects""" + + def to_canonical(self, value: Path) -> CanonicalPath: + return CanonicalPath(path_str=str(value), is_absolute=value.is_absolute()) + + def from_canonical(self, value: CanonicalPath) -> Path: + return Path(value.path_str) + + def can_handle(self, python_type: type) -> bool: + return issubclass(python_type, Path) + + +# Arrow converters for Path +class ArrowStringPathConverter(ArrowConverter[CanonicalPath]): + """Converter for Arrow string representation of paths""" + + def to_canonical(self, value: pa.Array) -> list[CanonicalPath]: + return [ + CanonicalPath(v, is_absolute=Path(v).is_absolute()) + for v in value.to_pylist() + ] + + def from_canonical( + self, value: CanonicalPath | Collection[CanonicalPath] + ) -> pa.Array: + if isinstance(value, CanonicalPath): + value = [value] + return pa.array([v.path_str for v in value], type=pa.large_string()) + + def can_handle(self, arrow_type: pa.DataType) -> bool: + return arrow_type == pa.large_string() + + def get_arrow_type(self) -> pa.DataType: + return pa.large_string() + + +# Enhanced SemanticType with explicit Python and Arrow handling +class SemanticType[T]: + """ + Represents a semantic type with explicit Python/Arrow converters. + + A SemanticType is a central concept that: + 1. Defines a canonical representation (T) for a domain concept + 2. Manages separate Python and Arrow converters + 3. Provides explicit methods for Python and Arrow operations + 4. Maintains type safety while allowing runtime discovery + + Type parameter T represents the canonical representation type. 
+ """ + + def __init__( + self, + name: str, + description: str = "", + python_converters: Collection[PythonConverter[T, Any]] | None = None, + arrow_converters: Collection[ArrowConverter[T]] | None = None, + ): + self.name = name + self.description = description + + self._python_type_converters: list[PythonConverter[T, Any]] = [] + self._arrow_type_converters: list[ArrowConverter[T]] = [] + + # Default converters + self._default_python_converter: PythonConverter[T, Any] | None = None + self._default_arrow_converter: ArrowConverter[T] | None = None + + if python_converters is not None: + for converter in python_converters: + self.register_python_converter( + converter, + set_default=self._default_python_converter is None, + force=False, + ) + + if arrow_converters is not None: + for converter in arrow_converters: + self.register_arrow_converter( + converter, + set_default=self._default_arrow_converter is None, + force=False, + ) + + def get_default_python_type(self) -> type[T]: + """Get the default Python type for this semantic type""" + if self._default_python_converter: + return self._default_python_converter.get_python_type() + raise ValueError( + f"No default Python converter registered for semantic type '{self.name}'" + ) + + def get_default_arrow_type(self) -> pa.DataType: + """Get the default Arrow DataType for this semantic type""" + if self._default_arrow_converter: + return self._default_arrow_converter.get_arrow_type() + raise ValueError( + f"No default Arrow converter registered for semantic type '{self.name}'" + ) + + def register_python_converter[R]( + self, + converter: PythonConverter[T, R], + set_default: bool = False, + force: bool = False, + ): + """ + Register a Python converter + """ + if converter not in self._python_type_converters: + self._python_type_converters.append(converter) + + if set_default: + if self._default_python_converter is not None and not force: + raise ValueError( + f"Default Python converter already set for semantic type '{self.name}'" + ) + self._default_python_converter = converter + + def register_arrow_converter( + self, + converter: ArrowConverter[T], + set_default: bool = False, + force: bool = False, + ) -> None: + """Register an Arrow converter""" + if converter not in self._arrow_type_converters: + self._arrow_type_converters.append(converter) + + if set_default: + if self._default_arrow_converter is not None and not force: + raise ValueError( + f"Default Arrow converter already set for semantic type '{self.name}'" + ) + self._default_arrow_converter = converter + + # Python-specific methods + def get_python_converter_for_type( + self, python_type: type + ) -> PythonConverter[T, Any] | None: + """Find a Python converter that can handle the given type""" + for converter in self._python_type_converters: + if converter.can_handle(python_type): + return converter + return None + + def get_arrow_converter_for_type( + self, arrow_type: pa.DataType + ) -> ArrowConverter[T] | None: + """Find an Arrow converter for the given Arrow DataType""" + for converter in self._arrow_type_converters: + if converter.can_handle(arrow_type): + return converter + return None + + def get_python_converter_with_output_type( + self, output_type: type + ) -> PythonConverter[T, Any] | None: + """Get a Python converter that can handle the specified output type""" + for converter in self._python_type_converters: + if issubclass(converter.get_python_type(), output_type): + return converter + return None + + def get_arrow_converter_with_output_type( + self, output_type: 
pa.DataType + ) -> ArrowConverter[T] | None: + for converter in self._arrow_type_converters: + if output_type == converter.get_arrow_type(): + return converter + return None + + def supports_python_type(self, python_type: type) -> bool: + return self.get_python_converter_for_type(python_type) is not None + + def supports_arrow_type(self, arrow_type: pa.DataType) -> bool: + return self.get_arrow_converter_for_type(arrow_type) is not None + + @property + def default_python_converter(self) -> PythonConverter[T, Any] | None: + """Get the default Python converter""" + return self._default_python_converter + + @property + def default_arrow_converter(self) -> ArrowConverter[T] | None: + return self._default_arrow_converter + + def to_canonical_from_python(self, value: Any) -> T: + """Convert Python value to canonical form""" + converter = self.get_python_converter_for_type(type(value)) + if not converter: + raise ValueError( + f"No Python converter found for {type(value)} in semantic type '{self.name}'" + ) + + return converter.to_canonical(value) + + def from_canonical_to_python( + self, value: T, target_type: type | None = None + ) -> Any: + """Convert from canonical to Python representation""" + if target_type is None: + converter = self.default_python_converter + if not converter: + raise ValueError( + f"No default Python converter for semantic type '{self.name}'" + ) + else: + converter = self.get_python_converter_for_type(target_type) + if not converter: + raise ValueError( + f"No converter found for target type '{target_type}' in semantic type '{self.name}'" + ) + + return converter.from_canonical(value) + + def to_canonical_from_arrow(self, value: pa.Array) -> list[T]: + """Convert Arrow value to canonical form using explicit Arrow DataType""" + converter = self.get_arrow_converter_for_type(value.type) + if not converter: + raise ValueError( + f"No Arrow converter found for type '{value.type}' in semantic type '{self.name}'" + ) + + canonical = converter.to_canonical(value) + + return canonical + + def from_canonical_to_arrow( + self, value: T, target_type: pa.DataType | None = None + ) -> Any: + """Convert from canonical to Arrow representation using explicit Arrow DataType""" + + if target_type is None: + converter = self.default_arrow_converter + if not converter: + raise ValueError( + f"No default Arrow converter for semantic type '{self.name}'" + ) + else: + converter = self.get_arrow_converter_for_type(target_type) + if not converter: + raise ValueError( + f"No Arrow converter found for target type '{target_type}' in semantic type '{self.name}'" + ) + + return converter.from_canonical(value) + + def get_python_types(self) -> list[type]: + """Get all supported output Python DataTypes""" + return [ + converter.get_python_type() for converter in self._python_type_converters + ] + + def get_arrow_types(self) -> list[pa.DataType]: + """Get all supported output Arrow DataTypes""" + return [converter.get_arrow_type() for converter in self._arrow_type_converters] + + # Cross-system conversion methods + def convert_python_to_arrow( + self, python_value: Any, arrow_type: pa.DataType | None = None + ) -> Any: + """Convert directly from Python to Arrow representation""" + canonical = self.to_canonical_from_python(python_value) + return self.from_canonical_to_arrow(canonical, arrow_type) + + def convert_arrow_to_python( + self, arrow_value, python_type: type | None = None + ) -> list[Any]: + """Convert directly from Arrow to Python representation""" + canonical_values = 
self.to_canonical_from_arrow(arrow_value) + return [ + self.from_canonical_to_python(value, target_type=python_type) + for value in canonical_values + ] + + def __str__(self) -> str: + return f"SemanticType(name='{self.name}')" + + def __repr__(self) -> str: + python_count = len(self._python_type_converters) + arrow_count = len(self._arrow_type_converters) + return ( + f"SemanticType(name='{self.name}', " + f"python_converters={python_count}, " + f"arrow_converters={arrow_count})" + ) + + +# Registry with explicit Python and Arrow handling +class SemanticTypeRegistry: + """Registry that manages SemanticType objects with explicit Python/Arrow operations""" + + def __init__(self, semantic_types: Collection[SemanticType] | None = None): + self._semantic_type_lut: dict[str, SemanticType] = {} + self._python_to_semantic_lut: dict[type, SemanticType] = {} + if semantic_types is not None: + for semantic_type in semantic_types: + self.register_semantic_type(semantic_type) + + def register_semantic_type[T](self, semantic_type: SemanticType[T]): + """Register a semantic type""" + if semantic_type.name not in self._semantic_type_lut: + self._semantic_type_lut[semantic_type.name] = semantic_type + else: + raise ValueError( + f"Semantic type {self._semantic_type_lut[semantic_type.name]} is already registered for semantic name {semantic_type.name}" + ) + + python_type = semantic_type.get_default_python_type() + if python_type is None: + raise ValueError( + f"Semantic type {semantic_type.name} does not have a default Python type" + ) + if python_type in self._python_to_semantic_lut: + raise ValueError( + f"Python type {python_type} is already registered for semantic type {self._python_to_semantic_lut[python_type]}" + ) + self._python_to_semantic_lut[python_type] = semantic_type + + def get_semantic_type_for_python_type( + self, python_type: type + ) -> SemanticType | None: + """Get a semantic type by Python type""" + return self._python_to_semantic_lut.get(python_type) + + def get_semantic_type(self, name: str) -> SemanticType | None: + """Get a semantic type by name""" + return self._semantic_type_lut.get(name) + + def list_semantic_types(self) -> list[SemanticType]: + """Get all registered semantic types""" + return list(self._semantic_type_lut.values()) + + def supports_python_type(self, python_type: type) -> bool: + """Check if registry supports the given Python type""" + return python_type in self._python_to_semantic_lut + + # Python-specific registry methods + def supports_semantic_and_arrow_type( + self, semantic_type_name: str, arrow_type: pa.DataType + ) -> bool: + """Check if registry supports the given semantic type and Arrow DataType combination""" + semantic_type = self._semantic_type_lut.get(semantic_type_name) + if not semantic_type: + return False + return semantic_type.supports_arrow_type(arrow_type) + + +# Type-safe wrapper for semantic values +class SemanticValue[T]: + """Type-safe wrapper for semantic values""" + + def __init__(self, value: T, semantic_type: SemanticType[T]): + self._value = value + self._semantic_type = semantic_type + + @property + def value(self) -> T: + return self._value + + @property + def semantic_type(self) -> SemanticType[T]: + return self._semantic_type + + def to_python(self) -> Any: + """Convert to Python representation""" + return self._semantic_type.from_canonical_to_python(self._value) + + def to_python_type(self, python_type: type) -> Any: + """Convert to Arrow representation using specific Arrow DataType""" + return 
self._semantic_type.from_canonical_to_arrow(self._value, python_type) + + def to_arrow(self) -> Any: + """Convert to Arrow representation using default dtype""" + return self._semantic_type.from_canonical_to_arrow(self._value) + + def to_arrow_with_type(self, arrow_type: pa.DataType) -> Any: + """Convert to Arrow representation using specific Arrow DataType""" + return self._semantic_type.from_canonical_to_arrow(self._value, arrow_type) + + @classmethod + def from_python(cls, python_value: Any, semantic_type: SemanticType[T]) -> Self: + """Create from a Python value""" + canonical = semantic_type.to_canonical_from_python(python_value) + return cls(canonical, semantic_type) + + @classmethod + def from_arrow(cls, arrow_value: Any, semantic_type: SemanticType[T]) -> Self: + """Create from an Arrow value with explicit Arrow DataType""" + canonical = semantic_type.to_canonical_from_arrow(arrow_value) + if len(canonical) != 1: + raise ValueError( + f"Expected single value from Arrow, got {len(canonical)} values" + ) + return cls(canonical[0], semantic_type) + + def __str__(self) -> str: + return f"SemanticValue({self._value}, {self._semantic_type.name})" + + def __repr__(self) -> str: + return f"SemanticValue(value={self._value!r}, semantic_type={self._semantic_type.name})" + + +class PythonArrowConverter[T, R]: + @classmethod + def from_semantic_type(cls, semantic_type: SemanticType[T]) -> Self: + """Create a PythonArrowConverter from a SemanticType""" + python_converter = semantic_type.default_python_converter + arrow_converter = semantic_type.default_arrow_converter + + if not python_converter or not arrow_converter: + raise ValueError( + f"Semantic type '{semantic_type.name}' does not have default converters" + ) + + return cls(python_converter, arrow_converter, semantic_type.name) + + def __init__( + self, + python_converter: PythonConverter[T, R], + arrow_converter: ArrowConverter[T], + semantic_type_name: str | None = None, + ): + self.python_converter = python_converter + self.arrow_converter = arrow_converter + self.semantic_type_name = semantic_type_name + + @property + def python_type(self) -> type[R]: + """Get the Python type this converter handles""" + return self.python_converter.get_python_type() + + @property + def arrow_type(self) -> pa.DataType: + """Get the Arrow DataType this converter handles""" + return self.arrow_converter.get_arrow_type() + + def from_python_to_arrow(self, python_value: R | Collection[R]) -> pa.Array: + """Convert from Python to Arrow representation""" + if isinstance(python_value, self.python_type): + python_value = [python_value] + assert isinstance(python_value, Collection), ( + "Expected a collection of values at this point" + ) + python_values = cast(Collection[R], python_value) + canonicals = [self.python_converter.to_canonical(val) for val in python_values] + return self.arrow_converter.from_canonical(canonicals) + + def from_arrow_to_python(self, arrow_value: pa.Array) -> list[R]: + """Convert from Arrow to Python representation""" + canonical = self.arrow_converter.to_canonical(arrow_value) + return [self.python_converter.from_canonical(value) for value in canonical] diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index 71318aa..940820f 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -232,34 +232,62 @@ def get_compatible_type(type1: Any, type2: Any) -> Any: raise TypeError(f"Types {type1} and {type2} are not compatible") -def union_typespecs(left: TypeSpec, 
right: TypeSpec) -> TypeSpec: +def union_typespecs(*typespecs: TypeSpec) -> TypeSpec: # Merge the two TypeSpecs but raise an error if conflicts in types are found - merged = dict(left) - for key, right_type in right.items(): - merged[key] = ( - get_compatible_type(merged[key], right_type) - if key in merged - else right_type - ) + merged = dict(typespecs[0]) + for typespec in typespecs[1:]: + for key, right_type in typespec.items(): + merged[key] = ( + get_compatible_type(merged[key], right_type) + if key in merged + else right_type + ) return merged -def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: +def intersection_typespecs(*typespecs: TypeSpec) -> TypeSpec: """ - Returns the intersection of two TypeSpecs, only returning keys that are present in both. + Returns the intersection of all TypeSpecs, only returning keys that are present in all typespecs. If a key is present in both TypeSpecs, the type must be the same. """ # Find common keys and ensure types match - common_keys = set(left.keys()).intersection(set(right.keys())) - intersection = {} - for key in common_keys: - try: - intersection[key] = get_compatible_type(left[key], right[key]) - except TypeError: - # If types are not compatible, raise an error - raise TypeError( - f"Type conflict for key '{key}': {left[key]} vs {right[key]}" - ) + common_keys = set(typespecs[0].keys()) + for typespec in typespecs[1:]: + common_keys.intersection_update(typespec.keys()) + + intersection = {k: typespecs[0][k] for k in common_keys} + for typespec in typespecs[1:]: + for key in common_keys: + try: + intersection[key] = get_compatible_type( + intersection[key], typespec[key] + ) + except TypeError: + # If types are not compatible, raise an error + raise TypeError( + f"Type conflict for key '{key}': {intersection[key]} vs {typespec[key]}" + ) return intersection + + +# def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: +# """ +# Returns the intersection of two TypeSpecs, only returning keys that are present in both. +# If a key is present in both TypeSpecs, the type must be the same. +# """ + +# # Find common keys and ensure types match +# common_keys = set(left.keys()).intersection(set(right.keys())) +# intersection = {} +# for key in common_keys: +# try: +# intersection[key] = get_compatible_type(left[key], right[key]) +# except TypeError: +# # If types are not compatible, raise an error +# raise TypeError( +# f"Type conflict for key '{key}': {left[key]} vs {right[key]}" +# ) + +# return intersection diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py new file mode 100644 index 0000000..5a072de --- /dev/null +++ b/src/orcapod/utils/arrow_utils.py @@ -0,0 +1,126 @@ +# TODO: move this to a separate module + +import pyarrow as pa + + +def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: + """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, + no field names should collide.""" + merged_fields = [] + for schema in schemas: + merged_fields.extend(schema) + return pa.schema(merged_fields) + + +def hstack_tables(*tables: pa.Table) -> pa.Table: + """ + Horizontally stack multiple PyArrow tables by concatenating their columns. + + All input tables must have the same number of rows and unique column names. 
+ + Args: + *tables: Variable number of PyArrow tables to stack horizontally + + Returns: + Combined PyArrow table with all columns from input tables + + Raises: + ValueError: If no tables provided, tables have different row counts, + or duplicate column names are found + """ + if len(tables) == 0: + raise ValueError("At least one table is required for horizontal stacking.") + if len(tables) == 1: + return tables[0] + + N = len(tables[0]) + for table in tables[1:]: + if len(table) != N: + raise ValueError( + "All tables must have the same number of rows for horizontal stacking." + ) + + # create combined column names + all_column_names = [] + all_columns = [] + all_names = set() + for i, table in enumerate(tables): + if overlap := set(table.column_names).intersection(all_names): + raise ValueError( + f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" + ) + all_names.update(table.column_names) + all_column_names += table.column_names + all_columns += table.columns + + return pa.Table.from_arrays(all_columns, names=all_column_names) + + +def check_arrow_schema_compatibility( + incoming_schema: pa.Schema, target_schema: pa.Schema, strict: bool = False +) -> tuple[bool, list[str]]: + # TODO: add strict comparison + """ + Check if incoming schema is compatible with current schema. + + Args: + incoming_schema: Schema to validate + target_schema: Expected schema to match against + strict: If True, requires exact match of field names and types. If False (default), + incoming_schema can have additional fields or different types as long as they are compatible. + + Returns: + Tuple of (is_compatible, list_of_errors) + """ + errors = [] + + # Create lookup dictionaries for efficient access + incoming_fields = {field.name: field for field in incoming_schema} + target_fields = {field.name: field for field in target_schema} + + # Check each field in target_schema + for field_name, target_field in target_fields.items(): + if field_name not in incoming_fields: + errors.append(f"Missing field '{field_name}' in incoming schema") + continue + + incoming_field = incoming_fields[field_name] + + # Check data type compatibility + if not target_field.type.equals(incoming_field.type): + # TODO: if not strict, allow type coercion + errors.append( + f"Type mismatch for field '{field_name}': " + f"expected {target_field.type}, got {incoming_field.type}" + ) + + # Check semantic_type metadata if present in current schema + current_metadata = target_field.metadata or {} + incoming_metadata = incoming_field.metadata or {} + + if b"semantic_type" in current_metadata: + expected_semantic_type = current_metadata[b"semantic_type"] + + if b"semantic_type" not in incoming_metadata: + errors.append( + f"Missing 'semantic_type' metadata for field '{field_name}'" + ) + elif incoming_metadata[b"semantic_type"] != expected_semantic_type: + errors.append( + f"Semantic type mismatch for field '{field_name}': " + f"expected {expected_semantic_type.decode()}, " + f"got {incoming_metadata[b'semantic_type'].decode()}" + ) + elif b"semantic_type" in incoming_metadata: + errors.append( + f"Unexpected 'semantic_type' metadata for field '{field_name}': " + f"{incoming_metadata[b'semantic_type'].decode()}" + ) + + # If strict mode, check for additional fields in incoming schema + if strict: + for field_name in incoming_fields: + if field_name not in target_fields: + errors.append(f"Unexpected field '{field_name}' in incoming schema") + + return len(errors) == 0, errors diff --git 
a/src/orcapod/utils/object_spec.py b/src/orcapod/utils/object_spec.py index dd09e1f..8949622 100644 --- a/src/orcapod/utils/object_spec.py +++ b/src/orcapod/utils/object_spec.py @@ -1,20 +1,29 @@ import importlib +from typing import Any -def parse_objectspec(obj_spec: dict) -> Any: - if "_class" in obj_spec: - # if _class is specified, treat the dict as an object specification - module_name, class_name = obj_spec["_class"].rsplit(".", 1) - module = importlib.import_module(module_name) - cls = getattr(module, class_name) - configs = parse_objectspec(obj_spec.get("config", {})) - return cls(**configs) - else: - # otherwise, parse through the dictionary recursively - parsed_object = obj_spec - for k, v in obj_spec.items(): - if isinstance(v, dict): +def parse_objectspec(obj_spec: Any) -> Any: + if isinstance(obj_spec, dict): + if "_class" in obj_spec: + # if _class is specified, treat the dict as an object specification, looking for + # _config key to extract configuration parameters + module_name, class_name = obj_spec["_class"].rsplit(".", 1) + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + configs = parse_objectspec(obj_spec.get("_config", {})) + return cls(**configs) + else: + # otherwise, parse through the dictionary recursively + parsed_object = obj_spec + for k, v in obj_spec.items(): parsed_object[k] = parse_objectspec(v) - else: - parsed_object[k] = v - return parsed_object + return parsed_object + elif isinstance(obj_spec, list): + # if it's a list, parse each item in the list + return [parse_objectspec(item) for item in obj_spec] + elif isinstance(obj_spec, tuple): + # if it's a tuple, parse each item in the tuple + return tuple(parse_objectspec(item) for item in obj_spec) + else: + # if it's neither a dict nor a list, return it as is + return obj_spec diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index 09d84d7..d7f6a3c 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -13,7 +13,7 @@ LegacyPacketHasher, LegacyPathSetHasher, ) -from orcapod.stores.dict_data_stores import DirDataStore +from orcapod.stores.legacy.dict_data_stores import DirDataStore class MockFileHasher(LegacyFileHasher): diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index 2a6e253..0c50292 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -12,7 +12,7 @@ LegacyDefaultCompositeFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.stores.legacy.dict_data_stores import DirDataStore, NoOpDataStore def test_integration_with_cached_file_hasher(temp_dir, sample_files): diff --git a/tests/test_store/test_noop_data_store.py b/tests/test_store/test_noop_data_store.py index 4ff838f..564b449 100644 --- a/tests/test_store/test_noop_data_store.py +++ b/tests/test_store/test_noop_data_store.py @@ -3,7 +3,7 @@ import pytest -from orcapod.stores.dict_data_stores import NoOpDataStore +from orcapod.stores.legacy.dict_data_stores import NoOpDataStore def test_noop_data_store_memoize(): diff --git a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index 4721691..f4076d6 100644 --- a/tests/test_store/test_transfer_data_store.py +++ b/tests/test_store/test_transfer_data_store.py @@ -6,8 +6,8 @@ import pytest from orcapod.hashing.types import LegacyPacketHasher -from 
orcapod.stores.dict_data_stores import DirDataStore, NoOpDataStore -from orcapod.stores.dict_transfer_data_store import TransferDataStore +from orcapod.stores.legacy.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.stores.legacy.dict_transfer_data_store import TransferDataStore class MockPacketHasher(LegacyPacketHasher): From d6de91e5f38ec1a585efe1a738eab6fbc4ee0d66 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 19 Jul 2025 02:41:36 +0000 Subject: [PATCH 114/224] feat: implement data context to capture shared hashing and semantic context information easily --- src/orcapod/data/context.py | 70 +++++ src/orcapod/data/datagrams.py | 387 ++++++------------------ src/orcapod/data/kernels.py | 17 ++ src/orcapod/data/operators.py | 8 + src/orcapod/data/pods.py | 133 ++++++-- src/orcapod/data/streams.py | 143 +++++++-- src/orcapod/protocols/data_protocols.py | 68 ++++- src/orcapod/types/arrow_utils.py | 129 +++++++- src/orcapod/types/semantic_converter.py | 3 +- src/orcapod/types/typespec_utils.py | 9 +- src/orcapod/utils/arrow_utils.py | 124 ++++++++ 11 files changed, 735 insertions(+), 356 deletions(-) create mode 100644 src/orcapod/data/context.py diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py new file mode 100644 index 0000000..cc47cff --- /dev/null +++ b/src/orcapod/data/context.py @@ -0,0 +1,70 @@ +from typing import Self +from orcapod.types.semantic_types import SemanticTypeRegistry +from orcapod.types import default_registry +from orcapod.protocols import hashing_protocols as hp +from orcapod.hashing.defaults import get_default_arrow_hasher, get_default_object_hasher +from dataclasses import dataclass + + +DATA_CONTEXT_COLUMN = "_orcapod_context_key" + + +@dataclass +class DataContext: + context_key: str + semantic_type_registry: SemanticTypeRegistry + arrow_hasher: hp.ArrowHasher + object_hasher: hp.ObjectHasher + + @staticmethod + def get_data_context_column() -> str: + """ + Returns the column name used to store the data context key in Arrow tables. + """ + return DATA_CONTEXT_COLUMN + + @staticmethod + def resolve_data_context(data_context: "str | DataContext | None") -> "DataContext": + """ + Returns the default data context manager. + This is typically used when no specific context is provided. + """ + return orcapod_system_data_context_manager.resolve_context(data_context) + + +default_data_context = DataContext( + "std:v0.1.0:default", + default_registry, + get_default_arrow_hasher(), + get_default_object_hasher(), +) + + +class DataContextManager(dict[str, DataContext]): + def register_context(self, DataContext): + """ + Register a new DataContext instance. + + Args: + DataContext: The DataContext instance to register. + """ + if DataContext.context_key in self: + raise ValueError( + f"DataContext with key {DataContext.context_key} already exists." 
+ ) + self[DataContext.context_key] = DataContext + + def resolve_context(self, context_info: str | DataContext | None) -> DataContext: + if isinstance(context_info, DataContext): + return context_info + if context_info is None: + return default_data_context + if isinstance(context_info, str): + if context_info in self: + return self[context_info] + else: + raise ValueError(f"DataContext with key {context_info} not found.") + + +orcapod_system_data_context_manager = DataContextManager() +orcapod_system_data_context_manager.register_context(default_data_context) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 5bab7ba..0e8df1e 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -22,9 +22,11 @@ from orcapod.types import TypeSpec from orcapod.types.semantic_converter import SemanticConverter from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import schemas from orcapod.types import typespec_utils as tsutils +from orcapod.data.context import ( + DataContext, +) import pyarrow as pa import logging from orcapod.utils import arrow_utils @@ -48,201 +50,6 @@ PythonStore: TypeAlias = Mapping[str, DataValue] -# class SemanticConverter: -# """ -# Converts data between different representations (Python, semantic stores, Arrow tables). - -# SemanticConverter only tracks the semantic columns to be converted and does not -# enforce any type checking on other columns. Consequently, two completely different -# schemas could share a semantic converter if the have same named fields with identical -# semantic types. Furthermore, semantic types are defined by the association of semantic -# type name with a specific TypeHandler. - -# """ - -# @staticmethod -# def prepare_handler( -# semantic_schema: schemas.SemanticSchema, -# semantic_type_registry: SemanticTypeRegistry, -# ) -> dict[str, TypeHandler]: -# """ -# Prepare type handlers for semantic type conversion. - -# Args: -# semantic_schema: Schema containing semantic type information -# semantic_type_registry: Registry for looking up type handlers - -# Returns: -# Dictionary mapping field names to their type handlers -# """ -# handler_lut = {} -# for key, (_, semantic_type) in semantic_schema.items(): -# if semantic_type is None: -# continue # Skip keys without semantic type -# handler_lut[key] = semantic_type_registry.get_handler_by_semantic_type( -# semantic_type -# ) -# return handler_lut - -# @classmethod -# def from_typespec( -# cls, typespec: TypeSpec, semantic_type_registry: SemanticTypeRegistry -# ) -> "SemanticConverter": -# """ -# Create a SemanticConverter from a basic Python type specification dictionary (TypeSpec). - -# Args: -# typespec: Type specification dictionary -# semantic_type_registry: Registry for semantic type lookup - -# Returns: -# New SemanticConverter instance -# """ -# semantic_schema = schemas.from_typespec_to_semantic_schema( -# typespec, semantic_type_registry -# ) -# python_schema = schemas.PythonSchema(typespec) -# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) -# return cls(python_schema, semantic_schema, handler_lut) - -# @classmethod -# def from_arrow_schema( -# cls, arrow_schema: pa.Schema, semantic_type_registry: SemanticTypeRegistry -# ) -> "SemanticConverter": -# """ -# Create a SemanticConverter from an Arrow schema. 
- -# Args: -# arrow_schema: PyArrow schema with semantic type metadata -# semantic_type_registry: Registry for semantic type lookup - -# Returns: -# New SemanticConverter instance -# """ -# semantic_schema = schemas.from_arrow_schema_to_semantic_schema(arrow_schema) -# python_schema = schemas.from_semantic_schema_to_python_schema( -# semantic_schema, semantic_type_registry=semantic_type_registry -# ) -# handler_lut = cls.prepare_handler(semantic_schema, semantic_type_registry) -# return cls(python_schema, semantic_schema, handler_lut) - -# def __init__( -# self, -# handler_lut: dict[str, tuple[str, TypeHandler]] | None = None, -# ): -# """ -# Initialize SemanticConverter with schemas and type handlers. This is not meant to be called directly. -# Use class methods like `from_arrow_schema` or `from_typespec` instead. - -# Args: -# python_schema: Schema for Python data types -# semantic_schema: Schema for semantic types -# handler_lut: Optional dictionary of type handlers for conversion -# """ -# if handler_lut is None: -# handler_lut = {} -# self.handler_lut = handler_lut - -# def convert_from_semantic_to_python( -# self, semantic_value: Any, semantic_type: SemanticType -# ) -> Any: -# """ -# Convert a semantic value to a Python value. - -# Args: -# semantic_value: Value in semantic (storage-optimized) format -# semantic_type: Corresponding semantic type - -# Returns: -# Value in Python native format -# """ -# handler = self.handler_lut.get(semantic_type) -# if handler: -# return handler.to_canonical(semantic_value) -# return semantic_value - -# def from_semantic_store_to_python_store( -# self, semantic_store: SemanticStore -# ) -> dict[str, DataValue]: -# """ -# Convert a semantic store to a Python store. - -# Args: -# semantic_store: Store (dict) with data stored in semantic (storage-optimized) types - -# Returns: -# Store with Python native types -# """ -# python_store = dict(semantic_store) -# for key, handler in self.handler_lut.items(): -# python_store[key] = handler.storage_to_python(semantic_store[key]) -# # TODO: come up with a more robust handling/conversion -# return cast(dict[str, DataValue], python_store) - -# def from_python_store_to_semantic_store( -# self, python_store: PythonStore -# ) -> SemanticStore: -# """ -# Convert a Python store to a semantic store. 
- -# Args: -# python_store: Store with Python native types - -# Returns: -# Store with semantic (storage-optimized) types -# """ -# semantic_store = dict(python_store) -# for key, handler in self.handler_lut.items(): -# semantic_store[key] = handler.python_to_storage(python_store[key]) -# return semantic_store # type: ignore[return-value] - -# def from_semantic_store_to_arrow_table( -# self, semantic_store: SemanticStore -# ) -> pa.Table: -# """Convert a semantic store to an Arrow table.""" -# return pa.Table.from_pylist([semantic_store], schema=self.arrow_schema) - -# def from_python_store_to_arrow_table(self, python_store: PythonStore) -> pa.Table: -# """Convert a Python store to an Arrow table.""" -# semantic_store = self.from_python_store_to_semantic_store(python_store) -# return self.from_semantic_store_to_arrow_table(semantic_store) - -# def from_arrow_table_to_semantic_stores( -# self, arrow_table: pa.Table -# ) -> list[SemanticStore]: -# """Convert an Arrow table to a list of semantic stores.""" -# self.verify_compatible_arrow_schema(arrow_table.schema) -# return arrow_table.to_pylist() # Ensure the table is materialized - -# def from_arrow_table_to_python_stores( -# self, arrow_table: pa.Table -# ) -> list[dict[str, DataValue]]: -# """Convert an Arrow table to a list of Python stores.""" -# return [ -# self.from_semantic_store_to_python_store(semantic_store) -# for semantic_store in self.from_arrow_table_to_semantic_stores(arrow_table) -# ] - -# def verify_compatible_arrow_schema(self, arrow_schema: pa.Schema): -# """ -# Verify that an Arrow schema is compatible with the expected schema. - -# Args: -# arrow_schema: Schema to verify - -# Raises: -# ValueError: If schemas are incompatible -# """ -# compatible, errors = check_arrow_schema_compatibility( -# arrow_schema, self.arrow_schema -# ) -# if not compatible: -# raise ValueError( -# "Arrow table schema is not compatible with the expected schema: " -# + ", ".join(errors) -# ) - - class ImmutableDict(Mapping[str, DataValue]): """ An immutable dictionary-like container for DataValues. 
@@ -299,52 +106,54 @@ def __init__( data: Mapping[str, DataValue], typespec: TypeSpec | None = None, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: - # normalize the data content and remove any source info keys super().__init__(data) + # normalize the data content and remove any source info keys + self._data_context = DataContext.resolve_data_context(data_context) # combine provided typespec info with inferred typespec from content - verified_typespec = {} - if typespec is not None: - verified_typespec = dict(typespec) - # TODO: enhance get_typespec_from_dict to also use info from supplied typespec dict - inferred_typespec = tsutils.get_typespec_from_dict(self) - for key in self: - if key not in verified_typespec: - verified_typespec[key] = inferred_typespec[key] - self._python_schema = schemas.PythonSchema(verified_typespec) + inferred_typespec = tsutils.get_typespec_from_dict(self, typespec) + self._python_schema = schemas.PythonSchema(inferred_typespec) # create semantic converter if semantic_converter is None: semantic_converter = SemanticConverter.from_semantic_schema( self._python_schema.to_semantic_schema( - semantic_type_registry=semantic_type_registry + semantic_type_registry=self._data_context.semantic_type_registry ), ) self.semantic_converter = semantic_converter - self._arrow_hasher = arrow_hasher - self._cached_table: pa.Table | None = None self._cached_content_hash: str | None = None - def as_table( - self, - ) -> pa.Table: + @property + def data_context_key(self) -> str: + """Return the context key of the datagram.""" + return self._data_context.context_key + + def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" if self._cached_table is None: + typespec = self.types() + typespec[DataContext.get_data_context_column()] = str self._cached_table = self.semantic_converter.from_python_to_arrow( - self, self.types() + self.as_dict(include_data_context=True), typespec ) - assert self._cached_table is not None, "Cached table should not be None" - return self._cached_table + assert self._cached_table is not None, "Cached table should not be None" + if include_data_context: + return self._cached_table - def as_dict(self) -> dict[str, DataValue]: + return self._cached_table.drop([DataContext.get_data_context_column()]) + + def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: """Return dictionary representation of the datagram.""" - return dict(self) + data = dict(self) + if include_data_context: + data[DataContext.get_data_context_column()] = self._data_context.context_key + return data def content_hash( self, @@ -356,18 +165,12 @@ def content_hash( Hash string of the datagram content """ if self._cached_content_hash is None: - if self._arrow_hasher is None: - raise ValueError( - "Arrow hasher must be provided to calculate content hash." 
- ) - self._cached_content_hash = self._arrow_hasher.hash_table( - self.as_table(), + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_data_context=False), prefix_hasher_id=True, ) return self._cached_content_hash - # use keys() implementation from dict - def types(self) -> schemas.PythonSchema: """Return copy of the Python schema.""" return self._python_schema.copy() @@ -378,7 +181,7 @@ def _from_copy( data: Mapping[str, DataValue], python_schema: schemas.PythonSchema, semantic_converter: SemanticConverter, - arrow_hasher: hp.ArrowHasher | None, + data_context: DataContext, ) -> Self: """Create a new instance from copy without full initialization.""" instance = cls.__new__(cls) @@ -387,7 +190,7 @@ def _from_copy( # Set attributes directly instance._python_schema = python_schema instance.semantic_converter = semantic_converter - instance._arrow_hasher = arrow_hasher + instance._data_context = data_context instance._cached_table = None instance._cached_content_hash = None @@ -399,7 +202,7 @@ def copy(self) -> Self: self, self._python_schema.copy(), self.semantic_converter, - self._arrow_hasher, + self._data_context, ) @@ -437,8 +240,7 @@ def __init__( source_info: Mapping[str, str | None] | None = None, typespec: TypeSpec | None = None, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: # normalize the data content and remove any source info keys data_only = { @@ -454,8 +256,7 @@ def __init__( data_only, typespec=typespec, semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, + data_context=data_context, ) self._source_info = {**contained_source_info, **(source_info or {})} @@ -463,10 +264,11 @@ def __init__( def as_table( self, + include_data_context: bool = False, include_source: bool = False, ) -> pa.Table: """Convert the packet to an Arrow table.""" - table = super().as_table() + table = super().as_table(include_data_context=include_data_context) if include_source: if self._cached_source_info_table is None: source_info_data = { @@ -488,7 +290,9 @@ def as_table( table = arrow_utils.hstack_tables(table, source_info_table) return table - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: """ Return dictionary representation. @@ -498,7 +302,7 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: Returns: Dictionary representation of the packet """ - dict_copy = dict(self) + dict_copy = super().as_dict(include_data_context=include_data_context) if include_source: for key, value in self.source_info().items(): dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value @@ -524,7 +328,7 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: data, typespec=typespec, semantic_converter=self.semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) # def content_hash2(self) -> str: @@ -543,13 +347,6 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: # use keys() implementation from dict - def types(self) -> schemas.PythonSchema: - """ - Returns: - Packet type information as PythonSchema (dict mapping field names to types). 
- """ - return self._python_schema.copy() - def source_info(self) -> dict[str, str | None]: """ Return source information for all keys. @@ -567,8 +364,9 @@ def copy(self) -> Self: return instance -def prepare_data_and_source_tables( - table: pa.Table, source_info: dict[str, str | None] | None = None +def prepare_system_data_tables( + table: pa.Table, + source_info: dict[str, str | None] | None = None, ) -> tuple[pa.Table, pa.Table]: """ Process a table to ensure proper source_info columns. @@ -602,8 +400,6 @@ def prepare_data_and_source_tables( source_info_columns = [] source_info_column_names = [] - # Add all regular columns first - # Create source_info columns for each regular column num_rows = table.num_rows @@ -664,8 +460,7 @@ def __init__( self, table: pa.Table, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: # normalize the table to ensure it contains proper source columns if len(table) != 1: @@ -674,41 +469,63 @@ def __init__( ) # TODO: add check for compatible types, especially of str being pa.large_string + table, data_context_table = arrow_utils.split_by_column_groups( + table, [DataContext.get_data_context_column()] + ) + self._table = table + if data_context is None and data_context_table is not None: + data_context = data_context_table[ + DataContext.get_data_context_column() + ].to_pylist()[0] + + self._data_context = DataContext.resolve_data_context(data_context) + + schema = pa.schema({DataContext.get_data_context_column(): pa.large_string()}) + self._context_info_table = pa.Table.from_pylist( + [{DataContext.get_data_context_column(): self._data_context.context_key}], + schema=schema, + ) + # create semantic converter # TODO: consider some validation of passed semantic_converter if semantic_converter is None: - if semantic_type_registry is None: - raise ValueError( - "Semantic type registry must be provided if semantic converter is not specified." 
- ) semantic_converter = SemanticConverter.from_semantic_schema( schemas.SemanticSchema.from_arrow_schema( self._table.schema, - semantic_type_registry, + self._data_context.semantic_type_registry, ) ) self._semantic_converter = semantic_converter - self._arrow_hasher = arrow_hasher self._cached_python_schema: schemas.PythonSchema | None = None self._cached_python_dict: dict[str, DataValue] | None = None self._cached_content_hash: str | None = None - def as_table( - self, - ) -> pa.Table: + @property + def data_context_key(self) -> str: + """Return the context key of the datagram.""" + return self._data_context.context_key + + def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" + if include_data_context: + return arrow_utils.hstack_tables(self._table, self._context_info_table) return self._table - def as_dict(self) -> dict[str, DataValue]: + def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: """Return dictionary representation of the datagram.""" if self._cached_python_dict is None: self._cached_python_dict = self._semantic_converter.from_arrow_to_python( - self._table + self.as_table(include_data_context=False) )[0] assert self._cached_python_dict is not None, "Cached dict should not be None" - return dict(self._cached_python_dict) + output = dict(self._cached_python_dict) + if include_data_context: + output[DataContext.get_data_context_column()] = ( + self._data_context.context_key + ) + return output def content_hash( self, @@ -720,12 +537,8 @@ def content_hash( Hash string of the datagram content """ if self._cached_content_hash is None: - if self._arrow_hasher is None: - raise ValueError( - "Arrow hasher must be provided to calculate content hash." - ) - self._cached_content_hash = self._arrow_hasher.hash_table( - self.as_table(), + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_data_context=False), prefix_hasher_id=True, ) return self._cached_content_hash @@ -747,14 +560,13 @@ def _from_copy( table: pa.Table, python_schema: schemas.PythonSchema, semantic_converter: SemanticConverter, - hash_keys: tuple[str, ...], arrow_hasher: hp.ArrowHasher, ) -> Self: """Create a new instance from copy without full initialization.""" instance = cls.__new__(cls) instance._table = table instance._semantic_converter = semantic_converter - instance._arrow_hasher = arrow_hasher + instance._data_context = arrow_hasher # Set attributes directly instance._cached_content_hash = None @@ -766,7 +578,7 @@ def copy(self) -> Self: new_datagram = self.__class__( self._table, semantic_converter=self._semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) new_datagram._cached_python_schema = self._cached_python_schema new_datagram._cached_python_dict = self._cached_python_dict @@ -798,8 +610,7 @@ def __init__( self, table: pa.Table, semantic_converter: SemanticConverter | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: if len(table) != 1: raise ValueError( @@ -809,8 +620,7 @@ def __init__( super().__init__( table=table, semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, + data_context=data_context, ) @@ -843,10 +653,9 @@ def __init__( self, data: pa.Table, source_info: dict[str, str | None] | None = None, - semantic_converter: SemanticConverter | 
None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, skip_source_info_extraction: bool = False, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, ) -> None: if len(data) != 1: raise ValueError( @@ -858,10 +667,10 @@ def __init__( if not skip_source_info_extraction: # normalize the table to ensure it has the expected source_info columns - data_table, self._source_info_table = prepare_data_and_source_tables( + data_table, self._source_info_table = prepare_system_data_tables( data, source_info ) - else: + else: # assume that data already contains source info columns with appropriate prefixes data_columns: tuple[str, ...] = tuple( [c for c in data.column_names if not c.startswith(SOURCE_INFO_PREFIX)] ) @@ -873,8 +682,7 @@ def __init__( super().__init__( data_table, semantic_converter=semantic_converter, - semantic_type_registry=semantic_type_registry, - arrow_hasher=arrow_hasher, + data_context=data_context, ) self._cached_source_info: dict[str, str | None] | None = None @@ -883,9 +691,10 @@ def __init__( def as_table( self, + include_data_context: bool = False, include_source: bool = False, ) -> pa.Table: - table = super().as_table() + table = super().as_table(include_data_context=include_data_context) if include_source: # add source_info only for existing data columns table = arrow_utils.hstack_tables( @@ -896,7 +705,9 @@ def as_table( ) return table - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: """ Convert to dictionary representation. @@ -906,7 +717,7 @@ def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: Returns: Dictionary representation of the packet """ - return_dict = super().as_dict() + return_dict = super().as_dict(include_data_context=include_data_context) if include_source: return_dict.update( {f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items()} @@ -918,7 +729,7 @@ def as_datagram(self, include_source: bool = False) -> ArrowDatagram: return ArrowDatagram( table, semantic_converter=self._semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) def source_info(self) -> dict[str, str | None]: @@ -941,7 +752,7 @@ def copy(self) -> Self: self.as_table(), self.source_info(), semantic_converter=self._semantic_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, skip_source_info_extraction=True, ) new_packet._cached_source_info = self._cached_source_info diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index e876916..f77f7e1 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -4,6 +4,7 @@ import logging from orcapod.data.streams import KernelStream from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.data.context import DataContext from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER from orcapod.types import TypeSpec @@ -29,16 +30,32 @@ def __init__( self, fixed_input_streams: tuple[dp.Stream, ...] 
| None = None, label: str | None = None, + data_context: str | DataContext | None = None, skip_tracking: bool = False, tracker_manager: dp.TrackerManager | None = None, **kwargs, ) -> None: super().__init__(**kwargs) self._label = label + + self._data_context = DataContext.resolve_data_context(data_context) + self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self.fixed_input_streams = fixed_input_streams + @property + def data_context_key(self) -> str: + return self._data_context.context_key + + @property + def data_context(self) -> DataContext: + return self._data_context + + @property + @abstractmethod + def kernel_id(self) -> tuple[str, ...]: ... + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index a276e23..b1b3d1b 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -245,6 +245,14 @@ def __repr__(self) -> str: class Join(NonZeroInputOperator): + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Returns a unique identifier for the kernel. + This is used to identify the kernel in the computational graph. + """ + return (f"{self.__class__.__name__}",) + def op_identity_structure(self, *streams: dp.Stream) -> Any: # Join does not depend on the order of the streams -- convert it onto a set id_struct = (self.__class__.__name__,) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index e6c2c96..a625a3a 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -4,19 +4,21 @@ from collections.abc import Callable, Collection, Iterable, Sequence from typing import Any, Literal, cast -from orcapod.data.datagrams import DictPacket, DictTag +from orcapod.data.datagrams import ( + DictPacket, + ArrowPacket, +) +from orcapod.data.context import DataContext from orcapod.data.kernels import TrackedKernelBase from orcapod.data.operators import Join from orcapod.data.streams import PodStream -from orcapod.hashing import get_default_arrow_hasher from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore -from orcapod.types import TypeSpec, default_registry +from orcapod.types import TypeSpec from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter -from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import typespec_utils as tsutils logger = logging.getLogger(__name__) @@ -213,12 +215,11 @@ def __init__( input_typespec: TypeSpec | None = None, output_typespec: TypeSpec | Sequence[type] | None = None, label: str | None = None, - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: hp.ArrowHasher | None = None, function_info_extractor: hp.FunctionInfoExtractor | None = None, **kwargs, ) -> None: self.function = function + if output_keys is None: output_keys = [] if isinstance(output_keys, str): @@ -243,14 +244,17 @@ def __init__( ) self._input_packet_schema = PythonSchema(input_packet_types) self._output_packet_schema = PythonSchema(output_packet_types) - - semantic_type_registry = semantic_type_registry or default_registry self._output_semantic_converter = SemanticConverter.from_semantic_schema( - 
self._output_packet_schema.to_semantic_schema(semantic_type_registry) + self._output_packet_schema.to_semantic_schema( + semantic_type_registry=self.data_context.semantic_type_registry + ) ) - self.arrow_hasher = arrow_hasher or get_default_arrow_hasher() - self.function_info_extractor = function_info_extractor + self._function_info_extractor = function_info_extractor + + @property + def kernel_id(self) -> tuple[str, ...]: + return (self.function_name,) def input_packet_types(self) -> PythonSchema: """ @@ -311,7 +315,7 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non {k: v for k, v in zip(self.output_keys, output_values)}, typespec=self.output_packet_types(), semantic_converter=self._output_semantic_converter, - arrow_hasher=self.arrow_hasher, + data_context=self._data_context, ) return tag, output_packet @@ -319,8 +323,8 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # construct identity structure for the function # if function_info_extractor is available, use that but substitute the function_name - if self.function_info_extractor is not None: - function_info = self.function_info_extractor.extract_function_info( + if self._function_info_extractor is not None: + function_info = self._function_info_extractor.extract_function_info( self.function, function_name=self.function_name, input_typespec=self.input_packet_types(), @@ -357,7 +361,7 @@ class WrappedPod(ActivatablePodBase): def __init__( self, - pod: dp.Pod, + pod: FunctionPod, fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, **kwargs, @@ -365,6 +369,30 @@ def __init__( super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self.pod = pod + @property + def data_context_key(self) -> str: + """ + Return the data context for the wrapped pod. + This is used to resolve semantic types and other context-specific information. + """ + return self.pod.data_context_key + + @property + def data_context(self) -> DataContext: + """ + Return the data context for the wrapped pod. + This is used to resolve semantic types and other context-specific information. + """ + return self.pod.data_context + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Return the pod ID, which is the function name of the wrapped pod. + This is used to identify the pod in the system. + """ + return self.pod.kernel_id + def computed_label(self) -> str | None: return self.pod.label @@ -403,10 +431,11 @@ class CachedPod(WrappedPod): def __init__( self, - pod: dp.Pod, + pod: FunctionPod, result_store: ArrowDataStore, lineage_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] = (), + data_context: str | DataContext | None = None, **kwargs, ): super().__init__(pod, **kwargs) @@ -414,6 +443,72 @@ def __init__( self.result_store = result_store self.lineage_store = lineage_store - def call( - self, tag: dp.Tag, packet: dp.Packet - ) -> tuple[DictTag, DictPacket | None]: ... + self.pod_hash = self.data_context.object_hasher.hash_to_hex( + self.pod, prefix_hasher_id=True + ) + + @property + def pod_id(self) -> tuple[str, ...]: + """ + Return the pod ID, which is the function name of the wrapped pod. + This is used to identify the pod in the system. + """ + return self.pod.kernel_id + (self.pod_hash,) + + @property + def record_path(self) -> tuple[str, ...]: + """ + Return the path to the record in the result store. + This is used to store the results of the pod. 
+ """ + return self.record_path_prefix + self.pod_id + + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: + output_packet = self.get_recorded_output_packet(packet) + if output_packet is not None: + return tag, output_packet + output_tag, output_packet = self.pod.call(tag, packet) + if output_packet is not None: + self.record_packet(packet, output_packet) + return output_tag, output_packet + + def record_packet( + self, + input_packet: dp.Packet, + output_packet: dp.Packet, + ignore_duplicates: bool = False, + ) -> dp.Packet: + """ + Record the output packet against the input packet in the result store. + """ + result_flag = self.result_store.record_data( + self.record_path, + input_packet.content_hash(), + output_packet.as_table(include_source=True), + ignore_duplicates=ignore_duplicates, + ) + if result_flag is None: + # TODO: do more specific error handling + raise ValueError( + f"Failed to record packet {input_packet} in result store {self.result_store}" + ) + # TODO: make store return retrieved table + return output_packet + + def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | None: + """ + Retrieve the output packet from the result store based on the input packet. + If the output packet is not found, return None. + """ + result_table = self.result_store.get_recorded_data( + self.record_path, input_packet.content_hash() + ) + if result_table is None: + return None + + return ArrowPacket( + result_table, + semantic_converter=self.pod._output_semantic_converter, + data_context=self.data_context, + skip_source_info_extraction=True, + ) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index bce6585..ebe0249 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,9 +1,15 @@ -from orcapod.hashing.types import ArrowHasher from orcapod.protocols import data_protocols as dp -from orcapod.types import schemas, TypeSpec -from orcapod.types.semantic_types import SemanticTypeRegistry -from orcapod.data.datagrams import ArrowPacket, ArrowTag, DictTag, SemanticConverter +from orcapod.data.context import DataContext +from orcapod.data.datagrams import ( + ArrowPacket, + ArrowTag, + DictTag, + SemanticConverter, + SOURCE_INFO_PREFIX, +) +from orcapod.utils import arrow_utils from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.types import TypeSpec, schemas import pyarrow as pa from collections.abc import Iterator, Collection from abc import ABC, abstractmethod @@ -32,6 +38,7 @@ def __init__( self, source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), + data_context: str | DataContext | None = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -39,6 +46,15 @@ def __init__( self._upstreams = upstreams self._last_modified: datetime | None = None self._set_modified_time() + self._data_context = DataContext.resolve_data_context(data_context) + + @property + def data_context(self) -> DataContext: + """ + Returns the data context for the stream. + This is used to resolve semantic types and other context-specific information. + """ + return self._data_context @property def source(self) -> dp.Kernel | None: @@ -121,7 +137,12 @@ def iter_packets( ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... @abstractmethod - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: ... + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: ... 
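The CachedPod logic introduced above is a plain memoize-or-compute loop: hash the incoming packet, ask the result store for a previously recorded output table, and only invoke the wrapped pod (and record its output) on a miss. The sketch below is a minimal in-memory stand-in for the two store calls used here (record_data and get_recorded_data); InMemoryArrowStore is a hypothetical helper for illustration only, not part of orcapod, while the Delta Lake-backed store in this series layers batching, duplicate handling, and persistence on the same interface.

import pyarrow as pa


class InMemoryArrowStore:
    """Toy stand-in for the ArrowDataStore protocol, keyed by (record_path, record_id)."""

    def __init__(self) -> None:
        self._records: dict[tuple[str, ...], dict[str, pa.Table]] = {}

    def record_data(self, record_path, record_id, data, ignore_duplicates=False):
        bucket = self._records.setdefault(tuple(record_path), {})
        if record_id in bucket and not ignore_duplicates:
            raise ValueError(f"Entry {record_id} already recorded under {record_path}")
        bucket[record_id] = data
        return record_id  # CachedPod.record_packet treats a None return value as failure

    def get_recorded_data(self, record_path, record_id):
        return self._records.get(tuple(record_path), {}).get(record_id)

With such a store, CachedPod.call reduces to get_recorded_output_packet(packet) followed, on a miss, by pod.call(tag, packet) and record_packet(packet, output_packet), which is exactly the sequence in the method bodies above.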
def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ @@ -255,12 +276,21 @@ def last_modified(self) -> datetime | None: return None return self._cached_stream.last_modified - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." ) - return self._cached_stream.as_table(include_content_hash=include_content_hash) + return self._cached_stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() @@ -288,16 +318,32 @@ class ImmutableTableStream(StreamBase): def __init__( self, table: pa.Table, + source_info: dict[str, str | None] | None = None, tag_columns: Collection[str] = (), source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), - semantic_type_registry: SemanticTypeRegistry | None = None, - arrow_hasher: ArrowHasher | None = None, **kwargs, ) -> None: super().__init__(source=source, upstreams=upstreams, **kwargs) + table, data_context_table = arrow_utils.split_by_column_groups( + table, [DataContext.get_data_context_column()] + ) + if data_context_table is None: + data_context_table = pa.table( + { + DataContext.get_data_context_column(): pa.nulls( + len(table), pa.large_string() + ) + } + ) + + prefix_info = {SOURCE_INFO_PREFIX: source_info} + + table, prefix_tables = arrow_utils.prepare_prefixed_columns(table, prefix_info) self._table = table + self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] + self._data_context_table = data_context_table self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) self._packet_columns = tuple( @@ -318,16 +364,16 @@ def __init__( self._tag_schema = tag_schema self._packet_schema = packet_schema self._tag_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema(tag_schema, semantic_type_registry) + schemas.SemanticSchema.from_arrow_schema( + tag_schema, self._data_context.semantic_type_registry + ) ) self._packet_converter = SemanticConverter.from_semantic_schema( schemas.SemanticSchema.from_arrow_schema( - packet_schema, semantic_type_registry + packet_schema, self._data_context.semantic_type_registry ) ) - self._arrow_hasher = arrow_hasher - self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None self._set_modified_time() # set modified time to now @@ -353,21 +399,35 @@ def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: ), ) - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: """ Returns the underlying table representation of the stream. This is useful for converting the stream to a table format. 
""" - if not include_content_hash: - return self._table - hash_column_name = ( - "_content_hash" if include_content_hash is True else include_content_hash - ) - content_hashes = [packet.content_hash() for _, packet in self.iter_packets()] - table_with_hash = self._table.append_column( - hash_column_name, pa.array(content_hashes, type=pa.large_string()) - ) - return table_with_hash + output_table = self._table + if include_content_hash: + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + content_hashes = [ + packet.content_hash() for _, packet in self.iter_packets() + ] + output_table = output_table.append_column( + hash_column_name, pa.array(content_hashes, type=pa.large_string()) + ) + table_stack = (output_table,) + if include_data_context: + table_stack += (self._data_context_table,) + if include_source: + table_stack += (self._source_info_table,) + return arrow_utils.hstack_tables(*table_stack) def clear_cache(self) -> None: """ @@ -400,7 +460,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: tag = ArrowTag( tag_batch.slice(i, 1), # type: ignore semantic_converter=self._tag_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ) else: @@ -411,7 +471,7 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: ArrowPacket( packet_batch.slice(i, 1), semantic_converter=self._packet_converter, - arrow_hasher=self._arrow_hasher, + data_context=self._data_context, ), ) ) @@ -459,6 +519,7 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ + tag_keys, _ = self.input_stream.keys() packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys @@ -493,7 +554,12 @@ def invalidate(self) -> None: self.clear_cache() self._set_modified_time(invalidate=True) - def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: # TODO: note that this is likely NOT multi-thread safe self.refresh() if self._cached_output_table is None: @@ -502,7 +568,9 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: for tag, packet in self.iter_packets(): # TODO: evaluate handling efficiency here all_tags.append(tag.as_dict()) - all_packets.append(packet.as_dict(include_source=True)) + all_packets.append( + packet.as_dict(include_data_context=True, include_source=True) + ) all_tags: pa.Table = pa.Table.from_pylist(all_tags) all_packets: pa.Table = pa.Table.from_pylist(all_packets) @@ -518,6 +586,17 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: all_tags.columns + all_packets.columns, names=all_tags.column_names + all_packets.column_names, ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." 
+ ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{SOURCE_INFO_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(DataContext.get_data_context_column()) + + output_table = self._cached_output_table.drop(drop_columns) # lazily prepare content hash column if requested if include_content_hash: @@ -528,18 +607,18 @@ def as_table(self, include_content_hash: bool | str = False) -> pa.Table: self._cached_content_hash_column = pa.array( content_hashes, type=pa.large_string() ) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." ) hash_column_name = ( "_content_hash" if include_content_hash is True else include_content_hash ) - return self._cached_output_table.append_column( + output_table = output_table.append_column( hash_column_name, self._cached_content_hash_column ) - return self._cached_output_table + return output_table def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 767ea0e..2e0e927 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -18,6 +18,27 @@ class Datagram(Protocol): enabling type checking and validation throughout the computational graph. """ + @property + def data_context_key(self) -> str: + """ + Return the data context key for this datagram. + + This key identifies the semantic type registry, arrow hasher, and other + contextual information needed to properly interpret and work with this + datagram across various operations (storage, visualization, processing, etc.). + + Context key formats: + - Standard contexts: "std:v1.2.3:fingerprint" + - Custom contexts: "custom:user_provided_id" + + Concrete implementation can make use of this context key to ensure necessary background + informaton / object is available for correct processing of the datagram. + + Returns: + str: Context key for proper datagram interpretation + """ + ... + def types(self) -> TypeSpec: """ Return the type specification for this datagram. @@ -42,7 +63,7 @@ def keys(self) -> Collection[str]: """ ... - def as_table(self) -> pa.Table: + def as_table(self, include_data_context: bool = False) -> pa.Table: """ Convert to PyArrow Table format. @@ -54,7 +75,7 @@ def as_table(self) -> pa.Table: """ ... - def as_dict(self) -> dict[str, DataValue]: + def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: """ Convert to dictionary format. @@ -123,7 +144,9 @@ class Packet(Datagram, Protocol): data flow: Tags provide context, Packets provide content. """ - def as_table(self, include_source: bool = False) -> pa.Table: + def as_table( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Table: """ Convert the packet to a PyArrow Table. @@ -136,7 +159,9 @@ def as_table(self, include_source: bool = False) -> pa.Table: """ ... - def as_dict(self, include_source: bool = False) -> dict[str, DataValue]: + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: """ Convert the packet to a dictionary. @@ -395,7 +420,12 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: """ ... 
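Together, the include_data_context and include_source flags give consumers one knob for how much provenance travels with the data, whether they read datagrams row-wise or as tables. As a rough illustration (not an orcapod API: stream is any object satisfying the Stream protocol, and stream_to_rows / with_provenance are made-up names), a consumer could flatten a stream into plain dictionaries like this:

def stream_to_rows(stream, with_provenance: bool = False) -> list[dict]:
    # Flatten (tag, packet) pairs into dicts, optionally keeping the
    # data-context and source-info columns alongside the payload values.
    rows = []
    for tag, packet in stream.iter_packets():
        row = dict(tag.as_dict())
        row.update(
            packet.as_dict(
                include_data_context=with_provenance,
                include_source=with_provenance,
            )
        )
        rows.append(row)
    return rows

The columnar equivalent is a single stream.as_table(...) call carrying the same flags, which the stream implementations earlier in this patch assemble from their cached tag and packet tables.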
- def as_table(self, include_content_hash: bool | str = False) -> pa.Table: + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: """ Convert the entire stream to a PyArrow Table. @@ -509,6 +539,34 @@ class Kernel(ContentIdentifiable, Labelable, Protocol): full tracking) and testing/debugging (without side effects). """ + @property + def data_context_key(self) -> str: + """ + Return the data context key for this kernel. + + This key identifies the semantic type registry, arrow hasher, and other + contextual information needed to properly interpret and work with this + kernel across various operations (storage, visualization, processing, etc.). + + Returns: + str: Context key for proper kernel interpretation + """ + ... + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Return a unique identifier for this Pod. + + The pod_id is used for caching and tracking purposes. It should + uniquely identify the Pod's computational logic, parameters, and + any relevant metadata that affects its behavior. + + Returns: + tuple[str, ...]: Unique identifier for this Pod + """ + ... + def __call__( self, *streams: Stream, label: str | None = None, **kwargs ) -> LiveStream: diff --git a/src/orcapod/types/arrow_utils.py b/src/orcapod/types/arrow_utils.py index c446901..34a06a3 100644 --- a/src/orcapod/types/arrow_utils.py +++ b/src/orcapod/types/arrow_utils.py @@ -1,10 +1,123 @@ -import pyarrow as pa +# from collections.abc import Mapping, Collection +# import pyarrow as pa +# from typing import Any -def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: - """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, - no field names should collide.""" - merged_fields = [] - for schema in schemas: - merged_fields.extend(schema) - return pa.schema(merged_fields) +# def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: +# """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, +# no field names should collide.""" +# merged_fields = [] +# for schema in schemas: +# merged_fields.extend(schema) +# return pa.schema(merged_fields) + + +# def split_by_column_groups( +# self, *column_groups: Collection[str] +# ) -> tuple[pa.Table | None]: +# """ +# Split the table into multiple tables based on the provided column groups. +# Each group is a collection of column names that should be included in the same table. +# The remaining columns that are not part of any group will be returned as the first table/None. 
+# """ +# if not column_groups: +# return (self,) + +# tables = [] +# remaining_columns = set(self.column_names) + +# for group in column_groups: +# group_columns = [col for col in group if col in remaining_columns] +# if group_columns: +# tables.append(self.select(group_columns)) +# remaining_columns.difference_update(group_columns) +# else: +# tables.append(None) + +# remaining_table = None +# if remaining_columns: +# orderd_remaining_columns = self.column_names +# remaining_columns = [ +# col for col in orderd_remaining_columns if col in remaining_columns +# ] +# remaining_table = self.select(orderd_remaining_columns) +# return (remaining_table, *tables) + + +# def prepare_prefixed_columns( +# table: pa.Table, +# prefix_group: Collection[str] | Mapping[str, Any | None], +# ) -> tuple[pa.Table, pa.Table]: +# """ """ +# if isinstance(prefix_group, Mapping): +# prefix_group = {k: v if v is not None else {} for k, v in prefix_group.items()} +# elif isinstance(prefix_group, Collection): +# prefix_group = {name: {} for name in prefix_group} +# else: +# raise TypeError( +# "prefix_group must be a Collection of strings or a Mapping of string to string or None." +# ) + +# # Visit each prefix group and split them into separate tables +# member_columns = {} + +# for col_name in table.column_names: +# for prefix in prefix_group: +# if col_name.startswith(prefix): +# # Remove the prefix from the column name +# base_name = col_name.removeprefix(prefix) +# if base_name not in member_columns: +# member_columns[base_name] = [] +# member_columns[base_name].append(table.column(col_name)) + +# data_columns = [] +# data_column_names = [] +# existing_source_info = {} + +# for i, name in enumerate(table.column_names): +# if name.startswith(SOURCE_INFO_PREFIX): +# # Extract the base column name +# base_name = name.removeprefix(SOURCE_INFO_PREFIX) +# existing_source_info[base_name] = table.column(i) +# else: +# data_columns.append(table.column(i)) +# data_column_names.append(name) + +# # Step 2: Create source_info columns for each regular column +# source_info_columns = [] +# source_info_column_names = [] + +# # Create source_info columns for each regular column +# num_rows = table.num_rows + +# for col_name in data_column_names: +# source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" + +# # if col_name is in source_info, use that value +# if col_name in source_info: +# # Use value from source_info dictionary +# source_value = source_info[col_name] +# source_values = pa.array([source_value] * num_rows, type=pa.large_string()) +# # if col_name is in existing_source_info, use that column +# elif col_name in existing_source_info: +# # Use existing source_info column, but convert to large_string +# existing_col = existing_source_info[col_name] +# if existing_col.type == pa.large_string(): +# source_values = existing_col +# else: +# # Convert to large_string +# source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + +# else: +# # Use null values +# source_values = pa.array([None] * num_rows, type=pa.large_string()) + +# source_info_columns.append(source_values) +# source_info_column_names.append(source_info_col_name) + +# # Step 3: Create the final table +# data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) +# source_info_table: pa.Table = pa.Table.from_arrays( +# source_info_columns, names=source_info_column_names +# ) +# return data_table, source_info_table diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py 
index 8dc0df1..118b110 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -2,7 +2,8 @@ from orcapod.types.schemas import PythonSchema, SemanticSchema from orcapod.types import typespec_utils as tsutils -from typing import Any, Mapping, Self +from typing import Any, Self +from collections.abc import Mapping import pyarrow as pa import logging diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index 940820f..9f66654 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -214,12 +214,15 @@ def extract_function_typespecs( return param_info, inferred_output_types -def get_typespec_from_dict(dict: Mapping) -> TypeSpec: +def get_typespec_from_dict(data: Mapping, typespec: TypeSpec | None = None) -> TypeSpec: """ Returns a TypeSpec for the given dictionary. - The TypeSpec is a mapping from field name to Python type. + The TypeSpec is a mapping from field name to Python type. If typespec is provided, then + it is used as a base when inferring types for the fields in dict """ - return {key: type(value) for key, value in dict.items()} + if typespec is None: + typespec = {} + return {key: typespec.get(key, type(value)) for key, value in data.items()} def get_compatible_type(type1: Any, type2: Any) -> Any: diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 5a072de..f9a6d7f 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -1,6 +1,10 @@ # TODO: move this to a separate module +from collections import defaultdict +from matplotlib.pylab import f import pyarrow as pa +from collections.abc import Mapping, Collection +from typing import Any def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: @@ -124,3 +128,123 @@ def check_arrow_schema_compatibility( errors.append(f"Unexpected field '{field_name}' in incoming schema") return len(errors) == 0, errors + + +def split_by_column_groups( + table, + *column_groups: Collection[str], +) -> tuple[pa.Table | None, ...]: + """ + Split the table into multiple tables based on the provided column groups. + Each group is a collection of column names that should be included in the same table. + The remaining columns that are not part of any group will be returned as the first table/None. 
+ """ + if not column_groups: + return (table,) + + tables = [] + remaining_columns = set(table.column_names) + + for group in column_groups: + group_columns = [col for col in group if col in remaining_columns] + if group_columns: + tables.append(table.select(group_columns)) + remaining_columns.difference_update(group_columns) + else: + tables.append(None) + + remaining_table = None + if remaining_columns: + ordered_remaining_columns = [ + col for col in table.column_names if col in remaining_columns + ] + remaining_table = table.select(ordered_remaining_columns) + return (remaining_table, *tables) + + +def prepare_prefixed_columns( + table: pa.Table, + prefix_info: Collection[str] + | Mapping[str, Any | None] + | Mapping[str, Mapping[str, Any | None]], +) -> tuple[pa.Table, dict[str, pa.Table]]: + """ """ + all_prefix_info = {} + if isinstance(prefix_info, Mapping): + for prefix, info in prefix_info.items(): + if isinstance(info, Mapping): + all_prefix_info[prefix] = info + else: + all_prefix_info[prefix] = info + elif isinstance(prefix_info, Collection): + for prefix in prefix_info: + all_prefix_info[prefix] = {} + else: + raise TypeError( + "prefix_group must be a Collection of strings or a Mapping of string to string or None." + ) + + # split column into prefix groups + data_column_names = [] + data_columns = [] + existing_prefixed_columns = defaultdict(list) + + for col_name in table.column_names: + prefix_found = False + for prefix in all_prefix_info: + if col_name.startswith(prefix): + # Remove the prefix from the column name + base_name = col_name.removeprefix(prefix) + existing_prefixed_columns[prefix].append(base_name) + prefix_found = True + if not prefix_found: + # if no prefix found, consider this as a data column + data_column_names.append(col_name) + data_columns.append(table[col_name]) + + # Create source_info columns for each regular column + num_rows = table.num_rows + + prefixed_column_names = defaultdict(list) + prefixed_columns = defaultdict(list) + + for prefix, value_lut in all_prefix_info.items(): + target_prefixed_column_names = prefixed_column_names[prefix] + target_prefixed_columns = prefixed_columns[prefix] + + for col_name in data_column_names: + prefixed_col_name = f"{prefix}{col_name}" + existing_columns = existing_prefixed_columns[prefix] + + if isinstance(value_lut, Mapping): + value = value_lut.get(col_name) + else: + value = value_lut + + if value is not None: + # Use value from source_info dictionary + column_values = pa.array([value] * num_rows, type=pa.large_string()) + # if col_name is in existing_source_info, use that column + elif col_name in existing_columns: + # Use existing source_info column, but convert to large_string + existing_col = table[prefixed_col_name] + + if existing_col.type == pa.string(): + # Convert to large_string + column_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + else: + column_values = existing_col + else: + # Use null values + column_values = pa.array([None] * num_rows, type=pa.large_string()) + target_prefixed_column_names.append(prefixed_col_name) + target_prefixed_columns.append(column_values) + + # Step 3: Create the final table + data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) + result_tables = {} + for prefix in all_prefix_info: + result_tables[prefix] = pa.Table.from_arrays( + prefixed_columns[prefix], names=prefixed_column_names[prefix] + ) + return data_table, result_tables From 1788c05c70259aad3b7ba9e2cbd588f19eedf93a Mon Sep 17 00:00:00 2001 From: 
"Edgar Y. Walker" Date: Sat, 19 Jul 2025 10:50:18 +0000 Subject: [PATCH 115/224] refactor: clean up protocol around types --- src/orcapod/data/datagrams.py | 337 ++++++++++++++--------- src/orcapod/data/kernels.py | 4 - src/orcapod/data/pods.py | 45 ++- src/orcapod/data/streams.py | 43 +-- src/orcapod/protocols/data_protocols.py | 59 +++- src/orcapod/protocols/store_protocols.py | 2 +- src/orcapod/stores/__init__.py | 26 +- src/orcapod/stores/delta_lake_stores.py | 14 +- src/orcapod/utils/arrow_utils.py | 19 +- 9 files changed, 340 insertions(+), 209 deletions(-) diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py index 0e8df1e..b506a56 100644 --- a/src/orcapod/data/datagrams.py +++ b/src/orcapod/data/datagrams.py @@ -127,6 +127,7 @@ def __init__( self._cached_table: pa.Table | None = None self._cached_content_hash: str | None = None + self._cached_arrow_schema: pa.Schema | None = None @property def data_context_key(self) -> str: @@ -137,10 +138,9 @@ def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" if self._cached_table is None: - typespec = self.types() - typespec[DataContext.get_data_context_column()] = str self._cached_table = self.semantic_converter.from_python_to_arrow( - self.as_dict(include_data_context=True), typespec + self.as_dict(include_data_context=True), + self.types(include_data_context=True), ) assert self._cached_table is not None, "Cached table should not be None" if include_data_context: @@ -171,9 +171,35 @@ def content_hash( ) return self._cached_content_hash - def types(self) -> schemas.PythonSchema: + def types(self, include_data_context: bool = False) -> schemas.PythonSchema: """Return copy of the Python schema.""" - return self._python_schema.copy() + schema = self._python_schema.copy() + if include_data_context: + schema[DataContext.get_data_context_column()] = str + return schema + + def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. 
+ + Args: + include_data_context: Whether to include data context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + if self._cached_arrow_schema is None: + self._cached_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self.types(include_data_context=True) + ) + ) + if not include_data_context: + return arrow_utils.drop_schema_columns( + self._cached_arrow_schema, + [DataContext.get_data_context_column()], + ) + return self._cached_arrow_schema @classmethod def _from_copy( @@ -261,6 +287,15 @@ def __init__( self._source_info = {**contained_source_info, **(source_info or {})} self._cached_source_info_table: pa.Table | None = None + self._cached_source_info_schema: pa.Schema | None = None + + @property + def _source_info_schema(self) -> pa.Schema: + if self._cached_source_info_schema is None: + self._cached_source_info_schema = pa.schema( + {f"{SOURCE_INFO_PREFIX}{k}": pa.large_string() for k in self.keys()} + ) + return self._cached_source_info_schema def as_table( self, @@ -274,18 +309,19 @@ def as_table( source_info_data = { f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items() } - source_info_schema = pa.schema( - {k: pa.large_string() for k in source_info_data} - ) self._cached_source_info_table = pa.Table.from_pylist( - [source_info_data], schema=source_info_schema + [source_info_data], schema=self._source_info_schema ) assert self._cached_source_info_table is not None, ( "Cached source info table should not be None" ) # subselect the corresponding _source_info as the columns present in the data table source_info_table = self._cached_source_info_table.select( - [f"{SOURCE_INFO_PREFIX}{k}" for k in table.column_names] + [ + f"{SOURCE_INFO_PREFIX}{k}" + for k in table.column_names + if k in self.keys() + ] ) table = arrow_utils.hstack_tables(table, source_info_table) return table @@ -308,6 +344,34 @@ def as_dict( dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value return dict_copy + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types(include_data_context=include_data_context) + if include_source: + for key in self.keys(): + schema[f"{SOURCE_INFO_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema(include_data_context=include_data_context) + if include_source: + return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) + return schema + def as_datagram(self, include_source: bool = False) -> DictDatagram: """ Convert the packet to a DictDatagram. 
@@ -319,11 +383,7 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: DictDatagram representation of the packet """ data = self.as_dict(include_source=include_source) - typespec = self.types() - # append source info to typespec if requested - if include_source: - for key in self.keys(): - typespec[f"{SOURCE_INFO_PREFIX}{key}"] = str + typespec = self.types(include_source=include_source) return DictDatagram( data, typespec=typespec, @@ -331,22 +391,6 @@ def as_datagram(self, include_source: bool = False) -> DictDatagram: data_context=self._data_context, ) - # def content_hash2(self) -> str: - # """ - # Calculate content hash excluding source information. - - # Returns: - # Hash string of the packet content - # """ - # # TODO: check if this is identical to DictDatagram.content_hash - # if self._cached_content_hash is None: - # self._cached_content_hash = self._arrow_hasher.hash_table( - # self.as_table(include_source=False), prefix_hasher_id=True - # ) - # return self._cached_content_hash - - # use keys() implementation from dict - def source_info(self) -> dict[str, str | None]: """ Return source information for all keys. @@ -364,76 +408,76 @@ def copy(self) -> Self: return instance -def prepare_system_data_tables( - table: pa.Table, - source_info: dict[str, str | None] | None = None, -) -> tuple[pa.Table, pa.Table]: - """ - Process a table to ensure proper source_info columns. - - Args: - table: Input PyArrow table - source_info: optional dictionary mapping column names to source info values. If present, - it will take precedence over existing source_info columns in the table. - - Returns: - tuple of table without any source info and another table only containing source info columns (with prefix) - """ - if source_info is None: - source_info = {} - - # Step 1: Separate source_info columns from regular columns - data_columns = [] - data_column_names = [] - existing_source_info = {} - - for i, name in enumerate(table.column_names): - if name.startswith(SOURCE_INFO_PREFIX): - # Extract the base column name - base_name = name.removeprefix(SOURCE_INFO_PREFIX) - existing_source_info[base_name] = table.column(i) - else: - data_columns.append(table.column(i)) - data_column_names.append(name) - - # Step 2: Create source_info columns for each regular column - source_info_columns = [] - source_info_column_names = [] - - # Create source_info columns for each regular column - num_rows = table.num_rows - - for col_name in data_column_names: - source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" - - # if col_name is in source_info, use that value - if col_name in source_info: - # Use value from source_info dictionary - source_value = source_info[col_name] - source_values = pa.array([source_value] * num_rows, type=pa.large_string()) - # if col_name is in existing_source_info, use that column - elif col_name in existing_source_info: - # Use existing source_info column, but convert to large_string - existing_col = existing_source_info[col_name] - if existing_col.type == pa.large_string(): - source_values = existing_col - else: - # Convert to large_string - source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore - - else: - # Use null values - source_values = pa.array([None] * num_rows, type=pa.large_string()) - - source_info_columns.append(source_values) - source_info_column_names.append(source_info_col_name) - - # Step 3: Create the final table - data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) - source_info_table: pa.Table = 
pa.Table.from_arrays( - source_info_columns, names=source_info_column_names - ) - return data_table, source_info_table +# def prepare_system_data_tables( +# table: pa.Table, +# source_info: dict[str, str | None] | None = None, +# ) -> tuple[pa.Table, pa.Table]: +# """ +# Process a table to ensure proper source_info columns. + +# Args: +# table: Input PyArrow table +# source_info: optional dictionary mapping column names to source info values. If present, +# it will take precedence over existing source_info columns in the table. + +# Returns: +# tuple of table without any source info and another table only containing source info columns (with prefix) +# """ +# if source_info is None: +# source_info = {} + +# # Step 1: Separate source_info columns from regular columns +# data_columns = [] +# data_column_names = [] +# existing_source_info = {} + +# for i, name in enumerate(table.column_names): +# if name.startswith(SOURCE_INFO_PREFIX): +# # Extract the base column name +# base_name = name.removeprefix(SOURCE_INFO_PREFIX) +# existing_source_info[base_name] = table.column(i) +# else: +# data_columns.append(table.column(i)) +# data_column_names.append(name) + +# # Step 2: Create source_info columns for each regular column +# source_info_columns = [] +# source_info_column_names = [] + +# # Create source_info columns for each regular column +# num_rows = table.num_rows + +# for col_name in data_column_names: +# source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" + +# # if col_name is in source_info, use that value +# if col_name in source_info: +# # Use value from source_info dictionary +# source_value = source_info[col_name] +# source_values = pa.array([source_value] * num_rows, type=pa.large_string()) +# # if col_name is in existing_source_info, use that column +# elif col_name in existing_source_info: +# # Use existing source_info column, but convert to large_string +# existing_col = existing_source_info[col_name] +# if existing_col.type == pa.large_string(): +# source_values = existing_col +# else: +# # Convert to large_string +# source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore + +# else: +# # Use null values +# source_values = pa.array([None] * num_rows, type=pa.large_string()) + +# source_info_columns.append(source_values) +# source_info_column_names.append(source_info_col_name) + +# # Step 3: Create the final table +# data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) +# source_info_table: pa.Table = pa.Table.from_arrays( +# source_info_columns, names=source_info_column_names +# ) +# return data_table, source_info_table class ArrowDatagram: @@ -482,10 +526,12 @@ def __init__( self._data_context = DataContext.resolve_data_context(data_context) - schema = pa.schema({DataContext.get_data_context_column(): pa.large_string()}) - self._context_info_table = pa.Table.from_pylist( + data_context_schema = pa.schema( + {DataContext.get_data_context_column(): pa.large_string()} + ) + self._data_context_table = pa.Table.from_pylist( [{DataContext.get_data_context_column(): self._data_context.context_key}], - schema=schema, + schema=data_context_schema, ) # create semantic converter @@ -510,7 +556,7 @@ def data_context_key(self) -> str: def as_table(self, include_data_context: bool = False) -> pa.Table: """Convert the packet to an Arrow table.""" if include_data_context: - return arrow_utils.hstack_tables(self._table, self._context_info_table) + return arrow_utils.hstack_tables(self._table, self._data_context_table) return self._table 
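Note the pattern here: the context key is never mixed into the payload table itself; it lives in a parallel one-row table and is only stacked on when include_data_context is requested. The following standalone PyArrow sketch shows that mechanic; the column name and key value are placeholders (the real ones come from DataContext.get_data_context_column() and the resolved context), and hstack_tables is the column-wise concatenation helper from orcapod.utils.arrow_utils.

import pyarrow as pa
from orcapod.utils import arrow_utils

payload = pa.table({"x": [1, 2, 3]})
context_column = "_orcapod_context_key"  # placeholder for DataContext.get_data_context_column()
context_table = pa.table(
    {context_column: pa.array(["std:v0:0000"] * len(payload), type=pa.large_string())}
)
# include_data_context=False -> payload is returned as-is
# include_data_context=True  -> the context column is appended on the right
with_context = arrow_utils.hstack_tables(payload, context_table)
assert with_context.column_names == ["x", context_column]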
def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: @@ -546,13 +592,32 @@ def content_hash( def keys(self) -> tuple[str, ...]: return tuple(self._table.column_names) - def types(self) -> schemas.PythonSchema: + def types(self, include_data_context: bool = False) -> schemas.PythonSchema: """Return copy of the Python schema.""" if self._cached_python_schema is None: self._cached_python_schema = ( self._semantic_converter.from_arrow_to_python_schema(self._table.schema) ) - return self._cached_python_schema.copy() + schema = self._cached_python_schema.copy() + if include_data_context: + schema[DataContext.get_data_context_column()] = str + return schema + + def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + if include_data_context: + return arrow_utils.join_arrow_schemas( + self._table.schema, self._data_context_table.schema + ) + return self._table.schema @classmethod def _from_copy( @@ -653,7 +718,6 @@ def __init__( self, data: pa.Table, source_info: dict[str, str | None] | None = None, - skip_source_info_extraction: bool = False, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -665,19 +729,13 @@ def __init__( if source_info is None: source_info = {} - if not skip_source_info_extraction: - # normalize the table to ensure it has the expected source_info columns - data_table, self._source_info_table = prepare_system_data_tables( - data, source_info - ) - else: # assume that data already contains source info columns with appropriate prefixes - data_columns: tuple[str, ...] = tuple( - [c for c in data.column_names if not c.startswith(SOURCE_INFO_PREFIX)] - ) - source_columns = [f"{SOURCE_INFO_PREFIX}{c}" for c in data_columns] - # Add conversion to large_string type - data_table = data.select(data_columns) - self._source_info_table = data.select(source_columns) + # normalize the table to ensure it has the expected source_info columns + data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( + data, + {SOURCE_INFO_PREFIX: source_info}, + exclude_columns=[DataContext.get_data_context_column()], + ) + self._source_info_table = prefixed_tables[SOURCE_INFO_PREFIX] super().__init__( data_table, @@ -700,11 +758,45 @@ def as_table( table = arrow_utils.hstack_tables( table, self._source_info_table.select( - [f"{SOURCE_INFO_PREFIX}{c}" for c in table.column_names] + [ + f"{SOURCE_INFO_PREFIX}{c}" + for c in table.column_names + if c in self.keys() + ] ), ) return table + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types(include_data_context=include_data_context) + if include_source: + for key in self.keys(): + schema[f"{SOURCE_INFO_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. 
+ + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema(include_data_context=include_data_context) + if include_source: + return arrow_utils.join_arrow_schemas( + schema, self._source_info_table.schema + ) + return schema + def as_dict( self, include_data_context: bool = False, include_source: bool = False ) -> dict[str, DataValue]: @@ -753,7 +845,6 @@ def copy(self) -> Self: self.source_info(), semantic_converter=self._semantic_converter, data_context=self._data_context, - skip_source_info_extraction=True, ) new_packet._cached_source_info = self._cached_source_info new_packet._cached_python_dict = self._cached_python_dict diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index f77f7e1..09cf09f 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -44,10 +44,6 @@ def __init__( self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self.fixed_input_streams = fixed_input_streams - @property - def data_context_key(self) -> str: - return self._data_context.context_key - @property def data_context(self) -> DataContext: return self._data_context diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index a625a3a..cd06f34 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -20,6 +20,7 @@ from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter from orcapod.types import typespec_utils as tsutils +import pyarrow as pa logger = logging.getLogger(__name__) @@ -361,7 +362,7 @@ class WrappedPod(ActivatablePodBase): def __init__( self, - pod: FunctionPod, + pod: dp.Pod, fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, **kwargs, @@ -369,22 +370,6 @@ def __init__( super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) self.pod = pod - @property - def data_context_key(self) -> str: - """ - Return the data context for the wrapped pod. - This is used to resolve semantic types and other context-specific information. - """ - return self.pod.data_context_key - - @property - def data_context(self) -> DataContext: - """ - Return the data context for the wrapped pod. - This is used to resolve semantic types and other context-specific information. - """ - return self.pod.data_context - @property def kernel_id(self) -> tuple[str, ...]: """ @@ -431,24 +416,24 @@ class CachedPod(WrappedPod): def __init__( self, - pod: FunctionPod, + pod: dp.Pod, result_store: ArrowDataStore, lineage_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] = (), - data_context: str | DataContext | None = None, **kwargs, ): super().__init__(pod, **kwargs) self.record_path_prefix = record_path_prefix self.result_store = result_store self.lineage_store = lineage_store + # unset data_context native to the object self.pod_hash = self.data_context.object_hasher.hash_to_hex( self.pod, prefix_hasher_id=True ) @property - def pod_id(self) -> tuple[str, ...]: + def kernel_id(self) -> tuple[str, ...]: """ Return the pod ID, which is the function name of the wrapped pod. This is used to identify the pod in the system. @@ -461,7 +446,7 @@ def record_path(self) -> tuple[str, ...]: Return the path to the record in the result store. This is used to store the results of the pod. 
""" - return self.record_path_prefix + self.pod_id + return self.record_path_prefix + self.kernel_id def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: output_packet = self.get_recorded_output_packet(packet) @@ -481,10 +466,19 @@ def record_packet( """ Record the output packet against the input packet in the result store. """ + data_table = output_packet.as_table( + include_data_context=True, include_source=True + ) + + data_table = data_table.append_column( + f"_input_packet{DataContext.get_data_context_column()}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + result_flag = self.result_store.record_data( self.record_path, input_packet.content_hash(), - output_packet.as_table(include_source=True), + data_table, ignore_duplicates=ignore_duplicates, ) if result_flag is None: @@ -507,8 +501,7 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non return None return ArrowPacket( - result_table, - semantic_converter=self.pod._output_semantic_converter, - data_context=self.data_context, - skip_source_info_extraction=True, + result_table.drop( + [f"_input_packet{DataContext.get_data_context_column()}"] + ), ) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index ebe0249..a5c2434 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -318,8 +318,8 @@ class ImmutableTableStream(StreamBase): def __init__( self, table: pa.Table, - source_info: dict[str, str | None] | None = None, tag_columns: Collection[str] = (), + source_info: dict[str, str | None] | None = None, source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), **kwargs, @@ -340,15 +340,19 @@ def __init__( prefix_info = {SOURCE_INFO_PREFIX: source_info} - table, prefix_tables = arrow_utils.prepare_prefixed_columns(table, prefix_info) - self._table = table - self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] - self._data_context_table = data_context_table - + # determine tag columns first and then exclude any source info self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) + table, prefix_tables = arrow_utils.prepare_prefixed_columns( + table, prefix_info, exclude_columns=self._tag_columns + ) + # now table should only contain tag columns and packet columns self._packet_columns = tuple( c for c in table.column_names if c not in tag_columns ) + self._table = table + self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] + self._data_context_table = data_context_table + if len(self._packet_columns) == 0: raise ValueError( "No packet columns found in the table. At least one packet column is required." 
@@ -565,27 +569,26 @@ def as_table( if self._cached_output_table is None: all_tags = [] all_packets = [] + tag_schema, packet_schema = None, None for tag, packet in self.iter_packets(): - # TODO: evaluate handling efficiency here + if tag_schema is None: + tag_schema = tag.arrow_schema() + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_data_context=True, + include_source=True, + ) all_tags.append(tag.as_dict()) all_packets.append( packet.as_dict(include_data_context=True, include_source=True) ) - all_tags: pa.Table = pa.Table.from_pylist(all_tags) - all_packets: pa.Table = pa.Table.from_pylist(all_packets) - # assert that column names do not overlap - overlapping_columns = set(all_tags.column_names) & set( - all_packets.column_names - ) - if overlapping_columns: - raise ValueError( - f"Column names overlap between tags and packets: {overlapping_columns}. Overlapping tag and packet columns are not supported yet." - ) - self._cached_output_table = pa.Table.from_arrays( - all_tags.columns + all_packets.columns, - names=all_tags.column_names + all_packets.column_names, + all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema) + all_packets: pa.Table = pa.Table.from_pylist( + all_packets, schema=packet_schema ) + + self._cached_output_table = arrow_utils.hstack_tables(all_tags, all_packets) assert self._cached_output_table is not None, ( "_cached_output_table should not be None here." ) diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 2e0e927..012edaa 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -39,7 +39,7 @@ def data_context_key(self) -> str: """ ... - def types(self) -> TypeSpec: + def types(self, include_data_context: bool = False) -> TypeSpec: """ Return the type specification for this datagram. @@ -51,6 +51,19 @@ def types(self) -> TypeSpec: """ ... + def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + The schema provides a structured representation of the datagram's + fields and their types, enabling efficient serialization and + deserialization with PyArrow. + + Returns: + pa.Schema: PyArrow schema representation of the datagram + """ + ... + def keys(self) -> Collection[str]: """ Return the available keys/fields in this datagram. @@ -203,6 +216,36 @@ def source_info(self) -> dict[str, str | None]: """ ... + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> TypeSpec: + """ + Return the type specification for this packet. + + Args: + include_source: If True, source information is included in the typespec + for debugging and lineage tracking + + Returns: + TypeSpec: Dictionary mapping field names to Python types + """ + ... + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this packet. + + Args: + include_source: If True, source information is included in the schema + for debugging and lineage tracking + + Returns: + pa.Schema: PyArrow schema representation of packet data + """ + ... + # def join(self, other: "Packet") -> "Packet": ... # def get_as(self, packet_type: PacketType) -> PacketType: ... @@ -539,20 +582,6 @@ class Kernel(ContentIdentifiable, Labelable, Protocol): full tracking) and testing/debugging (without side effects). 
""" - @property - def data_context_key(self) -> str: - """ - Return the data context key for this kernel. - - This key identifies the semantic type registry, arrow hasher, and other - contextual information needed to properly interpret and work with this - kernel across various operations (storage, visualization, processing, etc.). - - Returns: - str: Context key for proper kernel interpretation - """ - ... - @property def kernel_id(self) -> tuple[str, ...]: """ diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index d5ca902..618d7a4 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -12,7 +12,7 @@ def record_data( record_path: tuple[str, ...], record_id: str, data: pa.Table, - ignore_duplicates: bool = False, + ignore_duplicates: bool | None = None, ) -> str | None: ... def get_recorded_data( diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/stores/__init__.py index 573a316..434e2f4 100644 --- a/src/orcapod/stores/__init__.py +++ b/src/orcapod/stores/__init__.py @@ -1,14 +1,14 @@ -from .legacy.types import DataStore, ArrowDataStore -from .legacy.legacy_arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore -from .legacy.dict_data_stores import DirDataStore, NoOpDataStore -from .legacy.safe_dir_data_store import SafeDirDataStore +# from .legacy.types import DataStore, ArrowDataStore +# from .legacy.legacy_arrow_data_stores import MockArrowDataStore, SimpleParquetDataStore +# from .legacy.dict_data_stores import DirDataStore, NoOpDataStore +# from .legacy.safe_dir_data_store import SafeDirDataStore -__all__ = [ - "DataStore", - "ArrowDataStore", - "DirDataStore", - "SafeDirDataStore", - "NoOpDataStore", - "MockArrowDataStore", - "SimpleParquetDataStore", -] +# __all__ = [ +# "DataStore", +# "ArrowDataStore", +# "DirDataStore", +# "SafeDirDataStore", +# "NoOpDataStore", +# "MockArrowDataStore", +# "SimpleParquetDataStore", +# ] diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index f8f0451..f04a7b7 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -385,15 +385,15 @@ def record_data( entry_id: str, data: pa.Table, force_flush: bool = False, - error_on_duplicate: bool | None = None, + ignore_duplicates: bool | None = None, ) -> pa.Table: self._validate_source_path(record_path) source_key = self._get_source_key(record_path) # Check for existing entry - if error_on_duplicate is None: - error_on_duplicate = self.duplicate_entry_behavior == "error" - if error_on_duplicate: + if ignore_duplicates is None: + ignore_duplicates = self.duplicate_entry_behavior != "error" + if not ignore_duplicates: pending_table = self._pending_batches[source_key].get(entry_id, None) if pending_table is not None: raise ValueError( @@ -480,8 +480,10 @@ def get_recorded_data( # check if entry_id is found in pending batches source_key = self._get_source_key(record_path) if entry_id in self._pending_batches[source_key]: - # Return the pending record directly - return self._pending_batches[source_key][entry_id] + # Return the pending record after removing the entry id column + return self._remove_entry_id_column( + self._pending_batches[source_key][entry_id] + ) delta_table = self._get_existing_delta_table(record_path) if delta_table is None: diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index f9a6d7f..5237eb3 100644 --- a/src/orcapod/utils/arrow_utils.py +++ 
b/src/orcapod/utils/arrow_utils.py @@ -167,6 +167,7 @@ def prepare_prefixed_columns( prefix_info: Collection[str] | Mapping[str, Any | None] | Mapping[str, Mapping[str, Any | None]], + exclude_columns: Collection[str] = (), ) -> tuple[pa.Table, dict[str, pa.Table]]: """ """ all_prefix_info = {} @@ -208,11 +209,13 @@ def prepare_prefixed_columns( prefixed_column_names = defaultdict(list) prefixed_columns = defaultdict(list) + target_column_names = [c for c in data_column_names if c not in exclude_columns] + for prefix, value_lut in all_prefix_info.items(): target_prefixed_column_names = prefixed_column_names[prefix] target_prefixed_columns = prefixed_columns[prefix] - for col_name in data_column_names: + for col_name in target_column_names: prefixed_col_name = f"{prefix}{col_name}" existing_columns = existing_prefixed_columns[prefix] @@ -248,3 +251,17 @@ def prepare_prefixed_columns( prefixed_columns[prefix], names=prefixed_column_names[prefix] ) return data_table, result_tables + + +def drop_schema_columns(schema: pa.Schema, columns: Collection[str]) -> pa.Schema: + """ + Drop specified columns from a PyArrow schema. + + Args: + schema (pa.Schema): The original schema. + columns (list[str]): List of column names to drop. + + Returns: + pa.Schema: New schema with specified columns removed. + """ + return pa.schema([field for field in schema if field.name not in columns]) From 6f8f996dec28321380efb248969e8fe7eca7f7e0 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 04:42:25 +0000 Subject: [PATCH 116/224] wip: further refinement of datagram implementations --- src/orcapod/__init__.py | 46 +- src/orcapod/data/__init__.py | 6 + src/orcapod/data/context.py | 10 - src/orcapod/data/datagrams.py | 859 ------- src/orcapod/data/datagrams/__init__.py | 13 + src/orcapod/data/datagrams/arrow_datagram.py | 867 +++++++ .../data/datagrams/arrow_tag_packet.py | 268 ++ src/orcapod/data/datagrams/base.py | 301 +++ src/orcapod/data/datagrams/dict_datagram.py | 835 ++++++ src/orcapod/data/datagrams/dict_tag_packet.py | 256 ++ src/orcapod/data/kernels.py | 5 + src/orcapod/data/old_datagrams.py | 2281 +++++++++++++++++ src/orcapod/data/pods.py | 156 +- src/orcapod/data/streams.py | 48 +- src/orcapod/data/system_constants.py | 25 + src/orcapod/hashing/arrow_hashers.py | 3 +- src/orcapod/hashing/object_hashers.py | 2 +- src/orcapod/protocols/data_protocols.py | 778 +++++- src/orcapod/protocols/hashing_protocols.py | 12 +- src/orcapod/protocols/store_protocols.py | 29 +- src/orcapod/stores/delta_lake_stores.py | 413 ++- src/orcapod/types/semantic_converter.py | 29 +- src/orcapod/types/semantic_types.py | 48 +- src/orcapod/types/typespec_utils.py | 9 +- src/orcapod/utils/arrow_utils.py | 9 +- 25 files changed, 5989 insertions(+), 1319 deletions(-) delete mode 100644 src/orcapod/data/datagrams.py create mode 100644 src/orcapod/data/datagrams/__init__.py create mode 100644 src/orcapod/data/datagrams/arrow_datagram.py create mode 100644 src/orcapod/data/datagrams/arrow_tag_packet.py create mode 100644 src/orcapod/data/datagrams/base.py create mode 100644 src/orcapod/data/datagrams/dict_datagram.py create mode 100644 src/orcapod/data/datagrams/dict_tag_packet.py create mode 100644 src/orcapod/data/old_datagrams.py create mode 100644 src/orcapod/data/system_constants.py diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index b4de8e1..b49b19c 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,45 +1,17 @@ from .data import DEFAULT_TRACKER_MANAGER +from .data.pods 
import function_pod, FunctionPod, CachedPod +from .data import streams +from .stores.delta_lake_stores import BasicDeltaTableArrowStore + no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking __all__ = [ "DEFAULT_TRACKER_MANAGER", "no_tracking", + "function_pod", + "FunctionPod", + "CachedPod", + "streams", + "BasicDeltaTableArrowStore", ] - -# from .core import operators, sources, streams -# from .core.streams import SyncStreamFromLists, SyncStreamFromGenerator -# from . import hashing, stores -# from .core.operators import Join, MapPackets, MapTags, packet, tag -# from .core.pod import FunctionPod, function_pod -# from .core.sources import GlobSource -# from .stores import DirDataStore, SafeDirDataStore -# from .core.tracker import GraphTracker -# from .pipeline import Pipeline - -# DEFAULT_TRACKER = GraphTracker() -# DEFAULT_TRACKER.activate() - - -# __all__ = [ -# "hashing", -# "stores", -# "pod", -# "operators", -# "streams", -# "sources", -# "MapTags", -# "MapPackets", -# "Join", -# "tag", -# "packet", -# "FunctionPod", -# "function_pod", -# "GlobSource", -# "DirDataStore", -# "SafeDirDataStore", -# "DEFAULT_TRACKER", -# "SyncStreamFromLists", -# "SyncStreamFromGenerator", -# "Pipeline", -# ] diff --git a/src/orcapod/data/__init__.py b/src/orcapod/data/__init__.py index 6d7e206..eb005c1 100644 --- a/src/orcapod/data/__init__.py +++ b/src/orcapod/data/__init__.py @@ -1 +1,7 @@ from .trackers import DEFAULT_TRACKER_MANAGER +from .system_constants import orcapod_constants as constants + +__all__ = [ + "DEFAULT_TRACKER_MANAGER", + "constants", +] diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py index cc47cff..85261d2 100644 --- a/src/orcapod/data/context.py +++ b/src/orcapod/data/context.py @@ -6,9 +6,6 @@ from dataclasses import dataclass -DATA_CONTEXT_COLUMN = "_orcapod_context_key" - - @dataclass class DataContext: context_key: str @@ -16,13 +13,6 @@ class DataContext: arrow_hasher: hp.ArrowHasher object_hasher: hp.ObjectHasher - @staticmethod - def get_data_context_column() -> str: - """ - Returns the column name used to store the data context key in Arrow tables. - """ - return DATA_CONTEXT_COLUMN - @staticmethod def resolve_data_context(data_context: "str | DataContext | None") -> "DataContext": """ diff --git a/src/orcapod/data/datagrams.py b/src/orcapod/data/datagrams.py deleted file mode 100644 index b506a56..0000000 --- a/src/orcapod/data/datagrams.py +++ /dev/null @@ -1,859 +0,0 @@ -""" -Data structures and utilities for working with datagrams in OrcaPod. - -This module provides classes and functions for handling packet-like data structures -that can represent data in various formats (Python dicts, Arrow tables, etc.) while -maintaining type information, source metadata, and semantic type conversion capability. - -Key classes: -- SemanticConverter: Converts between different data representations. Intended for internal use. -- DictDatagram: Immutable dict-based data structure -- PythonDictPacket: Python dict-based packet with source info -- ArrowPacket: Arrow table-based packet implementation -- PythonDictTag/ArrowTag: Tag implementations for data identification - -The module also provides utilities for schema validation, table operations, -and type conversions between semantic stores, Python stores, and Arrow tables. 
-""" - -from orcapod.types.core import DataValue -from typing import TypeAlias, Self -from collections.abc import Mapping, Collection -from orcapod.types import TypeSpec -from orcapod.types.semantic_converter import SemanticConverter -from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.types import schemas -from orcapod.types import typespec_utils as tsutils -from orcapod.data.context import ( - DataContext, -) -import pyarrow as pa -import logging -from orcapod.utils import arrow_utils - - -# Constants used for source info keys -SOURCE_INFO_PREFIX = "_source_info_" - - -logger = logging.getLogger(__name__) -# A conveniece packet-like type that defines a value that can be -# converted to a packet. It's broader than Packet and a simple mapping -# from string keys to DataValue (e.g., int, float, str) can be regarded -# as PacketLike, allowing for more flexible interfaces. -# Anything that requires Packet-like data but without the strict features -# of a Packet should accept PacketLike. -# One should be careful when using PacketLike as a return type as it does not -# enforce the typespec or source_info, which are important for packet integrity. -PacketLike: TypeAlias = Mapping[str, DataValue] - -PythonStore: TypeAlias = Mapping[str, DataValue] - - -class ImmutableDict(Mapping[str, DataValue]): - """ - An immutable dictionary-like container for DataValues. - - Provides a read-only view of a dictionary mapping strings to DataValues, - implementing the Mapping protocol for compatibility with dict-like operations. - - Initialize with data from a mapping. - Args: - data: Source mapping to copy data from - """ - - def __init__(self, data: Mapping[str, DataValue]): - self._data = dict(data) - - def __getitem__(self, key: str) -> DataValue: - return self._data[key] - - def __iter__(self): - return iter(self._data) - - def __len__(self) -> int: - return len(self._data) - - def __repr__(self) -> str: - return self._data.__repr__() - - def __str__(self) -> str: - return self._data.__str__() - - -# TODO: Inherit from Mapping instead to provide immutable datagram -class DictDatagram(ImmutableDict): - """ - An immutable datagram implementation using a dictionary backend. - - Extends ImmutableDict to provide additional functionality for type handling, - semantic conversion, and Arrow table representation while maintaining - immutability of the underlying data. - - - Initialize DictDatagram with data and optional type information. 
- - Args: - data: Source data mapping - typespec: Optional type specification for fields - semantic_converter: Optional converter for semantic types - semantic_type_registry: Registry for semantic type lookup - arrow_hasher: Optional hasher for Arrow table content - """ - - def __init__( - self, - data: Mapping[str, DataValue], - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - super().__init__(data) - # normalize the data content and remove any source info keys - self._data_context = DataContext.resolve_data_context(data_context) - - # combine provided typespec info with inferred typespec from content - inferred_typespec = tsutils.get_typespec_from_dict(self, typespec) - self._python_schema = schemas.PythonSchema(inferred_typespec) - - # create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - self._python_schema.to_semantic_schema( - semantic_type_registry=self._data_context.semantic_type_registry - ), - ) - self.semantic_converter = semantic_converter - - self._cached_table: pa.Table | None = None - self._cached_content_hash: str | None = None - self._cached_arrow_schema: pa.Schema | None = None - - @property - def data_context_key(self) -> str: - """Return the context key of the datagram.""" - return self._data_context.context_key - - def as_table(self, include_data_context: bool = False) -> pa.Table: - """Convert the packet to an Arrow table.""" - - if self._cached_table is None: - self._cached_table = self.semantic_converter.from_python_to_arrow( - self.as_dict(include_data_context=True), - self.types(include_data_context=True), - ) - assert self._cached_table is not None, "Cached table should not be None" - if include_data_context: - return self._cached_table - - return self._cached_table.drop([DataContext.get_data_context_column()]) - - def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: - """Return dictionary representation of the datagram.""" - data = dict(self) - if include_data_context: - data[DataContext.get_data_context_column()] = self._data_context.context_key - return data - - def content_hash( - self, - ) -> str: - """ - Calculate and return content hash of the datagram. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_data_context=False), - prefix_hasher_id=True, - ) - return self._cached_content_hash - - def types(self, include_data_context: bool = False) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = self._python_schema.copy() - if include_data_context: - schema[DataContext.get_data_context_column()] = str - return schema - - def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. 
- - Args: - include_data_context: Whether to include data context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - if self._cached_arrow_schema is None: - self._cached_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self.types(include_data_context=True) - ) - ) - if not include_data_context: - return arrow_utils.drop_schema_columns( - self._cached_arrow_schema, - [DataContext.get_data_context_column()], - ) - return self._cached_arrow_schema - - @classmethod - def _from_copy( - cls, - data: Mapping[str, DataValue], - python_schema: schemas.PythonSchema, - semantic_converter: SemanticConverter, - data_context: DataContext, - ) -> Self: - """Create a new instance from copy without full initialization.""" - instance = cls.__new__(cls) - ImmutableDict.__init__(instance, data) - - # Set attributes directly - instance._python_schema = python_schema - instance.semantic_converter = semantic_converter - instance._data_context = data_context - instance._cached_table = None - instance._cached_content_hash = None - - return instance - - def copy(self) -> Self: - """Return a copy of the datagram.""" - return self._from_copy( - self, - self._python_schema.copy(), - self.semantic_converter, - self._data_context, - ) - - -class DictTag(DictDatagram): - """ - A simple tag implementation using Python dictionary. - - Represents a tag (metadata) as a dictionary that can be converted - to different representations like Arrow tables. - """ - - -class DictPacket(DictDatagram): - """ - Enhanced packet implementation with source information support. - - Extends DictDatagram to include source information tracking and - enhanced table conversion capabilities that can include or exclude - source metadata. - - Initialize packet with data and optional source information. - - Args: - data: Primary data content - source_info: Optional mapping of field names to source information - typespec: Optional type specification - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types. Defaults to system default registry. - arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. 
- """ - - def __init__( - self, - data: Mapping[str, DataValue], - source_info: Mapping[str, str | None] | None = None, - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - # normalize the data content and remove any source info keys - data_only = { - k: v for k, v in data.items() if not k.startswith(SOURCE_INFO_PREFIX) - } - contained_source_info = { - k.removeprefix(SOURCE_INFO_PREFIX): v - for k, v in data.items() - if k.startswith(SOURCE_INFO_PREFIX) - } - - super().__init__( - data_only, - typespec=typespec, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._source_info = {**contained_source_info, **(source_info or {})} - self._cached_source_info_table: pa.Table | None = None - self._cached_source_info_schema: pa.Schema | None = None - - @property - def _source_info_schema(self) -> pa.Schema: - if self._cached_source_info_schema is None: - self._cached_source_info_schema = pa.schema( - {f"{SOURCE_INFO_PREFIX}{k}": pa.large_string() for k in self.keys()} - ) - return self._cached_source_info_schema - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - """Convert the packet to an Arrow table.""" - table = super().as_table(include_data_context=include_data_context) - if include_source: - if self._cached_source_info_table is None: - source_info_data = { - f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items() - } - self._cached_source_info_table = pa.Table.from_pylist( - [source_info_data], schema=self._source_info_schema - ) - assert self._cached_source_info_table is not None, ( - "Cached source info table should not be None" - ) - # subselect the corresponding _source_info as the columns present in the data table - source_info_table = self._cached_source_info_table.select( - [ - f"{SOURCE_INFO_PREFIX}{k}" - for k in table.column_names - if k in self.keys() - ] - ) - table = arrow_utils.hstack_tables(table, source_info_table) - return table - - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: - """ - Return dictionary representation. - - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - dict_copy = super().as_dict(include_data_context=include_data_context) - if include_source: - for key, value in self.source_info().items(): - dict_copy[f"{SOURCE_INFO_PREFIX}{key}"] = value - return dict_copy - - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types(include_data_context=include_data_context) - if include_source: - for key in self.keys(): - schema[f"{SOURCE_INFO_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. 
- - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema(include_data_context=include_data_context) - if include_source: - return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) - return schema - - def as_datagram(self, include_source: bool = False) -> DictDatagram: - """ - Convert the packet to a DictDatagram. - - Args: - include_source: Whether to include source info fields - - Returns: - DictDatagram representation of the packet - """ - data = self.as_dict(include_source=include_source) - typespec = self.types(include_source=include_source) - return DictDatagram( - data, - typespec=typespec, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. - - Returns: - Dictionary mapping field names to their source info - """ - return {key: self._source_info.get(key, None) for key in self.keys()} - - def copy(self) -> Self: - """Return a shallow copy of the packet.""" - instance = super().copy() - instance._source_info = self._source_info.copy() - instance._cached_source_info_table = self._cached_source_info_table - return instance - - -# def prepare_system_data_tables( -# table: pa.Table, -# source_info: dict[str, str | None] | None = None, -# ) -> tuple[pa.Table, pa.Table]: -# """ -# Process a table to ensure proper source_info columns. - -# Args: -# table: Input PyArrow table -# source_info: optional dictionary mapping column names to source info values. If present, -# it will take precedence over existing source_info columns in the table. 
- -# Returns: -# tuple of table without any source info and another table only containing source info columns (with prefix) -# """ -# if source_info is None: -# source_info = {} - -# # Step 1: Separate source_info columns from regular columns -# data_columns = [] -# data_column_names = [] -# existing_source_info = {} - -# for i, name in enumerate(table.column_names): -# if name.startswith(SOURCE_INFO_PREFIX): -# # Extract the base column name -# base_name = name.removeprefix(SOURCE_INFO_PREFIX) -# existing_source_info[base_name] = table.column(i) -# else: -# data_columns.append(table.column(i)) -# data_column_names.append(name) - -# # Step 2: Create source_info columns for each regular column -# source_info_columns = [] -# source_info_column_names = [] - -# # Create source_info columns for each regular column -# num_rows = table.num_rows - -# for col_name in data_column_names: -# source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" - -# # if col_name is in source_info, use that value -# if col_name in source_info: -# # Use value from source_info dictionary -# source_value = source_info[col_name] -# source_values = pa.array([source_value] * num_rows, type=pa.large_string()) -# # if col_name is in existing_source_info, use that column -# elif col_name in existing_source_info: -# # Use existing source_info column, but convert to large_string -# existing_col = existing_source_info[col_name] -# if existing_col.type == pa.large_string(): -# source_values = existing_col -# else: -# # Convert to large_string -# source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore - -# else: -# # Use null values -# source_values = pa.array([None] * num_rows, type=pa.large_string()) - -# source_info_columns.append(source_values) -# source_info_column_names.append(source_info_col_name) - -# # Step 3: Create the final table -# data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) -# source_info_table: pa.Table = pa.Table.from_arrays( -# source_info_columns, names=source_info_column_names -# ) -# return data_table, source_info_table - - -class ArrowDatagram: - """ - An immutable datagram implementation using a PyArrow Table backend. - TODO: handle RecordBatch in addition to table - - This basic datagram provides functionality for type handling, - semantic conversion, and dict-based content representation while maintaining - immutability of the underlying data. - - - Initialize ArrowDatagram with a PyArrow table. - - Args: - data: Source data mapping - typespec: Optional type specification for fields - semantic_converter: Optional converter for semantic types - semantic_type_registry: Registry for semantic type lookup - arrow_hasher: Optional hasher for Arrow table content - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - # normalize the table to ensure it contains proper source columns - if len(table) != 1: - raise ValueError( - "Table must contain exactly one row to be a valid datagram." 
- ) - - # TODO: add check for compatible types, especially of str being pa.large_string - table, data_context_table = arrow_utils.split_by_column_groups( - table, [DataContext.get_data_context_column()] - ) - - self._table = table - - if data_context is None and data_context_table is not None: - data_context = data_context_table[ - DataContext.get_data_context_column() - ].to_pylist()[0] - - self._data_context = DataContext.resolve_data_context(data_context) - - data_context_schema = pa.schema( - {DataContext.get_data_context_column(): pa.large_string()} - ) - self._data_context_table = pa.Table.from_pylist( - [{DataContext.get_data_context_column(): self._data_context.context_key}], - schema=data_context_schema, - ) - - # create semantic converter - # TODO: consider some validation of passed semantic_converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema( - self._table.schema, - self._data_context.semantic_type_registry, - ) - ) - self._semantic_converter = semantic_converter - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_python_dict: dict[str, DataValue] | None = None - self._cached_content_hash: str | None = None - - @property - def data_context_key(self) -> str: - """Return the context key of the datagram.""" - return self._data_context.context_key - - def as_table(self, include_data_context: bool = False) -> pa.Table: - """Convert the packet to an Arrow table.""" - if include_data_context: - return arrow_utils.hstack_tables(self._table, self._data_context_table) - return self._table - - def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: - """Return dictionary representation of the datagram.""" - if self._cached_python_dict is None: - self._cached_python_dict = self._semantic_converter.from_arrow_to_python( - self.as_table(include_data_context=False) - )[0] - assert self._cached_python_dict is not None, "Cached dict should not be None" - output = dict(self._cached_python_dict) - if include_data_context: - output[DataContext.get_data_context_column()] = ( - self._data_context.context_key - ) - return output - - def content_hash( - self, - ) -> str: - """ - Calculate and return content hash of the datagram. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_data_context=False), - prefix_hasher_id=True, - ) - return self._cached_content_hash - - def keys(self) -> tuple[str, ...]: - return tuple(self._table.column_names) - - def types(self, include_data_context: bool = False) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - if self._cached_python_schema is None: - self._cached_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema(self._table.schema) - ) - schema = self._cached_python_schema.copy() - if include_data_context: - schema[DataContext.get_data_context_column()] = str - return schema - - def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. 
- - Args: - include_data_context: Whether to include data context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - if include_data_context: - return arrow_utils.join_arrow_schemas( - self._table.schema, self._data_context_table.schema - ) - return self._table.schema - - @classmethod - def _from_copy( - cls, - table: pa.Table, - python_schema: schemas.PythonSchema, - semantic_converter: SemanticConverter, - arrow_hasher: hp.ArrowHasher, - ) -> Self: - """Create a new instance from copy without full initialization.""" - instance = cls.__new__(cls) - instance._table = table - instance._semantic_converter = semantic_converter - instance._data_context = arrow_hasher - - # Set attributes directly - instance._cached_content_hash = None - - return instance - - def copy(self) -> Self: - """Return a copy of the datagram.""" - new_datagram = self.__class__( - self._table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_datagram._cached_python_schema = self._cached_python_schema - new_datagram._cached_python_dict = self._cached_python_dict - new_datagram._cached_python_dict = self._cached_python_dict - return new_datagram - - def __repr__(self) -> str: - """Return string representation.""" - return f"{self.as_dict()}" - - -class ArrowTag(ArrowDatagram): - """ - A tag implementation using Arrow table backend. - - Represents a single-row Arrow table that can be converted to Python - dictionary representation while caching computed values for efficiency. - - Initialize with an Arrow table. - - Args: - table: Single-row Arrow table representing the tag - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(table) != 1: - raise ValueError( - "ArrowTag should only contain a single row, " - "as it represents a single tag." - ) - super().__init__( - table=table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - -class ArrowPacket(ArrowDatagram): - """ - Arrow table-based packet implementation with comprehensive features. - - A packet implementation that uses Arrow tables as the primary storage format, - providing efficient memory usage and columnar data operations while supporting - source information tracking and content hashing. - - - Initialize ArrowPacket with Arrow table and configuration. - - Args: - table: Single-row Arrow table representing the packet - source_info: Optional source information mapping - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - finger_print: Optional fingerprint for tracking - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - skip_source_info_extraction: Whether to skip source info processing - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - data: pa.Table, - source_info: dict[str, str | None] | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(data) != 1: - raise ValueError( - "ArrowPacket should only contain a single row, " - "as it represents a single packet." 
- ) - if source_info is None: - source_info = {} - - # normalize the table to ensure it has the expected source_info columns - data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( - data, - {SOURCE_INFO_PREFIX: source_info}, - exclude_columns=[DataContext.get_data_context_column()], - ) - self._source_info_table = prefixed_tables[SOURCE_INFO_PREFIX] - - super().__init__( - data_table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._cached_source_info: dict[str, str | None] | None = None - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_content_hash: str | None = None - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - table = super().as_table(include_data_context=include_data_context) - if include_source: - # add source_info only for existing data columns - table = arrow_utils.hstack_tables( - table, - self._source_info_table.select( - [ - f"{SOURCE_INFO_PREFIX}{c}" - for c in table.column_names - if c in self.keys() - ] - ), - ) - return table - - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types(include_data_context=include_data_context) - if include_source: - for key in self.keys(): - schema[f"{SOURCE_INFO_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema(include_data_context=include_data_context) - if include_source: - return arrow_utils.join_arrow_schemas( - schema, self._source_info_table.schema - ) - return schema - - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: - """ - Convert to dictionary representation. - - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - return_dict = super().as_dict(include_data_context=include_data_context) - if include_source: - return_dict.update( - {f"{SOURCE_INFO_PREFIX}{k}": v for k, v in self.source_info().items()} - ) - return return_dict - - def as_datagram(self, include_source: bool = False) -> ArrowDatagram: - table = self.as_table(include_source=include_source) - return ArrowDatagram( - table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. 
- - Returns: - Copy of the dictionary mapping field names to their source info - """ - if self._cached_source_info is None: - self._cached_source_info = { - k.removeprefix(SOURCE_INFO_PREFIX): v - for k, v in self._source_info_table.to_pylist()[0].items() - } - return self._cached_source_info.copy() - - def copy(self) -> Self: - # TODO: restructure copy to allow for better inheritance and expansion - new_packet = self.__class__( - self.as_table(), - self.source_info(), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_packet._cached_source_info = self._cached_source_info - new_packet._cached_python_dict = self._cached_python_dict - new_packet._cached_python_schema = self._cached_python_schema - new_packet._cached_content_hash = self._cached_content_hash - - return new_packet - - -# a batch is a tuple of a tag and a list of packets -Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] -"""Type alias for a batch: a tuple containing a tag and collection of packets.""" diff --git a/src/orcapod/data/datagrams/__init__.py b/src/orcapod/data/datagrams/__init__.py new file mode 100644 index 0000000..0c255e3 --- /dev/null +++ b/src/orcapod/data/datagrams/__init__.py @@ -0,0 +1,13 @@ +from .arrow_datagram import ArrowDatagram +from .arrow_tag_packet import ArrowTag, ArrowPacket +from .dict_datagram import DictDatagram +from .dict_tag_packet import DictTag, DictPacket + +__all__ = [ + "ArrowDatagram", + "ArrowTag", + "ArrowPacket", + "DictDatagram", + "DictTag", + "DictPacket", +] diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py new file mode 100644 index 0000000..5ed5307 --- /dev/null +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -0,0 +1,867 @@ +import logging +from collections.abc import Collection, Iterator, Mapping +from typing import Any, Self + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.data.datagrams.base import BaseDatagram +from orcapod.types import schemas, typespec_utils +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +logger = logging.getLogger(__name__) + + +class ArrowDatagram(BaseDatagram): + """ + Immutable datagram implementation using PyArrow Table as storage backend. + + This implementation provides high-performance columnar data operations while + maintaining the datagram interface. It efficiently handles type conversions, + semantic processing, and interoperability with Arrow-based tools. + + The underlying table is split into separate components: + - Data table: Primary business data columns + - Meta table: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context table: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> table = pa.Table.from_pydict({ + ... "user_id": [123], + ... "name": ["Alice"], + ... "__pipeline_version": ["v2.1.0"], + ... "{orcapod.CONTEXT_KEY}": ["financial_v1"] + ... 
}) + >>> datagram = ArrowDatagram(table) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize ArrowDatagram from PyArrow Table. + + Args: + table: PyArrow Table containing the data. Must have exactly one row. + semantic_converter: Optional converter for semantic type handling. + If None, will be created based on the data context and table schema. + data_context: Context key string or DataContext object. + If None and table contains context column, will extract from table. + + Raises: + ValueError: If table doesn't contain exactly one row. + + Note: + The input table is automatically split into data, meta, and context + components based on column naming conventions. + """ + # Validate table has exactly one row for datagram + if len(table) != 1: + raise ValueError( + "Table must contain exactly one row to be a valid datagram." + ) + + # Split table into data, meta, and context components + context_columns = ( + [constants.CONTEXT_KEY] + if constants.CONTEXT_KEY in table.column_names + else [] + ) + meta_columns = [ + col for col in table.column_names if col.startswith(constants.META_PREFIX) + ] + + # Extract context table if present + if constants.CONTEXT_KEY in table.column_names and data_context is None: + context_table = table.select([constants.CONTEXT_KEY]) + data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] + + # Initialize base class with data context + super().__init__(data_context) + + # Split table into components + self._data_table = table.drop_columns(context_columns + meta_columns) + self._meta_table = table.select(meta_columns) if meta_columns else None + if len(self._data_table.column_names) == 0: + raise ValueError("Data table must contain at least one data column.") + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + self._data_table.schema, + self._data_context.semantic_type_registry, + ) + ) + self._semantic_converter = semantic_converter + + # Create data context table + data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) + self._data_context_table = pa.Table.from_pylist( + [{constants.CONTEXT_KEY: self._data_context.context_key}], + schema=data_context_schema, + ) + + # Initialize caches + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_dict: dict[str, DataValue] | None = None + self._cached_meta_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + def _core_info(self) -> dict[str, Any]: + core_info = { + "data_table": self._data_table, + "meta_table": self._meta_table, + "data_context_table": self._data_context_table, + "semantic_converter": self._semantic_converter, + "cached_python_schema": self._cached_python_schema, + "cached_python_dict": self._cached_python_dict, + "cached_meta_python_schema": self._cached_meta_python_schema, + "cached_content_hash": self._cached_content_hash, + } + return core_info + + def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: + new_copy = object.__new__(self.__class__) + new_copy._data_table = core_info["data_table"] + new_copy._meta_table = core_info["meta_table"] + new_copy._data_context_table = core_info["data_context_table"] + new_copy._semantic_converter = core_info["semantic_converter"] + 
new_copy._cached_python_schema = core_info["cached_python_schema"] + new_copy._cached_python_dict = core_info["cached_python_dict"] + new_copy._cached_meta_python_schema = core_info["cached_meta_python_schema"] + new_copy._cached_content_hash = core_info["cached_content_hash"] + return new_copy + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + if self._meta_table is None: + return () + return tuple(self._meta_table.column_names) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data_table.column_names: + raise KeyError(f"Data column '{key}' not found") + + return self._data_table[key].to_pylist()[0] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data_table.column_names + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data_table.column_names) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + if key in self._data_table.column_names: + return self.as_dict()[key] + return default + + # 3. Structural Information + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + # Start with data columns + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + result_keys = list(self._data_table.column_names) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. 
+ - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Get data schema (cached) + if self._cached_python_schema is None: + self._cached_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._data_table.schema + ) + ) + + schema = dict(self._cached_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if self._cached_meta_python_schema is None: + self._cached_meta_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._meta_table.schema + ) + ) + meta_schema = dict(self._cached_meta_python_schema) + if include_meta_columns is True: + schema.update(meta_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in meta_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + # order matters + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + all_schemas = [self._data_table.schema] + + # Add context schema if requested + if include_context: + # TODO: reassess the efficiency of this approach + all_schemas.append(self._data_context_table.schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_schema = self._meta_table.schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._meta_table.schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self._data_table, + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. 
Format Conversions (Export) + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Get data dict (cached) + if self._cached_python_dict is None: + self._cached_python_dict = self._semantic_converter.from_arrow_to_python( + self._data_table + )[0] + + result_dict = dict(self._cached_python_dict) + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta data if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_dict = self._meta_table.to_pylist()[0] + elif isinstance(include_meta_columns, Collection): + meta_dict = self._meta_table.to_pylist()[0] + # Include only meta columns matching prefixes + meta_dict = { + k: v + for k, v in meta_dict.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + if meta_dict is not None: + result_dict.update(meta_dict) + + return result_dict + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + all_tables = [self._data_table] + + # Add context if requested + if include_context: + all_tables.append(self._data_context_table) + + # Add meta columns if requested + if include_meta_columns and self._meta_table is not None: + meta_table = None + if include_meta_columns is True: + meta_table = self._meta_table + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + # ensure all given prefixes start with the meta prefix + prefixes = ( + f"{constants.META_PREFIX}{prefix}" + if not prefix.startswith(constants.META_PREFIX) + else prefix + for prefix in include_meta_columns + ) + + matched_cols = [ + col + for col in self._meta_table.column_names + if any(col.startswith(prefix) for prefix in prefixes) + ] + if matched_cols: + meta_table = self._meta_table.select(matched_cols) + else: + meta_table = None + + if meta_table is not None: + all_tables.append(meta_table) + + return arrow_utils.hstack_tables(*all_tables) + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get a meta column value. 
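+
+        Example (illustrative; assumes the hypothetical "__pipeline_version"
+        meta column from the class-level docstring):
+
+            >>> datagram.get_meta_value("pipeline_version")
+            'v2.1.0'
+            >>> datagram.get_meta_value("missing_key", default="n/a")
+            'n/a'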
+ + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix) + default: Default value if not found + + Returns: + Meta column value + """ + if self._meta_table is None: + return default + + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + if key not in self._meta_table.column_names: + return default + + return self._meta_table[key].to_pylist()[0] + + def with_meta_columns(self, **meta_updates: DataValue) -> Self: + """ + Create a new ArrowDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New ArrowDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + meta_dict = {} + if self._meta_table is not None: + meta_dict = self._meta_table.to_pylist()[0] + + # Apply updates + meta_dict.update(prefixed_updates) + + # Create new meta table + new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None + + # Combine all tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: + """ + Create a new ArrowDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. + + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + + Returns: + New ArrowDatagram instance without specified meta columns + """ + if self._meta_table is None: + return self # No meta columns to drop + + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_table.column_names) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter meta columns + remaining_cols = [ + col for col in self._meta_table.column_names if col not in prefixed_keys + ] + + # Create new meta table + new_meta_table = ( + self._meta_table.select(remaining_cols) if remaining_cols else None + ) + + # Combine tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> Self: + """ + Create a new ArrowDatagram with only specified data columns. + Maintains immutability by returning a new instance. 
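+
+        Example (illustrative; column names follow the hypothetical table in
+        the class-level docstring):
+
+            >>> names_only = datagram.select("name")
+            >>> print(names_only)
+            {'name': 'Alice'}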
+ + Args: + *column_names: Data column names to keep + + Returns: + New ArrowDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data_table.column_names) + if missing_cols: + raise ValueError(f"Columns not found: {missing_cols}") + + new_data_table = self._data_table.select(list(column_names)) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: + """ + Create a new ArrowDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New ArrowDatagram instance without specified data columns + """ + + # Filter out specified data columns + missing = set(column_names) - set(self._data_table.column_names) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + # Filter data columns + remaining_cols = [ + col for col in self._data_table.column_names if col not in column_names + ] + + if not remaining_cols: + raise ValueError("Cannot drop all data columns") + + new_data_table = self._data_table.select(remaining_cols) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """ + Create a new ArrowDatagram with data columns renamed. + Maintains immutability by returning a new instance. + + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New ArrowDatagram instance with renamed data columns + """ + # Create new schema with renamed fields, preserving original types + new_fields = [] + for field in self._data_table.schema: + old_name = field.name + new_name = column_mapping.get(old_name, old_name) + new_field = pa.field(new_name, field.type) + new_fields.append(new_field) + + # Create new data table with renamed columns + new_schema = pa.schema(new_fields) + new_data_table = self._data_table.rename_columns( + [column_mapping.get(name, name) for name in self._data_table.column_names] + ).cast(new_schema) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def update(self, **updates: DataValue) -> Self: + """ + Create a new ArrowDatagram with specific column values updated. 
+
+        Args:
+            **updates: Column names and their new values
+
+        Returns:
+            New ArrowDatagram instance with updated values
+
+        Raises:
+            KeyError: If any specified column doesn't exist
+
+        Example:
+            # Convert relative path to absolute path
+            updated = datagram.update(file_path="/absolute/path/to/file.txt")
+
+            # Update multiple values
+            updated = datagram.update(status="processed", file_path="/new/path")
+        """
+        # Only update if there are columns to update
+        if not updates:
+            return self
+
+        # Validate all columns exist
+        missing_cols = set(updates.keys()) - set(self._data_table.column_names)
+        if missing_cols:
+            raise KeyError(
+                f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}"
+            )
+
+        updates_typespec = schemas.PythonSchema(
+            {k: v for k, v in self.types().items() if k in updates}
+        )
+
+        update_table = self._semantic_converter.from_python_to_arrow(
+            updates, updates_typespec
+        )
+        all_tables = [self._data_table.drop_columns(list(updates.keys())), update_table]
+
+        if self._meta_table is not None:
+            all_tables.append(self._meta_table)
+
+        return self.__class__(
+            table=arrow_utils.hstack_tables(*all_tables),
+            semantic_converter=self._semantic_converter,
+            data_context=self._data_context,
+        )
+
+    def with_columns(
+        self,
+        column_types: Mapping[str, type] | None = None,
+        **updates: DataValue,
+    ) -> Self:
+        """
+        Create a new ArrowDatagram with new data columns added.
+        Maintains immutability by returning a new instance.
+
+        Args:
+            column_types: Optional type specifications for the new columns
+            **updates: New data columns given as keyword arguments
+
+        Returns:
+            New ArrowDatagram instance with new data columns added
+
+        Raises:
+            ValueError: If any column already exists (use update() instead)
+        """
+        # Nothing to add if no new columns were provided
+        if not updates:
+            return self
+
+        # Error if any column already exists
+        existing_overlaps = set(updates.keys()) & set(self._data_table.column_names)
+        if existing_overlaps:
+            raise ValueError(
+                f"Columns already exist: {sorted(existing_overlaps)}. "
+                f"Use update() to modify existing columns."
+            )
+
+        # TODO: consider simplifying this conversion logic
+        typespec = typespec_utils.get_typespec_from_dict(updates, column_types)
+
+        updates_converter = SemanticConverter.from_semantic_schema(
+            schemas.SemanticSchema.from_typespec(
+                typespec, self._data_context.semantic_type_registry
+            )
+        )
+        # TODO: cleanup the handling of typespec python schema and various conversion points
+        new_data_table = updates_converter.from_python_to_arrow(updates, typespec)
+
+        # Combine with meta table for reconstruction
+        all_tables = [self._data_table, new_data_table]
+        if self._meta_table is not None:
+            all_tables.append(self._meta_table)
+
+        combined_table = arrow_utils.hstack_tables(*all_tables)
+
+        # prepare the joined converter
+        total_converter = self._semantic_converter.join(updates_converter)
+
+        return self.__class__(
+            table=combined_table,
+            semantic_converter=total_converter,
+            data_context=self._data_context,
+        )
+
+    # 7. Context Operations
+    def with_context_key(self, new_context_key: str) -> Self:
+        """
+        Create a new ArrowDatagram with a different data context key.
+        Maintains immutability by returning a new instance.
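+
+        Example (illustrative; the context key value is hypothetical):
+
+            >>> relabeled = datagram.with_context_key("std:v1.0.0:abc123")
+            >>> print(relabeled)  # data columns are unchanged
+            {'user_id': 123, 'name': 'Alice'}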
+ + Args: + new_context_key: New data context key string + + Returns: + New ArrowDatagram instance with new context + """ + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return self.__class__( + table=combined_table, + data_context=new_context_key, + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """Return a copy of the datagram.""" + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + new_datagram = self.__class__( + combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_content_hash = self._cached_content_hash + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + + Example: + >>> str(datagram) + "{'user_id': 123, 'name': 'Alice'}" + >>> print(datagram) + {'user_id': 123, 'name': 'Alice'} + """ + return str(self.as_dict()) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. + + Example: + >>> repr(datagram) + "ArrowDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')" + """ + data_dict = self.as_dict() + meta_count = len(self.meta_columns) + context_key = self.data_context_key + + return ( + f"{self.__class__.__name__}(" + f"data={data_dict}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py new file mode 100644 index 0000000..f776365 --- /dev/null +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -0,0 +1,268 @@ +import logging +from collections.abc import Collection +from typing import Self + + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.types import schemas +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +from orcapod.data.datagrams.arrow_datagram import ArrowDatagram + +logger = logging.getLogger(__name__) + + +class ArrowTag(ArrowDatagram): + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. 
+
+    Args:
+        table: Single-row Arrow table representing the tag
+
+    Raises:
+        ValueError: If table doesn't contain exactly one row
+    """
+
+    def __init__(
+        self,
+        table: pa.Table,
+        semantic_converter: SemanticConverter | None = None,
+        data_context: str | DataContext | None = None,
+    ) -> None:
+        if len(table) != 1:
+            raise ValueError(
+                "ArrowTag should only contain a single row, "
+                "as it represents a single tag."
+            )
+        super().__init__(
+            table=table,
+            semantic_converter=semantic_converter,
+            data_context=data_context,
+        )
+
+
+class ArrowPacket(ArrowDatagram):
+    """
+    Arrow table-based packet implementation with comprehensive features.
+
+    A packet implementation that uses Arrow tables as the primary storage format,
+    providing efficient memory usage and columnar data operations while supporting
+    source information tracking and content hashing.
+
+    Initialize ArrowPacket with Arrow table and configuration.
+
+    Args:
+        table: Single-row Arrow table representing the packet
+        source_info: Optional source information mapping
+        semantic_converter: Optional semantic converter
+        data_context: Optional data context (key or DataContext) used for
+            semantic type resolution
+
+    Raises:
+        ValueError: If table doesn't contain exactly one row
+    """
+
+    def __init__(
+        self,
+        table: pa.Table,
+        source_info: dict[str, str | None] | None = None,
+        semantic_converter: SemanticConverter | None = None,
+        data_context: str | DataContext | None = None,
+    ) -> None:
+        if len(table) != 1:
+            raise ValueError(
+                "ArrowPacket should only contain a single row, "
+                "as it represents a single packet."
+ ) + if source_info is None: + source_info = {} + + # normalize the table to ensure it has the expected source_info columns + data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( + table, + {constants.SOURCE_PREFIX: source_info}, + exclude_columns=[constants.CONTEXT_KEY], + exclude_prefixes=[constants.META_PREFIX], + ) + self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] + + super().__init__( + data_table, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._cached_source_info: dict[str, str | None] | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> tuple[str, ...]: + keys = super().keys( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + keys += tuple(f"{constants.SOURCE_PREFIX}{k}" for k in self.keys()) + return keys + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + return arrow_utils.join_arrow_schemas( + schema, self._source_info_table.schema + ) + return schema + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: + """ + Convert to dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + return_dict = super().as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + return_dict.update( + { + f"{constants.SOURCE_PREFIX}{k}": v + for k, v in self.source_info().items() + } + ) + return return_dict + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + table = super().as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + # add source_info only for existing data columns + table = arrow_utils.hstack_tables(table, self._source_info_table) + return table + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> ArrowDatagram: + table = self.as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_source=include_source, + ) + return ArrowDatagram( + table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Copy of the dictionary mapping field names to their source info + """ + if self._cached_source_info is None: + self._cached_source_info = { + k.removeprefix(constants.SOURCE_PREFIX): v + for k, v in self._source_info_table.to_pylist()[0].items() + } + return self._cached_source_info.copy() + + def copy(self) -> Self: + # TODO: restructure copy to allow for better inheritance and expansion + new_packet = self.__class__( + self.as_table(), + self.source_info(), + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + new_packet._cached_source_info = self._cached_source_info + new_packet._cached_python_dict = self._cached_python_dict + new_packet._cached_python_schema = self._cached_python_schema + new_packet._cached_content_hash = self._cached_content_hash + + return new_packet diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py new file mode 100644 index 0000000..0ec1501 --- /dev/null +++ b/src/orcapod/data/datagrams/base.py @@ -0,0 +1,301 @@ +""" +Data structures and utilities for working with datagrams in OrcaPod. + +This module provides classes and functions for handling packet-like data structures +that can represent data in various formats (Python dicts, Arrow tables, etc.) while +maintaining type information, source metadata, and semantic type conversion capability. + +Key classes: +- SemanticConverter: Converts between different data representations. Intended for internal use. +- DictDatagram: Immutable dict-based data structure +- PythonDictPacket: Python dict-based packet with source info +- ArrowPacket: Arrow table-based packet implementation +- PythonDictTag/ArrowTag: Tag implementations for data identification + +The module also provides utilities for schema validation, table operations, +and type conversions between semantic stores, Python stores, and Arrow tables. 
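+
+Illustrative usage (a minimal sketch; assumes the concrete DictDatagram
+implementation in orcapod.data.datagrams.dict_datagram and the default data
+context):
+
+    >>> from orcapod.data.datagrams.dict_datagram import DictDatagram
+    >>> dg = DictDatagram({"user_id": 123, "__run_id": "r42"})
+    >>> dg.keys()
+    ('user_id',)
+    >>> dg.meta_columns
+    ('__run_id',)
+    >>> dg.update(user_id=456)["user_id"]
+    456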
+""" + +import logging +from abc import ABC, abstractmethod +from collections.abc import Collection, Iterator, Mapping +from typing import Any, Self, TypeAlias + +import pyarrow as pa + +from orcapod.data.context import ( + DataContext, +) +from orcapod.types import TypeSpec +from orcapod.types.core import DataValue + +logger = logging.getLogger(__name__) + +# A conveniece packet-like type that defines a value that can be +# converted to a packet. It's broader than Packet and a simple mapping +# from string keys to DataValue (e.g., int, float, str) can be regarded +# as PacketLike, allowing for more flexible interfaces. +# Anything that requires Packet-like data but without the strict features +# of a Packet should accept PacketLike. +# One should be careful when using PacketLike as a return type as it does not +# enforce the typespec or source_info, which are important for packet integrity. +PacketLike: TypeAlias = Mapping[str, DataValue] + +PythonStore: TypeAlias = Mapping[str, DataValue] + + +class ImmutableDict(Mapping[str, DataValue]): + """ + An immutable dictionary-like container for DataValues. + + Provides a read-only view of a dictionary mapping strings to DataValues, + implementing the Mapping protocol for compatibility with dict-like operations. + + Initialize with data from a mapping. + Args: + data: Source mapping to copy data from + """ + + def __init__(self, data: Mapping[str, DataValue]): + self._data = dict(data) + + def __getitem__(self, key: str) -> DataValue: + return self._data[key] + + def __iter__(self): + return iter(self._data) + + def __len__(self) -> int: + return len(self._data) + + def __repr__(self) -> str: + return self._data.__repr__() + + def __str__(self) -> str: + return self._data.__str__() + + def __or__(self, other: Mapping[str, DataValue]) -> Self: + """ + Create a new ImmutableDict by merging with another mapping. + + Args: + other: Another mapping to merge with + + Returns: + A new ImmutableDict containing the combined data + """ + return self.__class__(self._data | dict(other)) + + +def contains_prefix_from(column: str, prefixes: Collection[str]) -> bool: + """ + Check if a column name matches any of the given prefixes. + + Args: + column: Column name to check + prefixes: Collection of prefixes to match against + + Returns: + True if the column starts with any of the prefixes, False otherwise + """ + for prefix in prefixes: + if column.startswith(prefix): + return True + return False + + +class BaseDatagram(ABC): + """ + Abstract base class for immutable datagram implementations. + + Provides shared functionality and enforces consistent interface across + different storage backends (dict, Arrow table, etc.). Concrete subclasses + must implement the abstract methods to handle their specific storage format. + + The base class only manages the data context key string - how that key + is interpreted and used is left to concrete implementations. + """ + + def __init__(self, data_context: DataContext | str | None = None) -> None: + """ + Initialize base datagram with data context. + + Args: + data_context: Context for semantic interpretation. Can be a string key + or a DataContext object, or None for default. + """ + self._data_context = DataContext.resolve_data_context(data_context) + + # 1. 
Core Properties (Identity & Structure) + @property + def data_context_key(self) -> str: + """Return the data context key.""" + return self._data_context.context_key + + @property + @abstractmethod + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + ... + + # 2. Dict-like Interface (Data Access) + @abstractmethod + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + ... + + @abstractmethod + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + ... + + @abstractmethod + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + ... + + @abstractmethod + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + ... + + # 3. Structural Information + @abstractmethod + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + ... + + @abstractmethod + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> TypeSpec: + """Return type specification for the datagram.""" + ... + + @abstractmethod + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """Return the PyArrow schema for this datagram.""" + ... + + @abstractmethod + def content_hash(self) -> str: + """Calculate and return content hash of the datagram.""" + ... + + # 4. Format Conversions (Export) + @abstractmethod + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" + ... + + @abstractmethod + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """Convert the datagram to an Arrow table.""" + ... + + # 5. Meta Column Operations + @abstractmethod + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """Get a meta column value.""" + ... + + @abstractmethod + def with_meta_columns(self, **updates: DataValue) -> Self: + """Create new datagram with updated meta columns.""" + ... + + @abstractmethod + def drop_meta_columns(self, *keys: str) -> Self: + """Create new datagram with specified meta columns removed.""" + ... + + # 6. Data Column Operations + @abstractmethod + def select(self, *column_names: str) -> Self: + """Create new datagram with only specified data columns.""" + ... + + @abstractmethod + def drop(self, *column_names: str) -> Self: + """Create new datagram with specified data columns removed.""" + ... + + @abstractmethod + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """Create new datagram with data columns renamed.""" + ... + + @abstractmethod + def update(self, **updates: DataValue) -> Self: + """Create new datagram with existing column values updated.""" + ... + + @abstractmethod + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """Create new datagram with additional data columns.""" + ... + + # 7. 
Context Operations + @abstractmethod + def with_context_key(self, new_context_key: str) -> Self: + """Create new datagram with different data context.""" + ... + + # 8. Utility Operations + @abstractmethod + def copy(self) -> Self: + """Create a shallow copy of the datagram.""" + ... + + @abstractmethod + def _core_info(self) -> dict[str, Any]: + """ + Return core information about the datagram. + This is meant to be used for internal purposes only and is not part of the public API. + It provides necessary information to create an efficient copy of the datagram + and in a manner that works across inheritance hierarchies. + + Returns: + Dictionary with all information necessary to recreate the datagram in a copy. + """ + ... + + @abstractmethod + def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: + """ + Create a new datagram instance from core information. + + Args: + core_info: Dictionary with core information about the datagram + + Returns: + New datagram instance + """ + ... diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py new file mode 100644 index 0000000..5ebd926 --- /dev/null +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -0,0 +1,835 @@ +import logging +from collections.abc import Collection, Iterator, Mapping +from typing import Self, cast + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.data.datagrams.base import BaseDatagram +from orcapod.types import TypeSpec, schemas +from orcapod.types import typespec_utils as tsutils +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +logger = logging.getLogger(__name__) + + +class DictDatagram(BaseDatagram): + """ + Immutable datagram implementation using dictionary as storage backend. + + This implementation uses composition (not inheritance from Mapping) to maintain + control over the interface while leveraging dictionary efficiency for data access. + Provides clean separation between data, meta, and context components. + + The underlying data is split into separate components: + - Data dict: Primary business data columns + - Meta dict: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> data = {{ + ... "user_id": 123, + ... "name": "Alice", + ... "__pipeline_version": "v2.1.0", + ... "{orcapod.CONTEXT_KEY}": "financial_v1" + ... }} + >>> datagram = DictDatagram(data) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + data: Mapping[str, DataValue], + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize DictDatagram from dictionary data. + + Args: + data: Source data mapping containing all column data. + typespec: Optional type specification for fields. + semantic_converter: Optional converter for semantic type handling. 
+ If None, will be created based on data context and inferred types. + data_context: Data context for semantic type resolution. + If None and data contains context column, will extract from data. + + Note: + The input data is automatically split into data, meta, and context + components based on column naming conventions. + """ + # Parse through data and extract different column types + data_columns = {} + meta_columns = {} + extracted_context = None + + for k, v in data.items(): + if k == constants.CONTEXT_KEY: + # Extract data context but keep it separate from meta data + if data_context is None: + extracted_context = v + # Don't store context in meta_data - it's managed separately + elif k.startswith(constants.META_PREFIX): + # Double underscore = meta metadata + meta_columns[k] = v + else: + # Everything else = user data (including _source_ and semantic types) + data_columns[k] = v + + # Initialize base class with data context + final_context = data_context or cast(str, extracted_context) + super().__init__(final_context) + + # Store data and meta components separately (immutable) + self._data = dict(data_columns) + self._meta_data = dict(meta_columns) + + # Combine provided typespec info with inferred typespec from content + # If the column value is None and no type spec is provided, defaults to str. + self._data_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._data, + typespec, + ) + ) + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + self._data_python_schema.to_semantic_schema( + semantic_type_registry=self._data_context.semantic_type_registry + ), + ) + self.semantic_converter = semantic_converter + + # Create schema for meta data + self._meta_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._meta_data, + typespec=typespec, + ) + ) + + # Initialize caches + self._cached_data_table: pa.Table | None = None + self._cached_meta_table: pa.Table | None = None + self._cached_content_hash: str | None = None + self._cached_data_arrow_schema: pa.Schema | None = None + self._cached_meta_arrow_schema: pa.Schema | None = None + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + return tuple(self._meta_data.keys()) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data: + raise KeyError(f"Data column '{key}' not found") + return self._data[key] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + return self._data.get(key, default) + + # 3. 
Structural Information + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + # Start with data columns + result_keys = list(self._data.keys()) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. + - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Start with data schema + schema = dict(self._data_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + schema.update(self._meta_python_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in self._meta_python_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + include_meta_columns = include_all_info or include_meta_columns + include_context = include_all_info or include_context + + # Build data schema (cached) + if self._cached_data_arrow_schema is None: + self._cached_data_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._data_python_schema + ) + ) + + all_schemas = [self._cached_data_arrow_schema] + + # Add context schema if requested + if include_context: + context_schema = pa.schema([pa.field(constants.CONTEXT_KEY, pa.string())]) + all_schemas.append(context_schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + + if include_meta_columns is True: + meta_schema = self._cached_meta_arrow_schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._cached_meta_arrow_schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_meta_columns=False, include_context=False), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. Format Conversions (Export) + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + include_context = include_all_info or include_context + include_meta_columns = include_all_info or include_meta_columns + + result_dict = dict(self._data) # Start with user data + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta columns if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + # Include all meta columns + result_dict.update(self._meta_data) + elif isinstance(include_meta_columns, Collection): + # Include only meta columns matching prefixes + filtered_meta_data = { + k: v + for k, v in self._meta_data.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + result_dict.update(filtered_meta_data) + + return result_dict + + def _get_meta_arrow_table(self) -> pa.Table: + if self._cached_meta_table is None: + arrow_schema = self._get_meta_arrow_schema() + self._cached_meta_table = pa.Table.from_pylist( + [self._meta_data], + schema=arrow_schema, + ) + assert self._cached_meta_table is not None, ( + "Meta Arrow table should be initialized by now" + ) + return self._cached_meta_table + + def _get_meta_arrow_schema(self) -> pa.Schema: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + return self._cached_meta_arrow_schema + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + include_context = include_all_info or include_context + include_meta_columns = include_all_info or include_meta_columns + + # Build data table (cached) + if self._cached_data_table is None: + self._cached_data_table = self.semantic_converter.from_python_to_arrow( + self._data, + self._data_python_schema, + ) + assert self._cached_data_table is not None, ( + "Data Arrow table should be initialized by now" + ) + result_table = self._cached_data_table + + # Add context if requested + if include_context: + result_table = result_table.append_column( + constants.CONTEXT_KEY, + pa.array([self._data_context.context_key], type=pa.large_string()), + ) + + # Add meta columns if requested + meta_table = None + if include_meta_columns and self._meta_data: + meta_table = self._get_meta_arrow_table() + # Select appropriate meta columns + if isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + matched_cols = [ + col + for col in self._meta_data.keys() + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + if matched_cols: + meta_table = meta_table.select(matched_cols) + else: + meta_table = None + + # Combine tables if we have meta columns to add + if meta_table is not None: + result_table = arrow_utils.hstack_tables(result_table, meta_table) + + return result_table + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get meta column value with optional default. + + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). + default: Value to return if meta column doesn't exist. + + Returns: + Meta column value if exists, otherwise the default value. + """ + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + return self._meta_data.get(key, default) + + def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": + """ + Create a new DictDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New DictDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + new_meta_data = dict(self._meta_data) + new_meta_data.update(prefixed_updates) + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns( + self, *keys: str, ignore_missing: bool = False + ) -> "DictDatagram": + """ + Create a new DictDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. + + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + ignore_missing: If True, ignore missing meta columns without raising an error. 
+ + Raises: + KeyError: If any specified meta column to drop doesn't exist and ignore_missing=False. + + Returns: + New DictDatagram instance without specified meta columns + """ + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_data.keys()) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter out specified meta columns + new_meta_data = { + k: v for k, v in self._meta_data.items() if k not in prefixed_keys + } + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Filtered meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> "DictDatagram": + """ + Create a new DictDatagram with only specified data columns. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to keep + + Returns: + New DictDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data.keys()) + if missing_cols: + raise KeyError(f"Columns not found: {missing_cols}") + + # Keep only specified data columns + new_data = {k: v for k, v in self._data.items() if k in column_names} + + # Reconstruct full data dict for new instance + full_data = new_data # Selected user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagram": + """ + Create a new DictDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New DictDatagram instance without specified data columns + """ + # Filter out specified data columns + missing = set(column_names) - set(self._data.keys()) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + new_data = {k: v for k, v in self._data.items() if k not in column_names} + + if not new_data: + raise ValueError("Cannot drop all data columns") + + # Reconstruct full data dict for new instance + full_data = new_data # Filtered user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": + """ + Create a new DictDatagram with data columns renamed. + Maintains immutability by returning a new instance. 
+
+        Args:
+            column_mapping: Mapping from old column names to new column names
+
+        Returns:
+            New DictDatagram instance with renamed data columns
+        """
+        # Rename data columns according to mapping, preserving original types
+        new_data = {}
+        for old_name, value in self._data.items():
+            new_name = column_mapping.get(old_name, old_name)
+            new_data[new_name] = value
+
+        # Handle typespec updates for renamed columns
+        new_typespec = None
+        if self._data_python_schema:
+            existing_typespec = dict(self._data_python_schema)
+
+            # Rename types according to column mapping
+            renamed_typespec = {}
+            for old_name, old_type in existing_typespec.items():
+                new_name = column_mapping.get(old_name, old_name)
+                renamed_typespec[new_name] = old_type
+
+            new_typespec = renamed_typespec
+
+        # Reconstruct full data dict for new instance
+        full_data = new_data  # Renamed user data
+        full_data.update(self._meta_data)  # Keep existing meta data
+
+        return DictDatagram(
+            data=full_data,
+            typespec=new_typespec,
+            semantic_converter=self.semantic_converter,
+            data_context=self._data_context,
+        )
+
+    def update(self, **updates: DataValue) -> "DictDatagram":
+        """
+        Create a new DictDatagram with existing column values updated.
+        Maintains immutability by returning a new instance.
+
+        Args:
+            **updates: Column names and their new values (columns must exist)
+
+        Returns:
+            New DictDatagram instance with updated values
+
+        Raises:
+            KeyError: If any column doesn't exist (use with_columns() to add new columns)
+        """
+        if not updates:
+            return self
+
+        # Error if any column doesn't exist
+        missing_columns = set(updates.keys()) - set(self._data.keys())
+        if missing_columns:
+            raise KeyError(
+                f"Columns not found: {sorted(missing_columns)}. "
+                f"Use with_columns() to add new columns."
+            )
+
+        # Update existing columns
+        new_data = dict(self._data)
+        new_data.update(updates)
+
+        # Reconstruct full data dict for new instance
+        full_data = new_data  # Updated user data
+        full_data.update(self._meta_data)  # Keep existing meta data
+
+        return DictDatagram(
+            data=full_data,
+            semantic_converter=self.semantic_converter,  # Keep existing converter
+            data_context=self._data_context,
+        )
+
+    def with_columns(
+        self,
+        column_types: Mapping[str, type] | None = None,
+        **updates: DataValue,
+    ) -> "DictDatagram":
+        """
+        Create a new DictDatagram with new data columns added.
+        Maintains immutability by returning a new instance.
+
+        Args:
+            column_types: Optional mapping of new column names to their types
+            **updates: New data columns given as keyword arguments
+
+        Returns:
+            New DictDatagram instance with new data columns added
+
+        Raises:
+            ValueError: If any column already exists (use update() instead)
+        """
+        # Nothing to add; return the datagram unchanged
+        if not updates:
+            return self
+
+        # Error if any column already exists
+        existing_overlaps = set(updates.keys()) & set(self._data.keys())
+        if existing_overlaps:
+            raise ValueError(
+                f"Columns already exist: {sorted(existing_overlaps)}. "
+                f"Use update() to modify existing columns."
+ ) + + # Update user data with new columns + new_data = dict(self._data) + new_data.update(updates) + + # Create updated typespec - handle None values by defaulting to str + typespec = self.types() + if column_types is not None: + typespec.update(column_types) + + new_typespec = tsutils.get_typespec_from_dict( + new_data, + typespec=typespec, + ) + + # Reconstruct full data dict for new instance + full_data = new_data # Updated user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + typespec=new_typespec, + # semantic converter needs to be rebuilt for new columns + data_context=self._data_context, + ) + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> "DictDatagram": + """ + Create a new DictDatagram with a different data context key. + Maintains immutability by returning a new instance. + + Args: + new_context_key: New data context key string + + Returns: + New DictDatagram instance with new context + """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + return DictDatagram( + data=full_data, + data_context=new_context_key, # New context + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """ + Create a shallow copy of the datagram. + + Returns a new datagram instance with the same data and cached values. + This is more efficient than reconstructing from scratch when you need + an identical datagram instance. + + Returns: + New DictDatagram instance with copied data and caches. + """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + new_datagram = self.__class__( + full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_data_table = self._cached_data_table + new_datagram._cached_meta_table = self._cached_meta_table + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema + new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + """ + return str(self._data) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. 
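+
+        Example (illustrative values):
+            >>> repr(datagram)
+            "DictDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')"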
+        """
+        meta_count = len(self.meta_columns)
+        context_key = self.data_context_key
+
+        return (
+            f"{self.__class__.__name__}("
+            f"data={self._data}, "
+            f"meta_columns={meta_count}, "
+            f"context='{context_key}'"
+            f")"
+        )
diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py
new file mode 100644
index 0000000..92bf6aa
--- /dev/null
+++ b/src/orcapod/data/datagrams/dict_tag_packet.py
@@ -0,0 +1,256 @@
+import logging
+from collections.abc import Collection, Mapping
+from typing import Self
+
+import pyarrow as pa
+
+from orcapod.data.system_constants import orcapod_constants as constants
+from orcapod.data.context import DataContext
+from orcapod.data.datagrams.dict_datagram import DictDatagram
+from orcapod.types import TypeSpec, schemas
+from orcapod.types.core import DataValue
+from orcapod.types.semantic_converter import SemanticConverter
+from orcapod.utils import arrow_utils
+
+logger = logging.getLogger(__name__)
+
+
+class DictTag(DictDatagram):
+    """
+    A simple tag implementation using Python dictionary.
+
+    Represents a tag (metadata) as a dictionary that can be converted
+    to different representations like Arrow tables.
+    """
+
+
+class DictPacket(DictDatagram):
+    """
+    Enhanced packet implementation with source information support.
+
+    Extends DictDatagram to include source information tracking and
+    enhanced table conversion capabilities that can include or exclude
+    source metadata.
+
+    Initialize packet with data and optional source information.
+
+    Args:
+        data: Primary data content
+        source_info: Optional mapping of field names to source information
+        typespec: Optional type specification
+        semantic_converter: Optional semantic converter
+        data_context: Optional data context (key or DataContext) used for
+            semantic type resolution
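+
+    Example (illustrative; the field names and source strings below are
+    placeholders, not values defined by this module):
+        >>> packet = DictPacket(
+        ...     {"file_path": "/data/input.txt"},
+        ...     source_info={"file_path": "manifest:v1"},
+        ... )
+        >>> packet.source_info()
+        {'file_path': 'manifest:v1'}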
+ """ + + def __init__( + self, + data: Mapping[str, DataValue], + source_info: Mapping[str, str | None] | None = None, + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + # normalize the data content and remove any source info keys + data_only = { + k: v for k, v in data.items() if not k.startswith(constants.SOURCE_PREFIX) + } + contained_source_info = { + k.removeprefix(constants.SOURCE_PREFIX): v + for k, v in data.items() + if k.startswith(constants.SOURCE_PREFIX) + } + + super().__init__( + data_only, + typespec=typespec, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._source_info = {**contained_source_info, **(source_info or {})} + self._cached_source_info_table: pa.Table | None = None + self._cached_source_info_schema: pa.Schema | None = None + + @property + def _source_info_schema(self) -> pa.Schema: + if self._cached_source_info_schema is None: + self._cached_source_info_schema = pa.schema( + { + f"{constants.SOURCE_PREFIX}{k}": pa.large_string() + for k in self.keys() + } + ) + return self._cached_source_info_schema + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + table = super().as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + if self._cached_source_info_table is None: + source_info_data = { + f"{constants.SOURCE_PREFIX}{k}": v + for k, v in self.source_info().items() + } + self._cached_source_info_table = pa.Table.from_pylist( + [source_info_data], schema=self._source_info_schema + ) + assert self._cached_source_info_table is not None, ( + "Cached source info table should not be None" + ) + # subselect the corresponding _source_info as the columns present in the data table + source_info_table = self._cached_source_info_table.select( + [ + f"{constants.SOURCE_PREFIX}{k}" + for k in table.column_names + if k in self.keys() + ] + ) + table = arrow_utils.hstack_tables(table, source_info_table) + return table + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation. 
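+
+        When include_source (or include_all_info) is True, source info fields are
+        added under keys carrying the source prefix (e.g. '_source_<key>').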
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + dict_copy = super().as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + for key, value in self.source_info().items(): + dict_copy[f"{constants.SOURCE_PREFIX}{key}"] = value + return dict_copy + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> tuple[str, ...]: + """Return keys of the Python schema.""" + keys = super().keys( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + keys += tuple(f"{constants.SOURCE_PREFIX}{key}" for key in super().keys()) + return keys + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_source: + return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) + return schema + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> DictDatagram: + """ + Convert the packet to a DictDatagram. + + Args: + include_source: Whether to include source info fields + + Returns: + DictDatagram representation of the packet + """ + + data = self.as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_source=include_source, + ) + typespec = self.types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_source=include_source, + ) + return DictDatagram( + data, + typespec=typespec, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. 
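+
+        Keys that have no recorded source information map to None.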
+ + Returns: + Dictionary mapping field names to their source info + """ + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> Self: + """Return a shallow copy of the packet.""" + instance = super().copy() + instance._source_info = self._source_info.copy() + instance._cached_source_info_table = self._cached_source_info_table + return instance diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 09cf09f..58a920f 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -48,6 +48,11 @@ def __init__( def data_context(self) -> DataContext: return self._data_context + @property + def data_context_key(self) -> str: + """Return the data context key.""" + return self._data_context.context_key + @property @abstractmethod def kernel_id(self) -> tuple[str, ...]: ... diff --git a/src/orcapod/data/old_datagrams.py b/src/orcapod/data/old_datagrams.py new file mode 100644 index 0000000..a0386c8 --- /dev/null +++ b/src/orcapod/data/old_datagrams.py @@ -0,0 +1,2281 @@ +""" +Data structures and utilities for working with datagrams in OrcaPod. + +This module provides classes and functions for handling packet-like data structures +that can represent data in various formats (Python dicts, Arrow tables, etc.) while +maintaining type information, source metadata, and semantic type conversion capability. + +Key classes: +- SemanticConverter: Converts between different data representations. Intended for internal use. +- DictDatagram: Immutable dict-based data structure +- PythonDictPacket: Python dict-based packet with source info +- ArrowPacket: Arrow table-based packet implementation +- PythonDictTag/ArrowTag: Tag implementations for data identification + +The module also provides utilities for schema validation, table operations, +and type conversions between semantic stores, Python stores, and Arrow tables. +""" + +from hmac import new +import logging +from abc import ABC, abstractmethod +from collections.abc import Collection, Iterator, Mapping +from types import new_class +from typing import Self, TypeAlias, cast + +from matplotlib.pyplot import arrow +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.context import ( + DataContext, +) +from orcapod.protocols import data_protocols as dp +from orcapod.protocols import hashing_protocols as hp +from orcapod.types import TypeSpec, schemas, typespec_utils +from orcapod.types import typespec_utils as tsutils +from orcapod.types.core import DataValue +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils + +logger = logging.getLogger(__name__) + +# A conveniece packet-like type that defines a value that can be +# converted to a packet. It's broader than Packet and a simple mapping +# from string keys to DataValue (e.g., int, float, str) can be regarded +# as PacketLike, allowing for more flexible interfaces. +# Anything that requires Packet-like data but without the strict features +# of a Packet should accept PacketLike. +# One should be careful when using PacketLike as a return type as it does not +# enforce the typespec or source_info, which are important for packet integrity. +PacketLike: TypeAlias = Mapping[str, DataValue] + +PythonStore: TypeAlias = Mapping[str, DataValue] + + +class ImmutableDict(Mapping[str, DataValue]): + """ + An immutable dictionary-like container for DataValues. 
+ + Provides a read-only view of a dictionary mapping strings to DataValues, + implementing the Mapping protocol for compatibility with dict-like operations. + + Initialize with data from a mapping. + Args: + data: Source mapping to copy data from + """ + + def __init__(self, data: Mapping[str, DataValue]): + self._data = dict(data) + + def __getitem__(self, key: str) -> DataValue: + return self._data[key] + + def __iter__(self): + return iter(self._data) + + def __len__(self) -> int: + return len(self._data) + + def __repr__(self) -> str: + return self._data.__repr__() + + def __str__(self) -> str: + return self._data.__str__() + + def __or__(self, other: Mapping[str, DataValue]) -> Self: + """ + Create a new ImmutableDict by merging with another mapping. + + Args: + other: Another mapping to merge with + + Returns: + A new ImmutableDict containing the combined data + """ + return self.__class__(self._data | dict(other)) + + +def contains_prefix_from(column: str, prefixes: Collection[str]) -> bool: + """ + Check if a column name matches any of the given prefixes. + + Args: + column: Column name to check + prefixes: Collection of prefixes to match against + + Returns: + True if the column starts with any of the prefixes, False otherwise + """ + for prefix in prefixes: + if column.startswith(prefix): + return True + return False + + +class BaseDatagram(ABC): + """ + Abstract base class for immutable datagram implementations. + + Provides shared functionality and enforces consistent interface across + different storage backends (dict, Arrow table, etc.). Concrete subclasses + must implement the abstract methods to handle their specific storage format. + + The base class only manages the data context key string - how that key + is interpreted and used is left to concrete implementations. + """ + + def __init__(self, data_context: DataContext | str | None = None) -> None: + """ + Initialize base datagram with data context. + + Args: + data_context: Context for semantic interpretation. Can be a string key + or a DataContext object, or None for default. + """ + self._data_context = DataContext.resolve_data_context(data_context) + + # 1. Core Properties (Identity & Structure) + @property + def data_context_key(self) -> str: + """Return the data context key.""" + return self._data_context.context_key + + @property + @abstractmethod + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + ... + + # 2. Dict-like Interface (Data Access) + @abstractmethod + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + ... + + @abstractmethod + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + ... + + @abstractmethod + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + ... + + @abstractmethod + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + ... + + # 3. Structural Information + @abstractmethod + def keys( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + ... + + @abstractmethod + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> TypeSpec: + """Return type specification for the datagram.""" + ... 
+ + @abstractmethod + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """Return the PyArrow schema for this datagram.""" + ... + + @abstractmethod + def content_hash(self) -> str: + """Calculate and return content hash of the datagram.""" + ... + + # 4. Format Conversions (Export) + @abstractmethod + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """Return dictionary representation of the datagram.""" + ... + + @abstractmethod + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """Convert the datagram to an Arrow table.""" + ... + + # 5. Meta Column Operations + @abstractmethod + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """Get a meta column value.""" + ... + + @abstractmethod + def with_meta_columns(self, **updates: DataValue) -> Self: + """Create new datagram with updated meta columns.""" + ... + + @abstractmethod + def drop_meta_columns(self, *keys: str) -> Self: + """Create new datagram with specified meta columns removed.""" + ... + + # 6. Data Column Operations + @abstractmethod + def select(self, *column_names: str) -> Self: + """Create new datagram with only specified data columns.""" + ... + + @abstractmethod + def drop(self, *column_names: str) -> Self: + """Create new datagram with specified data columns removed.""" + ... + + @abstractmethod + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """Create new datagram with data columns renamed.""" + ... + + @abstractmethod + def update(self, **updates: DataValue) -> Self: + """Create new datagram with existing column values updated.""" + ... + + @abstractmethod + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """Create new datagram with additional data columns.""" + ... + + # 7. Context Operations + @abstractmethod + def with_context_key(self, new_context_key: str) -> Self: + """Create new datagram with different data context.""" + ... + + # 8. Utility Operations + @abstractmethod + def copy(self) -> Self: + """Create a shallow copy of the datagram.""" + ... + + +class DictDatagram(BaseDatagram): + """ + Immutable datagram implementation using dictionary as storage backend. + + This implementation uses composition (not inheritance from Mapping) to maintain + control over the interface while leveraging dictionary efficiency for data access. + Provides clean separation between data, meta, and context components. + + The underlying data is split into separate components: + - Data dict: Primary business data columns + - Meta dict: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> data = {{ + ... "user_id": 123, + ... "name": "Alice", + ... "__pipeline_version": "v2.1.0", + ... "{orcapod.CONTEXT_KEY}": "financial_v1" + ... 
}} + >>> datagram = DictDatagram(data) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + data: Mapping[str, DataValue], + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize DictDatagram from dictionary data. + + Args: + data: Source data mapping containing all column data. + typespec: Optional type specification for fields. + semantic_converter: Optional converter for semantic type handling. + If None, will be created based on data context and inferred types. + data_context: Data context for semantic type resolution. + If None and data contains context column, will extract from data. + + Note: + The input data is automatically split into data, meta, and context + components based on column naming conventions. + """ + # Parse through data and extract different column types + data_columns = {} + meta_columns = {} + extracted_context = None + + for k, v in data.items(): + if k == constants.CONTEXT_KEY: + # Extract data context but keep it separate from meta data + if data_context is None: + extracted_context = v + # Don't store context in meta_data - it's managed separately + elif k.startswith(constants.META_PREFIX): + # Double underscore = meta metadata + meta_columns[k] = v + else: + # Everything else = user data (including _source_ and semantic types) + data_columns[k] = v + + # Initialize base class with data context + final_context = data_context or cast(str, extracted_context) + super().__init__(final_context) + + # Store data and meta components separately (immutable) + self._data = dict(data_columns) + self._meta_data = dict(meta_columns) + + # Combine provided typespec info with inferred typespec from content + # If the column value is None and no type spec is provided, defaults to str. + self._data_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._data, + typespec, + ) + ) + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + self._data_python_schema.to_semantic_schema( + semantic_type_registry=self._data_context.semantic_type_registry + ), + ) + self.semantic_converter = semantic_converter + + # Create schema for meta data + self._meta_python_schema = schemas.PythonSchema( + tsutils.get_typespec_from_dict( + self._meta_data, + typespec=typespec, + ) + ) + + # Initialize caches + self._cached_data_table: pa.Table | None = None + self._cached_meta_table: pa.Table | None = None + self._cached_content_hash: str | None = None + self._cached_data_arrow_schema: pa.Schema | None = None + self._cached_meta_arrow_schema: pa.Schema | None = None + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + return tuple(self._meta_data.keys()) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data: + raise KeyError(f"Data column '{key}' not found") + return self._data[key] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + return self._data.get(key, default) + + # 3. 
Structural Information + def keys( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + # Start with data columns + result_keys = list(self._data.keys()) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. + - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + # Start with data schema + schema = dict(self._data_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + schema.update(self._meta_python_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in self._meta_python_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + # Build data schema (cached) + if self._cached_data_arrow_schema is None: + self._cached_data_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._data_python_schema + ) + ) + + all_schemas = [self._cached_data_arrow_schema] + + # Add context schema if requested + if include_context: + context_schema = pa.schema([pa.field(constants.CONTEXT_KEY, pa.string())]) + all_schemas.append(context_schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_data: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + + if include_meta_columns is True: + meta_schema = self._cached_meta_arrow_schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._cached_meta_arrow_schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_meta_columns=False, include_context=False), + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. Format Conversions (Export) + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + result_dict = dict(self._data) # Start with user data + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta columns if requested + if include_meta_columns and self._meta_data: + if include_meta_columns is True: + # Include all meta columns + result_dict.update(self._meta_data) + elif isinstance(include_meta_columns, Collection): + # Include only meta columns matching prefixes + filtered_meta_data = { + k: v + for k, v in self._meta_data.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + result_dict.update(filtered_meta_data) + + return result_dict + + def _get_meta_arrow_table(self) -> pa.Table: + if self._cached_meta_table is None: + arrow_schema = self._get_meta_arrow_schema() + self._cached_meta_table = pa.Table.from_pylist( + [self._meta_data], + schema=arrow_schema, + ) + assert self._cached_meta_table is not None, ( + "Meta Arrow table should be initialized by now" + ) + return self._cached_meta_table + + def _get_meta_arrow_schema(self) -> pa.Schema: + if self._cached_meta_arrow_schema is None: + self._cached_meta_arrow_schema = ( + self.semantic_converter.from_python_to_arrow_schema( + self._meta_python_schema + ) + ) + assert self._cached_meta_arrow_schema is not None, ( + "Meta Arrow schema should be initialized by now" + ) + return self._cached_meta_arrow_schema + + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + # Build data table (cached) + if self._cached_data_table is None: + self._cached_data_table = self.semantic_converter.from_python_to_arrow( + self._data, + self._data_python_schema, + ) + assert self._cached_data_table is not None, ( + "Data Arrow table should be initialized by now" + ) + result_table = self._cached_data_table + + # Add context if requested + if include_context: + result_table = result_table.append_column( + constants.CONTEXT_KEY, + pa.array([self._data_context.context_key], type=pa.large_string()), + ) + + # Add meta columns if requested + meta_table = None + if include_meta_columns and self._meta_data: + meta_table = self._get_meta_arrow_table() + # Select appropriate meta columns + if isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + matched_cols = [ + col + for col in self._meta_data.keys() + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + if matched_cols: + meta_table = meta_table.select(matched_cols) + else: + meta_table = None + + # Combine tables if we have meta columns to add + if meta_table is not None: + result_table = arrow_utils.hstack_tables(result_table, meta_table) + + return result_table + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get meta column value with optional default. 
+ + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). + default: Value to return if meta column doesn't exist. + + Returns: + Meta column value if exists, otherwise the default value. + """ + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + return self._meta_data.get(key, default) + + def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": + """ + Create a new DictDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New DictDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + new_meta_data = dict(self._meta_data) + new_meta_data.update(prefixed_updates) + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns( + self, *keys: str, ignore_missing: bool = False + ) -> "DictDatagram": + """ + Create a new DictDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. + + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + ignore_missing: If True, ignore missing meta columns without raising an error. + + Raises: + KeyError: If any specified meta column to drop doesn't exist and ignore_missing=False. + + Returns: + New DictDatagram instance without specified meta columns + """ + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_data.keys()) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter out specified meta columns + new_meta_data = { + k: v for k, v in self._meta_data.items() if k not in prefixed_keys + } + + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(new_meta_data) # Filtered meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> "DictDatagram": + """ + Create a new DictDatagram with only specified data columns. + Maintains immutability by returning a new instance. 
+ + Args: + *column_names: Data column names to keep + + Returns: + New DictDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data.keys()) + if missing_cols: + raise KeyError(f"Columns not found: {missing_cols}") + + # Keep only specified data columns + new_data = {k: v for k, v in self._data.items() if k in column_names} + + # Reconstruct full data dict for new instance + full_data = new_data # Selected user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagram": + """ + Create a new DictDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New DictDatagram instance without specified data columns + """ + # Filter out specified data columns + missing = set(column_names) - set(self._data.keys()) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + new_data = {k: v for k, v in self._data.items() if k not in column_names} + + if not new_data: + raise ValueError("Cannot drop all data columns") + + # Reconstruct full data dict for new instance + full_data = new_data # Filtered user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": + """ + Create a new DictDatagram with data columns renamed. + Maintains immutability by returning a new instance. + + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New DictDatagram instance with renamed data columns + """ + # Rename data columns according to mapping, preserving original types + new_data = {} + for old_name, value in self._data.items(): + new_name = column_mapping.get(old_name, old_name) + new_data[new_name] = value + + # Handle typespec updates for renamed columns + new_typespec = None + if self._data_python_schema: + existing_typespec = dict(self._data_python_schema) + + # Rename types according to column mapping + renamed_typespec = {} + for old_name, old_type in existing_typespec.items(): + new_name = column_mapping.get(old_name, old_name) + renamed_typespec[new_name] = old_type + + new_typespec = renamed_typespec + + # Reconstruct full data dict for new instance + full_data = new_data # Renamed user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + typespec=new_typespec, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def update(self, **updates: DataValue) -> "DictDatagram": + """ + Create a new DictDatagram with existing column values updated. + Maintains immutability by returning a new instance. 
+ + Args: + **updates: Column names and their new values (columns must exist) + + Returns: + New DictDatagram instance with updated values + + Raises: + KeyError: If any column doesn't exist (use with_columns() to add new columns) + """ + if not updates: + return self + + # Error if any column doesn't exist + missing_columns = set(updates.keys()) - set(self._data.keys()) + if missing_columns: + raise KeyError( + f"Columns not found: {sorted(missing_columns)}. " + f"Use with_columns() to add new columns." + ) + + # Update existing columns + new_data = dict(self._data) + new_data.update(updates) + + # Reconstruct full data dict for new instance + full_data = new_data # Updated user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + semantic_converter=self.semantic_converter, # Keep existing converter + data_context=self._data_context, + ) + + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> "DictDatagram": + """ + Create a new DictDatagram with new data columns added. + Maintains immutability by returning a new instance. + + Args: + column_updates: New data columns as a mapping + column_types: Optional type specifications for new columns + **kwargs: New data columns as keyword arguments + + Returns: + New DictDatagram instance with new data columns added + + Raises: + ValueError: If any column already exists (use update() instead) + """ + # Combine explicit updates with kwargs + + if not updates: + return self + + # Error if any column already exists + existing_overlaps = set(updates.keys()) & set(self._data.keys()) + if existing_overlaps: + raise ValueError( + f"Columns already exist: {sorted(existing_overlaps)}. " + f"Use update() to modify existing columns." + ) + + # Update user data with new columns + new_data = dict(self._data) + new_data.update(updates) + + # Create updated typespec - handle None values by defaulting to str + typespec = self.types() + if column_types is not None: + typespec.update(column_types) + + new_typespec = tsutils.get_typespec_from_dict( + new_data, + typespec=typespec, + ) + + # Reconstruct full data dict for new instance + full_data = new_data # Updated user data + full_data.update(self._meta_data) # Keep existing meta data + + return DictDatagram( + data=full_data, + typespec=new_typespec, + # semantic converter needs to be rebuilt for new columns + data_context=self._data_context, + ) + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> "DictDatagram": + """ + Create a new DictDatagram with a different data context key. + Maintains immutability by returning a new instance. + + Args: + new_context_key: New data context key string + + Returns: + New DictDatagram instance with new context + """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + return DictDatagram( + data=full_data, + data_context=new_context_key, # New context + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """ + Create a shallow copy of the datagram. + + Returns a new datagram instance with the same data and cached values. + This is more efficient than reconstructing from scratch when you need + an identical datagram instance. + + Returns: + New DictDatagram instance with copied data and caches. 
+ """ + # Reconstruct full data dict for new instance + full_data = dict(self._data) # User data + full_data.update(self._meta_data) # Meta data + + new_datagram = self.__class__( + full_data, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_data_table = self._cached_data_table + new_datagram._cached_meta_table = self._cached_meta_table + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema + new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + """ + return str(self._data) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. + """ + meta_count = len(self.meta_columns) + context_key = self.data_context_key + + return ( + f"DictDatagram(" + f"data={self._data}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) + + +class ArrowDatagram(BaseDatagram): + """ + Immutable datagram implementation using PyArrow Table as storage backend. + + This implementation provides high-performance columnar data operations while + maintaining the datagram interface. It efficiently handles type conversions, + semantic processing, and interoperability with Arrow-based tools. + + The underlying table is split into separate components: + - Data table: Primary business data columns + - Meta table: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes + - Context table: Data context information with {orcapod.CONTEXT_KEY} + + Future Packet subclass will also handle: + - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes + + When exposing to external tools, semantic types are encoded as + `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). + + All operations return new instances, preserving immutability. + + Example: + >>> table = pa.Table.from_pydict({ + ... "user_id": [123], + ... "name": ["Alice"], + ... "__pipeline_version": ["v2.1.0"], + ... "{orcapod.CONTEXT_KEY}": ["financial_v1"] + ... }) + >>> datagram = ArrowDatagram(table) + >>> updated = datagram.update(name="Alice Smith") + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + """ + Initialize ArrowDatagram from PyArrow Table. + + Args: + table: PyArrow Table containing the data. Must have exactly one row. + semantic_converter: Optional converter for semantic type handling. + If None, will be created based on the data context and table schema. + data_context: Context key string or DataContext object. + If None and table contains context column, will extract from table. + + Raises: + ValueError: If table doesn't contain exactly one row. + + Note: + The input table is automatically split into data, meta, and context + components based on column naming conventions. 
+ """ + # Validate table has exactly one row for datagram + if len(table) != 1: + raise ValueError( + "Table must contain exactly one row to be a valid datagram." + ) + + # Split table into data, meta, and context components + context_columns = [constants.CONTEXT_KEY] + meta_columns = [ + col for col in table.column_names if col.startswith(constants.META_PREFIX) + ] + + # Extract context table if present + if constants.CONTEXT_KEY in table.column_names and data_context is None: + context_table = table.select([constants.CONTEXT_KEY]) + data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] + + # Initialize base class with data context + super().__init__(data_context) + + # Split table into components + self._data_table = table.drop(context_columns + meta_columns) + self._meta_table = table.select(meta_columns) if meta_columns else None + if len(self._data_table.column_names) == 0: + raise ValueError("Data table must contain at least one data column.") + + # Create semantic converter + if semantic_converter is None: + semantic_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_arrow_schema( + self._data_table.schema, + self._data_context.semantic_type_registry, + ) + ) + self._semantic_converter = semantic_converter + + # Create data context table + data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) + self._data_context_table = pa.Table.from_pylist( + [{constants.CONTEXT_KEY: self._data_context.context_key}], + schema=data_context_schema, + ) + + # Initialize caches + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_dict: dict[str, DataValue] | None = None + self._cached_meta_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + # 1. Core Properties (Identity & Structure) + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names.""" + if self._meta_table is None: + return () + return tuple(self._meta_table.column_names) + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """Get data column value by key.""" + if key not in self._data_table.column_names: + raise KeyError(f"Data column '{key}' not found") + + return self._data_table[key].to_pylist()[0] + + def __contains__(self, key: str) -> bool: + """Check if data column exists.""" + return key in self._data_table.column_names + + def __iter__(self) -> Iterator[str]: + """Iterate over data column names.""" + return iter(self._data_table.column_names) + + def get(self, key: str, default: DataValue = None) -> DataValue: + """Get data column value with default.""" + if key in self._data_table.column_names: + return self.as_dict()[key] + return default + + # 3. 
Structural Information + def keys( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """Return tuple of column names.""" + # Start with data columns + result_keys = list(self._data_table.column_names) + + # Add context if requested + if include_context: + result_keys.append(constants.CONTEXT_KEY) + + # Add meta columns if requested + if include_meta_columns: + if include_meta_columns is True: + result_keys.extend(self.meta_columns) + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + filtered_meta_cols = [ + col + for col in self.meta_columns + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + result_keys.extend(filtered_meta_cols) + + return tuple(result_keys) + + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> schemas.PythonSchema: + """ + Return Python schema for the datagram. + + Args: + include_meta_columns: Whether to include meta column types. + - True: include all meta column types + - Collection[str]: include meta column types matching these prefixes + - False: exclude meta column types + include_context: Whether to include context type + + Returns: + Python schema + """ + # Get data schema (cached) + if self._cached_python_schema is None: + self._cached_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._data_table.schema + ) + ) + + schema = dict(self._cached_python_schema) + + # Add context if requested + if include_context: + schema[constants.CONTEXT_KEY] = str + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if self._cached_meta_python_schema is None: + self._cached_meta_python_schema = ( + self._semantic_converter.from_arrow_to_python_schema( + self._meta_table.schema + ) + ) + meta_schema = dict(self._cached_meta_python_schema) + if include_meta_columns is True: + schema.update(meta_schema) + elif isinstance(include_meta_columns, Collection): + filtered_meta_schema = { + k: v + for k, v in meta_schema.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + schema.update(filtered_meta_schema) + + return schemas.PythonSchema(schema) + + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_meta_columns: Whether to include meta columns in the schema. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context column in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + all_schemas = [self._data_table.schema] + + # Add context schema if requested + if include_context: + # TODO: reassess the efficiency of this approach + all_schemas.append(self._data_context_table.schema) + + # Add meta schema if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_schema = self._meta_table.schema + elif isinstance(include_meta_columns, Collection): + # Filter meta schema by prefix matching + matched_fields = [ + field + for field in self._meta_table.schema + if any( + field.name.startswith(prefix) for prefix in include_meta_columns + ) + ] + if matched_fields: + meta_schema = pa.schema(matched_fields) + else: + meta_schema = None + else: + meta_schema = None + + if meta_schema is not None: + all_schemas.append(meta_schema) + + return arrow_utils.join_arrow_schemas(*all_schemas) + + def content_hash(self) -> str: + """ + Calculate and return content hash of the datagram. + Only includes data columns, not meta columns or context. + + Returns: + Hash string of the datagram content + """ + if self._cached_content_hash is None: + self._cached_content_hash = self._data_context.arrow_hasher.hash_table( + self._data_table, + prefix_hasher_id=True, + ) + return self._cached_content_hash + + # 4. Format Conversions (Export) + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation of the datagram. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation + """ + # Get data dict (cached) + if self._cached_python_dict is None: + self._cached_python_dict = self._semantic_converter.from_arrow_to_python( + self._data_table + )[0] + + result_dict = dict(self._cached_python_dict) + + # Add context if requested + if include_context: + result_dict[constants.CONTEXT_KEY] = self._data_context.context_key + + # Add meta data if requested + if include_meta_columns and self._meta_table is not None: + if include_meta_columns is True: + meta_dict = self._meta_table.to_pylist()[0] + elif isinstance(include_meta_columns, Collection): + meta_dict = self._meta_table.to_pylist()[0] + # Include only meta columns matching prefixes + meta_dict = { + k: v + for k, v in meta_dict.items() + if any(k.startswith(prefix) for prefix in include_meta_columns) + } + if meta_dict is not None: + result_dict.update(meta_dict) + + return result_dict + + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> pa.Table: + """ + Convert the datagram to an Arrow table. + + Args: + include_meta_columns: Whether to include meta columns. 
+ - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include the context column + + Returns: + Arrow table representation + """ + all_tables = [self._data_table] + + # Add context if requested + if include_context: + all_tables.append(self._data_context_table) + + # Add meta columns if requested + if include_meta_columns and self._meta_table is not None: + meta_table = None + if include_meta_columns is True: + meta_table = self._meta_table + elif isinstance(include_meta_columns, Collection): + # Filter meta columns by prefix matching + matched_cols = [ + col + for col in self._meta_table.column_names + if any(col.startswith(prefix) for prefix in include_meta_columns) + ] + if matched_cols: + meta_table = self._meta_table.select(matched_cols) + else: + meta_table = None + + if meta_table is not None: + all_tables.append(meta_table) + + return arrow_utils.hstack_tables(*all_tables) + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get a meta column value. + + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix) + default: Default value if not found + + Returns: + Meta column value + """ + if self._meta_table is None: + return default + + # Handle both prefixed and unprefixed keys + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + + if key not in self._meta_table.column_names: + return default + + return self._meta_table[key].to_pylist()[0] + + def with_meta_columns(self, **meta_updates: DataValue) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with updated meta columns. + Maintains immutability by returning a new instance. + + Args: + **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) + + Returns: + New ArrowDatagram instance + """ + # Prefix the keys and prepare updates + prefixed_updates = {} + for k, v in meta_updates.items(): + if not k.startswith(constants.META_PREFIX): + k = constants.META_PREFIX + k + prefixed_updates[k] = v + + # Start with existing meta data + meta_dict = {} + if self._meta_table is not None: + meta_dict = self._meta_table.to_pylist()[0] + + # Apply updates + meta_dict.update(prefixed_updates) + + # Create new meta table + new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None + + # Combine all tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop_meta_columns( + self, *keys: str, ignore_missing: bool = True + ) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with specified meta columns dropped. + Maintains immutability by returning a new instance. 
+ + Args: + *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) + + Returns: + New ArrowDatagram instance without specified meta columns + """ + if self._meta_table is None: + return self # No meta columns to drop + + # Normalize keys to have prefixes + prefixed_keys = set() + for key in keys: + if not key.startswith(constants.META_PREFIX): + key = constants.META_PREFIX + key + prefixed_keys.add(key) + + missing_keys = prefixed_keys - set(self._meta_table.column_names) + if missing_keys and not ignore_missing: + raise KeyError( + f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" + ) + + # Filter meta columns + remaining_cols = [ + col for col in self._meta_table.column_names if col not in prefixed_keys + ] + + # Create new meta table + new_meta_table = ( + self._meta_table.select(remaining_cols) if remaining_cols else None + ) + + # Combine tables for reconstruction + combined_table = self._data_table + if new_meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # 6. Data Column Operations + def select(self, *column_names: str) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with only specified data columns. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to keep + + Returns: + New ArrowDatagram instance with only specified data columns + """ + # Validate columns exist + missing_cols = set(column_names) - set(self._data_table.column_names) + if missing_cols: + raise ValueError(f"Columns not found: {missing_cols}") + + new_data_table = self._data_table.select(list(column_names)) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def drop(self, *column_names: str, ignore_missing: bool = False) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with specified data columns dropped. + Maintains immutability by returning a new instance. + + Args: + *column_names: Data column names to drop + + Returns: + New ArrowDatagram instance without specified data columns + """ + + # Filter out specified data columns + missing = set(column_names) - set(self._data_table.column_names) + if missing and not ignore_missing: + raise KeyError( + f"Following columns do not exist and cannot be dropped: {sorted(missing)}" + ) + + # Filter data columns + remaining_cols = [ + col for col in self._data_table.column_names if col not in column_names + ] + + if not remaining_cols: + raise ValueError("Cannot drop all data columns") + + new_data_table = self._data_table.select(remaining_cols) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def rename(self, column_mapping: Mapping[str, str]) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with data columns renamed. + Maintains immutability by returning a new instance. 
+ + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New ArrowDatagram instance with renamed data columns + """ + # Create new schema with renamed fields, preserving original types + new_fields = [] + for field in self._data_table.schema: + old_name = field.name + new_name = column_mapping.get(old_name, old_name) + new_field = pa.field(new_name, field.type) + new_fields.append(new_field) + + # Create new data table with renamed columns + new_schema = pa.schema(new_fields) + new_data_table = self._data_table.rename_columns( + [column_mapping.get(name, name) for name in self._data_table.column_names] + ).cast(new_schema) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def update(self, **updates: DataValue) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with specific column values updated. + + Args: + **updates: Column names and their new values + + Returns: + New ArrowDatagram instance with updated values + + Raises: + KeyError: If any specified column doesn't exist + + Example: + # Convert relative path to absolute path + updated = datagram.update(file_path="/absolute/path/to/file.txt") + + # Update multiple values + updated = datagram.update(status="processed", file_path="/new/path") + """ + # Only update if there are columns to update + if not updates: + return self + + # Validate all columns exist + missing_cols = set(updates.keys()) - set(self._data_table.column_names) + if missing_cols: + raise KeyError( + f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}" + ) + + updates_typespec = schemas.PythonSchema( + {k: v for k, v in self.types().items() if k in updates} + ) + + update_table = self._semantic_converter.from_python_to_arrow( + updates, updates_typespec + ) + all_tables = [self._data_table.drop(list(updates.keys())), update_table] + + if self._meta_table is not None: + all_tables.append(self._meta_table) + + return ArrowDatagram( + table=arrow_utils.hstack_tables(*all_tables), + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with new data columns added. + Maintains immutability by returning a new instance. + + Args: + column_updates: New data columns as a mapping + column_types: Optional type specifications for new columns + **kwargs: New data columns as keyword arguments + + Returns: + New ArrowDatagram instance with new data columns added + + Raises: + ValueError: If any column already exists (use update() instead) + """ + # Combine explicit updates with kwargs + + if not updates: + return self + + # Error if any column already exists + existing_overlaps = set(updates.keys()) & set(self._data_table.column_names) + if existing_overlaps: + raise ValueError( + f"Columns already exist: {sorted(existing_overlaps)}. " + f"Use update() to modify existing columns." 
+ ) + + # TODO: consider simplifying this conversion logic + typespec = typespec_utils.get_typespec_from_dict(updates, column_types) + + updates_converter = SemanticConverter.from_semantic_schema( + schemas.SemanticSchema.from_typespec( + typespec, self._data_context.semantic_type_registry + ) + ) + # TODO: cleanup the handling of typespec python schema and various conversion points + new_data_table = updates_converter.from_python_to_arrow(updates, typespec) + + # Combine with meta table for reconstruction + combined_table = new_data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + # prepare the joined converter + total_converter = self._semantic_converter.join(updates_converter) + + return ArrowDatagram( + table=combined_table, + semantic_converter=total_converter, + data_context=self._data_context, + ) + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> "ArrowDatagram": + """ + Create a new ArrowDatagram with a different data context key. + Maintains immutability by returning a new instance. + + Args: + new_context_key: New data context key string + + Returns: + New ArrowDatagram instance with new context + """ + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + return ArrowDatagram( + table=combined_table, + data_context=new_context_key, + # Note: semantic_converter will be rebuilt for new context + ) + + # 8. Utility Operations + def copy(self) -> Self: + """Return a copy of the datagram.""" + # Combine all tables for reconstruction + combined_table = self._data_table + if self._meta_table is not None: + combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) + + new_datagram = self.__class__( + combined_table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + # Copy caches + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_content_hash = self._cached_content_hash + + return new_datagram + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + + Example: + >>> str(datagram) + "{'user_id': 123, 'name': 'Alice'}" + >>> print(datagram) + {'user_id': 123, 'name': 'Alice'} + """ + return str(self.as_dict()) + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information including + data columns, meta columns count, and context for debugging purposes. + + Returns: + Detailed representation with type and metadata information. + + Example: + >>> repr(datagram) + "ArrowDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')" + """ + data_dict = self.as_dict() + meta_count = len(self.meta_columns) + context_key = self.data_context_key + + return ( + f"ArrowDatagram(" + f"data={data_dict}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) + + +class DictTag(DictDatagram): + """ + A simple tag implementation using Python dictionary. 
+ + Represents a tag (metadata) as a dictionary that can be converted + to different representations like Arrow tables. + """ + + +class DictPacket(DictDatagram): + """ + Enhanced packet implementation with source information support. + + Extends DictDatagram to include source information tracking and + enhanced table conversion capabilities that can include or exclude + source metadata. + + Initialize packet with data and optional source information. + + Args: + data: Primary data content + source_info: Optional mapping of field names to source information + typespec: Optional type specification + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types. Defaults to system default registry. + arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. + """ + + def __init__( + self, + data: Mapping[str, DataValue], + source_info: Mapping[str, str | None] | None = None, + typespec: TypeSpec | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + # normalize the data content and remove any source info keys + data_only = { + k: v for k, v in data.items() if not k.startswith(constants.SOURCE_PREFIX) + } + contained_source_info = { + k.removeprefix(constants.SOURCE_PREFIX): v + for k, v in data.items() + if k.startswith(constants.SOURCE_PREFIX) + } + + super().__init__( + data_only, + typespec=typespec, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._source_info = {**contained_source_info, **(source_info or {})} + self._cached_source_info_table: pa.Table | None = None + self._cached_source_info_schema: pa.Schema | None = None + + @property + def _source_info_schema(self) -> pa.Schema: + if self._cached_source_info_schema is None: + self._cached_source_info_schema = pa.schema( + { + f"{constants.SOURCE_PREFIX}{k}": pa.large_string() + for k in self.keys() + } + ) + return self._cached_source_info_schema + + def as_table( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + table = super().as_table( + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_source: + if self._cached_source_info_table is None: + source_info_data = { + f"{constants.SOURCE_PREFIX}{k}": v + for k, v in self.source_info().items() + } + self._cached_source_info_table = pa.Table.from_pylist( + [source_info_data], schema=self._source_info_schema + ) + assert self._cached_source_info_table is not None, ( + "Cached source info table should not be None" + ) + # subselect the corresponding _source_info as the columns present in the data table + source_info_table = self._cached_source_info_table.select( + [ + f"{constants.SOURCE_PREFIX}{k}" + for k in table.column_names + if k in self.keys() + ] + ) + table = arrow_utils.hstack_tables(table, source_info_table) + return table + + def as_dict( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + dict_copy = super().as_dict( + include_meta_columns=include_meta_columns, include_context=include_context + ) + if include_source: + for key, value in self.source_info().items(): + dict_copy[f"{constants.SOURCE_PREFIX}{key}"] = value + return dict_copy + + def types( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types( + include_meta_columns=include_meta_columns, include_context=include_context + ) + if include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_meta_columns=include_meta_columns, include_context=include_context + ) + if include_source: + return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) + return schema + + def as_datagram( + self, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> DictDatagram: + """ + Convert the packet to a DictDatagram. + + Args: + include_source: Whether to include source info fields + + Returns: + DictDatagram representation of the packet + """ + data = self.as_dict( + include_meta_columns=include_meta_columns, include_source=include_source + ) + typespec = self.types(include_source=include_source) + return DictDatagram( + data, + typespec=typespec, + semantic_converter=self.semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Dictionary mapping field names to their source info + """ + return {key: self._source_info.get(key, None) for key in self.keys()} + + def copy(self) -> Self: + """Return a shallow copy of the packet.""" + instance = super().copy() + instance._source_info = self._source_info.copy() + instance._cached_source_info_table = self._cached_source_info_table + return instance + + +class ArrowTag(ArrowDatagram): + """ + A tag implementation using Arrow table backend. + + Represents a single-row Arrow table that can be converted to Python + dictionary representation while caching computed values for efficiency. + + Initialize with an Arrow table. + + Args: + table: Single-row Arrow table representing the tag + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + table: pa.Table, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + if len(table) != 1: + raise ValueError( + "ArrowTag should only contain a single row, " + "as it represents a single tag." + ) + super().__init__( + table=table, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + +class ArrowPacket(ArrowDatagram): + """ + Arrow table-based packet implementation with comprehensive features. 
+ + A packet implementation that uses Arrow tables as the primary storage format, + providing efficient memory usage and columnar data operations while supporting + source information tracking and content hashing. + + + Initialize ArrowPacket with Arrow table and configuration. + + Args: + table: Single-row Arrow table representing the packet + source_info: Optional source information mapping + semantic_converter: Optional semantic converter + semantic_type_registry: Registry for semantic types + finger_print: Optional fingerprint for tracking + arrow_hasher: Optional Arrow hasher + post_hash_callback: Optional callback after hash calculation + skip_source_info_extraction: Whether to skip source info processing + + Raises: + ValueError: If table doesn't contain exactly one row + """ + + def __init__( + self, + data: pa.Table, + source_info: dict[str, str | None] | None = None, + semantic_converter: SemanticConverter | None = None, + data_context: str | DataContext | None = None, + ) -> None: + if len(data) != 1: + raise ValueError( + "ArrowPacket should only contain a single row, " + "as it represents a single packet." + ) + if source_info is None: + source_info = {} + + # normalize the table to ensure it has the expected source_info columns + data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( + data, + {constants.SOURCE_PREFIX: source_info}, + exclude_columns=[constants.CONTEXT_KEY], + ) + self._source_info_table = prefixed_tables[constants.SOURCE_INFO_PREFIX] + + super().__init__( + data_table, + semantic_converter=semantic_converter, + data_context=data_context, + ) + + self._cached_source_info: dict[str, str | None] | None = None + self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_content_hash: str | None = None + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + ) -> pa.Table: + table = super().as_table(include_data_context=include_data_context) + if include_source: + # add source_info only for existing data columns + table = arrow_utils.hstack_tables( + table, + self._source_info_table.select( + [ + f"{constants.SOURCE_INFO_PREFIX}{c}" + for c in table.column_names + if c in self.keys() + ] + ), + ) + return table + + def types( + self, include_data_context: bool = False, include_source: bool = False + ) -> schemas.PythonSchema: + """Return copy of the Python schema.""" + schema = super().types(include_data_context=include_data_context) + if include_source: + for key in self.keys(): + schema[f"{constants.SOURCE_INFO_PREFIX}{key}"] = str + return schema + + def arrow_schema( + self, include_data_context: bool = False, include_source: bool = False + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema(include_data_context=include_data_context) + if include_source: + return arrow_utils.join_arrow_schemas( + schema, self._source_info_table.schema + ) + return schema + + def as_dict( + self, include_data_context: bool = False, include_source: bool = False + ) -> dict[str, DataValue]: + """ + Convert to dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + return_dict = super().as_dict(include_data_context=include_data_context) + if include_source: + return_dict.update( + { + f"{constants.SOURCE_INFO_PREFIX}{k}": v + for k, v in self.source_info().items() + } + ) + return return_dict + + def as_datagram(self, include_source: bool = False) -> ArrowDatagram: + table = self.as_table(include_source=include_source) + return ArrowDatagram( + table, + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + + def source_info(self) -> dict[str, str | None]: + """ + Return source information for all keys. + + Returns: + Copy of the dictionary mapping field names to their source info + """ + if self._cached_source_info is None: + self._cached_source_info = { + k.removeprefix(constants.SOURCE_INFO_PREFIX): v + for k, v in self._source_info_table.to_pylist()[0].items() + } + return self._cached_source_info.copy() + + def copy(self) -> Self: + # TODO: restructure copy to allow for better inheritance and expansion + new_packet = self.__class__( + self.as_table(), + self.source_info(), + semantic_converter=self._semantic_converter, + data_context=self._data_context, + ) + new_packet._cached_source_info = self._cached_source_info + new_packet._cached_python_dict = self._cached_python_dict + new_packet._cached_python_schema = self._cached_python_schema + new_packet._cached_content_hash = self._cached_content_hash + + return new_packet + + +# a batch is a tuple of a tag and a list of packets +Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] +"""Type alias for a batch: a tuple containing a tag and collection of packets.""" diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index cd06f34..f22b9fe 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -20,6 +20,8 @@ from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter from orcapod.types import typespec_utils as tsutils +from orcapod.utils import arrow_utils +from orcapod.data.system_constants import orcapod_constants as constants import pyarrow as pa logger = logging.getLogger(__name__) @@ -255,7 +257,10 @@ def __init__( @property def kernel_id(self) -> tuple[str, ...]: - return (self.function_name,) + return ( + self.function_name, + self.data_context.object_hasher.hash_to_hex(self), + ) def input_packet_types(self) -> PythonSchema: """ @@ -284,6 +289,8 @@ def __str__(self) -> str: return f"FunctionPod:{func_sig}" def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | None]: + v: dp.Packet = DictPacket({}) + print(v) if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -311,9 +318,12 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" ) - # TODO: add source info based on this function call + output_data = {k: v for k, v in zip(self.output_keys, output_values)} + source_info = {k: ":".join(self.kernel_id + (k,)) for k in output_data} + output_packet = DictPacket( {k: v for k, v in zip(self.output_keys, output_values)}, + source_info=source_info, typespec=self.output_packet_types(), semantic_converter=self._output_semantic_converter, data_context=self._data_context, @@ -365,9 +375,17 @@ def __init__( pod: dp.Pod, 
fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, + data_context: str | DataContext | None = None, **kwargs, ) -> None: - super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) + if data_context is None: + data_context = pod.data_context_key + super().__init__( + fixed_input_streams=fixed_input_streams, + label=label, + data_context=data_context, + **kwargs, + ) self.pod = pod @property @@ -414,32 +432,27 @@ class CachedPod(WrappedPod): This is useful for pods that are expensive to compute and can benefit from caching. """ + # name of the column in the tag store that contains the packet hash + PACKET_HASH_COLUMN = f"{constants.META_PREFIX}packet_hash" + def __init__( self, pod: dp.Pod, result_store: ArrowDataStore, - lineage_store: ArrowDataStore | None, + pipeline_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] = (), **kwargs, ): super().__init__(pod, **kwargs) self.record_path_prefix = record_path_prefix self.result_store = result_store - self.lineage_store = lineage_store + self.pipeline_store = pipeline_store # unset data_context native to the object self.pod_hash = self.data_context.object_hasher.hash_to_hex( self.pod, prefix_hasher_id=True ) - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Return the pod ID, which is the function name of the wrapped pod. - This is used to identify the pod in the system. - """ - return self.pod.kernel_id + (self.pod_hash,) - @property def record_path(self) -> tuple[str, ...]: """ @@ -448,14 +461,65 @@ def record_path(self) -> tuple[str, ...]: """ return self.record_path_prefix + self.kernel_id - def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: + def call( + self, + tag: dp.Tag, + packet: dp.Packet, + skip_recording: bool = False, + ) -> tuple[dp.Tag, dp.Packet | None]: output_packet = self.get_recorded_output_packet(packet) + if output_packet is None: + tag, output_packet = self.pod.call(tag, packet) + if output_packet is not None and not skip_recording: + self.record_packet(packet, output_packet) + if output_packet is not None: - return tag, output_packet - output_tag, output_packet = self.pod.call(tag, packet) - if output_packet is not None: - self.record_packet(packet, output_packet) - return output_tag, output_packet + self.add_pipeline_record(tag, input_packet=packet) + return tag, output_packet + + def add_pipeline_record(self, tag: dp.Tag, input_packet: dp.Packet) -> None: + if self.pipeline_store is None: + # no pipeline store configured, skip recording + return + # combine dp.Tag with packet content hash to compute entry hash + tag_with_hash = tag.as_table().append_column( + self.PACKET_HASH_COLUMN, + pa.array([input_packet.content_hash()], type=pa.large_string()), + ) + entry_id = self.data_context.arrow_hasher.hash_table( + tag_with_hash, prefix_hasher_id=True + ) + + existing_record = self.pipeline_store.get_record_by_id( + self.record_path, + entry_id, + ) + + if existing_record is not None: + # if the record already exists, return it + return + + # no record matching, so construct the full record + + input_packet_info = ( + input_packet.as_table( + include_source=True, + ) + .append_column( + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + .drop(input_packet.keys()) + ) + + combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) + + self.pipeline_store.add_record( + self.record_path, + entry_id, + 
combined_record, + ignore_duplicates=False, + ) def record_packet( self, @@ -466,16 +530,9 @@ def record_packet( """ Record the output packet against the input packet in the result store. """ - data_table = output_packet.as_table( - include_data_context=True, include_source=True - ) + data_table = output_packet.as_table(include_context=True, include_source=True) - data_table = data_table.append_column( - f"_input_packet{DataContext.get_data_context_column()}", - pa.array([input_packet.data_context_key], type=pa.large_string()), - ) - - result_flag = self.result_store.record_data( + result_flag = self.result_store.add_record( self.record_path, input_packet.content_hash(), data_table, @@ -494,14 +551,47 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non Retrieve the output packet from the result store based on the input packet. If the output packet is not found, return None. """ - result_table = self.result_store.get_recorded_data( + result_table = self.result_store.get_record_by_id( self.record_path, input_packet.content_hash() ) if result_table is None: return None - return ArrowPacket( - result_table.drop( - [f"_input_packet{DataContext.get_data_context_column()}"] - ), + return ArrowPacket(result_table) + + def _get_all_records(self) -> "pa.Table | None": + results = self.result_store.get_all_records( + self.record_path, record_id_column=self.PACKET_HASH_COLUMN + ) + + if self.pipeline_store is None: + raise ValueError( + "Pipeline store is not configured, cannot retrieve tag info" + ) + taginfo = self.pipeline_store.get_all_records( + self.record_path, + ) + + if results is None or taginfo is None: + return None + + tag_columns = [ + c + for c in taginfo.column_names + if not c.startswith(constants.META_PREFIX) + and not c.startswith(constants.SOURCE_PREFIX) + ] + + packet_columns = [ + c for c in results.column_names if c != self.PACKET_HASH_COLUMN + ] + + # TODO: do not hardcode the join keys + joined_info = taginfo.join( + results, + self.PACKET_HASH_COLUMN, + join_type="inner", ) + + joined_info = joined_info.select([*tag_columns, *packet_columns]) + return joined_info diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index a5c2434..b8ce85d 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,23 +1,25 @@ -from orcapod.protocols import data_protocols as dp +import logging +import warnings +from abc import ABC, abstractmethod +from collections.abc import Collection, Iterator +from datetime import datetime, timezone +from itertools import repeat +from typing import Any, Literal + +import pyarrow as pa + +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.context import DataContext from orcapod.data.datagrams import ( ArrowPacket, ArrowTag, DictTag, - SemanticConverter, - SOURCE_INFO_PREFIX, ) -from orcapod.utils import arrow_utils -from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec, schemas -import pyarrow as pa -from collections.abc import Iterator, Collection -from abc import ABC, abstractmethod -from datetime import timezone, datetime -from typing import Any, Literal -import logging -import warnings -from itertools import repeat +from orcapod.types.semantic_converter import SemanticConverter +from orcapod.utils import arrow_utils # TODO: consider using this instead of making copy of dicts # from types 
import MappingProxyType @@ -327,18 +329,14 @@ def __init__( super().__init__(source=source, upstreams=upstreams, **kwargs) table, data_context_table = arrow_utils.split_by_column_groups( - table, [DataContext.get_data_context_column()] + table, [constants.CONTEXT_KEY] ) if data_context_table is None: data_context_table = pa.table( - { - DataContext.get_data_context_column(): pa.nulls( - len(table), pa.large_string() - ) - } + {constants.CONTEXT_KEY: pa.nulls(len(table), pa.large_string())} ) - prefix_info = {SOURCE_INFO_PREFIX: source_info} + prefix_info = {constants.SOURCE_PREFIX: source_info} # determine tag columns first and then exclude any source info self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) @@ -350,7 +348,7 @@ def __init__( c for c in table.column_names if c not in tag_columns ) self._table = table - self._source_info_table = prefix_tables[SOURCE_INFO_PREFIX] + self._source_info_table = prefix_tables[constants.SOURCE_PREFIX] self._data_context_table = data_context_table if len(self._packet_columns) == 0: @@ -575,12 +573,12 @@ def as_table( tag_schema = tag.arrow_schema() if packet_schema is None: packet_schema = packet.arrow_schema( - include_data_context=True, + include_context=True, include_source=True, ) all_tags.append(tag.as_dict()) all_packets.append( - packet.as_dict(include_data_context=True, include_source=True) + packet.as_dict(include_context=True, include_source=True) ) all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema) @@ -595,9 +593,9 @@ def as_table( drop_columns = [] if not include_source: - drop_columns.extend(f"{SOURCE_INFO_PREFIX}{c}" for c in self.keys()[1]) + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) if not include_data_context: - drop_columns.append(DataContext.get_data_context_column()) + drop_columns.append(constants.CONTEXT_KEY) output_table = self._cached_output_table.drop(drop_columns) diff --git a/src/orcapod/data/system_constants.py b/src/orcapod/data/system_constants.py new file mode 100644 index 0000000..de1bebc --- /dev/null +++ b/src/orcapod/data/system_constants.py @@ -0,0 +1,25 @@ +# Constants used for source info keys +SYSTEM_COLUMN_PREFIX = "__" +SOURCE_INFO_PREFIX = "_source_" + +DATA_CONTEXT_KEY = "_context_key" + + +class SystemConstant: + def __init__(self, global_prefix: str = ""): + self._global_prefix = global_prefix + + @property + def META_PREFIX(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}" + + @property + def SOURCE_PREFIX(self) -> str: + return f"{self._global_prefix}{SOURCE_INFO_PREFIX}" + + @property + def CONTEXT_KEY(self) -> str: + return f"{self._global_prefix}{DATA_CONTEXT_KEY}" + + +orcapod_constants = SystemConstant() diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index a89ab4e..695ffe8 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -1,7 +1,6 @@ import hashlib from typing import Any import pyarrow as pa -import polars as pl import json from orcapod.protocols.hashing_protocols import SemanticTypeHasher, StringCacher from orcapod.hashing import arrow_serialization @@ -214,6 +213,8 @@ def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: # normalize all string to large strings by passing through polars # TODO: consider cleaner approach in the future + import polars as pl + sorted_table = pl.DataFrame(sorted_table).to_arrow() # Step 3: Serialize using Arrow IPC format diff --git 
a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 97568f5..2a92f69 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -1,4 +1,4 @@ -from orcapod.protocols.hashing_protocols import FunctionInfoExtractor, ObjectHasher +from orcapod.protocols.hashing_protocols import FunctionInfoExtractor from orcapod.hashing import legacy_core from orcapod.hashing import hash_utils from typing import Any diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 012edaa..968d70e 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1,23 +1,43 @@ -from typing import Protocol, Any, ContextManager -from orcapod.types import DataValue, TypeSpec -from orcapod.protocols.hashing_protocols import ContentIdentifiable -from collections.abc import Iterator, Collection -import pyarrow as pa +from collections.abc import Collection, Iterator, Mapping from datetime import datetime +from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.types import DataValue, TypeSpec + +if TYPE_CHECKING: + import pyarrow as pa class Datagram(Protocol): """ - Base protocol for all data containers in Orcapod. + Protocol for immutable datagram containers in Orcapod. Datagrams are the fundamental units of data that flow through the system. - They provide a unified interface for data access and conversion, ensuring - consistent behavior across different data types and sources. + They provide a unified interface for data access, conversion, and manipulation, + ensuring consistent behavior across different storage backends (dict, Arrow table, etc.). + + Each datagram contains: + - **Data columns**: The primary business data (user_id, name, etc.) + - **Meta columns**: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes (__processed_at, etc.) + - **Context column**: Data context information ({orcapod.CONTEXT_KEY}) + + Future Packet subclass will also include: + - **Source info columns**: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) + + When exposing to external tools without field metadata support, semantic types + are encoded as `_{semantic_type}_` prefixes (_path_config_file, _id_user_name, etc.). + + All operations are immutable - methods return new datagram instances rather than + modifying existing ones. - TypeSpec is a dict[str, type] mapping field names to their Python types, - enabling type checking and validation throughout the computational graph. + Example: + >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"}) + >>> updated = datagram.update(name="Alice Smith") + >>> filtered = datagram.select("user_id", "name") + >>> table = datagram.as_table() """ + # 1. Core Properties (Identity & Structure) @property def data_context_key(self) -> str: """ @@ -27,91 +47,527 @@ def data_context_key(self) -> str: contextual information needed to properly interpret and work with this datagram across various operations (storage, visualization, processing, etc.). - Context key formats: - - Standard contexts: "std:v1.2.3:fingerprint" - - Custom contexts: "custom:user_provided_id" + Returns: + str: Context key for proper datagram interpretation + """ + ... + + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names (with {orcapod.META_PREFIX} ('__') prefix).""" + ... + + # 2. 
Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """ + Get data column value by key. + + Provides dict-like access to data columns only. Meta columns + are not accessible through this method (use `get_meta_value()` instead). - Concrete implementation can make use of this context key to ensure necessary background - informaton / object is available for correct processing of the datagram. + Args: + key: Data column name. Returns: - str: Context key for proper datagram interpretation + The value stored in the specified data column. + + Raises: + KeyError: If the column doesn't exist in data columns. + + Example: + >>> datagram["user_id"] + 123 + >>> datagram["name"] + 'Alice' """ ... - def types(self, include_data_context: bool = False) -> TypeSpec: + def __contains__(self, key: str) -> bool: """ - Return the type specification for this datagram. + Check if data column exists. - The TypeSpec maps field names to their Python types, enabling - type checking and validation throughout the system. + Args: + key: Column name to check. Returns: - TypeSpec: Dictionary mapping field names to Python types + True if column exists in data columns, False otherwise. + + Example: + >>> "user_id" in datagram + True + >>> "nonexistent" in datagram + False + """ + ... + + def __iter__(self) -> Iterator[str]: + """ + Iterate over data column names. + + Provides for-loop support over column names, enabling natural iteration + patterns without requiring conversion to dict. + + Yields: + Data column names in no particular order. + + Example: + >>> for column in datagram: + ... value = datagram[column] + ... print(f"{column}: {value}") """ ... - def arrow_schema(self, include_data_context: bool = False) -> pa.Schema: + def get(self, key: str, default: DataValue = None) -> DataValue: """ - Return the PyArrow schema for this datagram. + Get data column value with default fallback. - The schema provides a structured representation of the datagram's - fields and their types, enabling efficient serialization and - deserialization with PyArrow. + Args: + key: Data column name. + default: Value to return if column doesn't exist. Returns: - pa.Schema: PyArrow schema representation of the datagram + Column value if exists, otherwise the default value. + + Example: + >>> datagram.get("user_id") + 123 + >>> datagram.get("missing", "default") + 'default' """ ... - def keys(self) -> Collection[str]: + # 3. Structural Information + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: """ - Return the available keys/fields in this datagram. + Return tuple of column names. - This provides a way to inspect the structure of the datagram - without accessing the actual data values. + Provides access to column names with filtering options for different + column types. Default returns only data column names. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. Returns: - Collection[str]: Available field names + Tuple of column names based on inclusion criteria. 
+ + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') """ ... - def as_table(self, include_data_context: bool = False) -> pa.Table: + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> TypeSpec: """ - Convert to PyArrow Table format. + Return type specification mapping field names to Python types. - Provides a standardized way to convert datagram content to - a columnar format suitable for analysis and processing. + The TypeSpec enables type checking and validation throughout the system. + + Args: + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. Returns: - pa.Table: PyArrow table representation + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } """ ... - def as_dict(self, include_data_context: bool = False) -> dict[str, DataValue]: + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> "pa.Schema": """ - Convert to dictionary format. + Return PyArrow schema representation. - Provides a simple key-value representation of the datagram - content, useful for debugging and simple data access. + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. + + Args: + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. Returns: - dict[str, DataValue]: Dictionary representation + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] """ ... def content_hash(self) -> str: """ - Return a hash of the packet content for caching/comparison. + Return deterministic hash of datagram content. + + The hash should reflect the data content, typically excluding meta columns + and context. Used for caching, comparison, and deduplication. For exact details of + hash computation, refer to the implementation in the specific datagram class/subclass. + + Returns: + Deterministic content hash string. + + Note: + Two datagrams with identical data columns will have the same hash, + even if they differ in meta columns or context. + + Example: + >>> datagram.content_hash() + 'sha256:abc123def456...' + """ + ... + + # 4. 
Format Conversions (Export) + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Convert datagram to dictionary format. + + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context key. + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + + + Returns: + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) + """ + ... + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> "pa.Table": + """ + Convert datagram to PyArrow Table format. + + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + + Returns: + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" + """ + ... + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get meta column value with optional default. + + Meta columns store operational metadata and use {orcapod.META_PREFIX} ('__') prefixes. + This method handles both prefixed and unprefixed key formats. + + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). + default: Value to return if meta column doesn't exist. + + Returns: + Meta column value if exists, otherwise the default value. + + Example: + >>> datagram.get_meta_value("pipeline_version") # Auto-prefixed + 'v2.1.0' + >>> datagram.get_meta_value("__pipeline_version") # Already prefixed + 'v2.1.0' + >>> datagram.get_meta_value("missing", "default") + 'default' + """ + ... + + def with_meta_columns(self, **updates: DataValue) -> Self: + """ + Create new datagram with updated meta columns. + + Adds or updates operational metadata while preserving all data columns. + Keys are automatically prefixed with {orcapod.META_PREFIX} ('__') if needed. + + Args: + **updates: Meta column updates as keyword arguments. + + Returns: + New datagram instance with updated meta columns. + + Example: + >>> tracked = datagram.with_meta_columns( + ... processed_by="pipeline_v2", + ... 
timestamp="2024-01-15T10:30:00Z"
+            ... )
+        """
+        ...
+
+    def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self:
+        """
+        Create new datagram with specified meta columns removed.
+
+        Args:
+            *keys: Meta column keys to remove (prefixes optional).
+            ignore_missing: If True, ignore missing columns without raising an error.
+
+
+        Returns:
+            New datagram instance without specified meta columns.
+
+        Raises:
+            KeyError: If any specified meta column to drop doesn't exist and ignore_missing=False.
+
+        Example:
+            >>> cleaned = datagram.drop_meta_columns("old_source", "temp_debug")
+        """
+        ...
+
+    # 6. Data Column Operations
+    def select(self, *column_names: str) -> Self:
+        """
+        Create new datagram with only specified data columns.
+
+        Args:
+            *column_names: Data column names to keep.
+
+
+        Returns:
+            New datagram instance containing only the specified data columns;
+            meta columns and context are preserved.
+
+        Raises:
+            KeyError: If any specified column doesn't exist.
+
+        Example:
+            >>> subset = datagram.select("user_id", "name", "email")
+        """
+        ...
+
+    def drop(self, *column_names: str, ignore_missing: bool = False) -> Self:
+        """
+        Create new datagram with specified data columns removed. Note that this does not
+        remove meta columns or context column. Refer to `drop_meta_columns()` for dropping
+        specific meta columns. Context key column can never be dropped but a modified copy
+        can be created with a different context key using `with_context_key()`.
+
+        Args:
+            *column_names: Data column names to remove.
+            ignore_missing: If True, ignore missing columns without raising an error.
+
+        Returns:
+            New datagram instance without specified data columns.
+
+        Raises:
+            KeyError: If any specified column to drop doesn't exist and ignore_missing=False.
+
+        Example:
+            >>> filtered = datagram.drop("temp_field", "debug_info")
+        """
+        ...
+
+    def rename(
+        self,
+        column_mapping: Mapping[str, str],
+    ) -> Self:
+        """
+        Create new datagram with data columns renamed.
+
+        Args:
+            column_mapping: Mapping from old names to new names.
+
+        Returns:
+            New datagram instance with renamed data columns.
+
+        Example:
+            >>> renamed = datagram.rename(
+            ...     {"old_id": "user_id", "old_name": "full_name"}
+            ... )
+        """
+        ...
+
+    def update(self, **updates: DataValue) -> Self:
+        """
+        Create new datagram with existing column values updated.
+
+        Updates values in existing data columns. Will error if any specified
+        column doesn't exist - use with_columns() to add new columns.
+
+        Args:
+            **updates: Column names and their new values.
+
+        Returns:
+            New datagram instance with updated values.
+
+        Raises:
+            KeyError: If any specified column doesn't exist.
+
+        Example:
+            >>> updated = datagram.update(
+            ...     file_path="/new/absolute/path.txt",
+            ...     status="processed"
+            ... )
+        """
+        ...
+
+    def with_columns(
+        self,
+        column_types: Mapping[str, type] | None = None,
+        **updates: DataValue,
+    ) -> Self:
+        """
+        Create new datagram with additional data columns.
+
+        Adds new data columns to the datagram. Will error if any specified
+        column already exists - use update() to modify existing columns.
+
+        Args:
+            column_types: Optional type specifications for new columns. If not provided, the column type is
+                inferred from the provided values. If value is None, the column type defaults to `str`.
+            **updates: New columns as keyword arguments.
+
+        Returns:
+            New datagram instance with additional data columns.
+ + Raises: + ValueError: If any specified column already exists. + + Example: + >>> expanded = datagram.with_columns( + ... status="active", + ... score=95.5, + ... column_types={"score": float} + ... ) + """ + ... + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> Self: + """ + Create new datagram with different context key. + + Changes the semantic interpretation context while preserving all data. + The context key affects how columns are processed and converted. + + Args: + new_context_key: New context key string. + + Returns: + New datagram instance with updated context key. + + Note: + How the context is interpreted depends on the datagram implementation. + Semantic processing may be rebuilt for the new context. + + Example: + >>> financial_datagram = datagram.with_context_key("financial_v1") + """ + ... + + # 8. Utility Operations + def copy(self) -> Self: + """ + Create a shallow copy of the datagram. + + Returns a new datagram instance with the same data and cached values. + This is more efficient than reconstructing from scratch when you need + an identical datagram instance. + + Returns: + New datagram instance with copied data and caches. + + Example: + >>> copied = datagram.copy() + >>> copied is datagram # False - different instance + False + """ + ... - This hash should be deterministic and based only on the packet content, - not on source information or metadata. Used for: - - Caching computation results - - Detecting data changes - - Deduplication operations + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. Returns: - str: Deterministic hash of packet content + Dictionary-style string representation of data columns only. + """ + ... + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information for debugging. + + Returns: + Detailed representation with type and metadata information. """ ... @@ -157,98 +613,218 @@ class Packet(Datagram, Protocol): data flow: Tags provide context, Packets provide content. """ - def as_table( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Table: + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> tuple[str, ...]: """ - Convert the packet to a PyArrow Table. + Return tuple of column names. + + Provides access to column names with filtering options for different + column types. Default returns only data column names. Args: - include_source: If True, source information is included in the table - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + include_source: Whether to include source info fields. + Returns: - pa.Table: PyArrow table representation of packet data + Tuple of column names based on inclusion criteria. 
+ + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') """ ... - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> TypeSpec: """ - Convert the packet to a dictionary. + Return type specification mapping field names to Python types. + + The TypeSpec enables type checking and validation throughout the system. Args: - include_source: If True, source information is included in the dictionary - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. + include_source: Whether to include source info fields. Returns: - dict[str, DataValue]: Dictionary representation of packet data + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } """ ... - def as_datagram(self, include_source: bool = False) -> Datagram: + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> "pa.Schema": """ - Convert the packet to a Datagram. + Return PyArrow schema representation. + + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. Args: - include_source: If True, source information is included in the datagram - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + include_source: Whether to include source info fields. + Returns: - Datagram: Datagram representation of packet data + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] """ ... - def source_info(self) -> dict[str, str | None]: + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: """ - Return metadata about the packet's source/origin. + Convert datagram to dictionary format. 
+ + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context key. + include_source: Whether to include source info fields. - Provides debugging and lineage information about where the packet - originated. May include information like: - - File paths for file-based sources - - Database connection strings - - API endpoints - - Processing pipeline information Returns: - dict[str, str | None]: Source metadata for debugging/lineage + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) """ ... - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> TypeSpec: + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> "pa.Table": """ - Return the type specification for this packet. + Convert datagram to PyArrow Table format. + + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. Args: - include_source: If True, source information is included in the typespec - for debugging and lineage tracking + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_source: Whether to include source info columns in the schema. Returns: - TypeSpec: Dictionary mapping field names to Python types + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" """ ... - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> Datagram: """ - Return the PyArrow schema for this packet. + Convert the packet to a Datagram. Args: - include_source: If True, source information is included in the schema - for debugging and lineage tracking + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
Returns: - pa.Schema: PyArrow schema representation of packet data + Datagram: Datagram representation of packet data """ ... - # def join(self, other: "Packet") -> "Packet": ... + def source_info(self) -> dict[str, str | None]: + """ + Return metadata about the packet's source/origin. + + Provides debugging and lineage information about where the packet + originated. May include information like: + - File paths for file-based sources + - Database connection strings + - API endpoints + - Processing pipeline information - # def get_as(self, packet_type: PacketType) -> PacketType: ... + Returns: + dict[str, str | None]: Source information for each data column as key-value pairs. + """ + ... class PodFunction(Protocol): @@ -468,7 +1044,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: + ) -> "pa.Table": """ Convert the entire stream to a PyArrow Table. @@ -596,6 +1172,20 @@ def kernel_id(self) -> tuple[str, ...]: """ ... + @property + def data_context_key(self) -> str: + """ + Return the context key for this kernel's data processing. + + The context key is used to interpret how data columns should be + processed and converted. It provides semantic meaning to the data + being processed by this kernel. + + Returns: + str: Context key for this kernel's data processing + """ + ... + def __call__( self, *streams: Stream, label: str | None = None, **kwargs ) -> LiveStream: diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 1767509..16c96cd 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -1,11 +1,13 @@ """Hash strategy protocols for dependency injection.""" from collections.abc import Callable -from typing import Any, Protocol, runtime_checkable +from typing import Any, Protocol, runtime_checkable, TYPE_CHECKING import uuid from orcapod.types import TypeSpec, PathLike -import pyarrow as pa + +if TYPE_CHECKING: + import pyarrow as pa @runtime_checkable @@ -101,7 +103,7 @@ class ArrowHasher(Protocol): def get_hasher_id(self) -> str: ... - def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... + def hash_table(self, table: "pa.Table", prefix_hasher_id: bool = True) -> str: ... class StringCacher(Protocol): @@ -134,8 +136,8 @@ def hasher_id(self) -> str: def hash_column( self, - column: pa.Array, - ) -> pa.Array: + column: "pa.Array", + ) -> "pa.Array": """Hash a column with this semantic type and return the hash bytes.""" ... diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index 618d7a4..d51ead8 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -1,13 +1,10 @@ -from typing import Collection, Protocol, TYPE_CHECKING -from orcapod.protocols import data_protocols as dp +from typing import Protocol +from collections.abc import Collection import pyarrow as pa -if TYPE_CHECKING: - import polars as pl - class ArrowDataStore(Protocol): - def record_data( + def add_record( self, record_path: tuple[str, ...], record_id: str, @@ -15,13 +12,26 @@ def record_data( ignore_duplicates: bool | None = None, ) -> str | None: ... - def get_recorded_data( + def add_records( + self, + record_path: tuple[str, ...], + records: pa.Table, + record_id_column: str | None = None, + ignore_duplicates: bool | None = None, + ) -> list[str]: ... 
+ + def get_record_by_id( self, record_path: tuple[str, ...], record_id: str, + record_id_column: str | None = None, ) -> pa.Table | None: ... - def get_all_records(self, record_path: tuple[str, ...]) -> pa.Table | None: + def get_all_records( + self, + record_path: tuple[str, ...], + record_id_column: str | None = None, + ) -> pa.Table | None: """Retrieve all records for a given path as a stream.""" ... @@ -29,6 +39,5 @@ def get_records_by_ids( self, record_path: tuple[str, ...], record_ids: Collection[str], - add_entry_id_column: bool | str = False, - preseve_input_order: bool = False, + record_id_column: str | None = None, ) -> pa.Table: ... diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index f04a7b7..218c0e0 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -1,5 +1,4 @@ import pyarrow as pa -import pyarrow.dataset as ds import polars as pl from pathlib import Path from typing import Any @@ -7,6 +6,7 @@ from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError from collections import defaultdict +from orcapod.data import constants # Module-level logger @@ -28,6 +28,8 @@ class BasicDeltaTableArrowStore: - ("year", "month", "day", "experiment") -> year/month/day/experiment/ """ + RECORD_ID_COLUMN = f"{constants.META_PREFIX}record_id" + def __init__( self, base_path: str | Path, @@ -41,8 +43,8 @@ def __init__( Args: base_path: Base directory path where Delta tables will be stored - duplicate_entry_behavior: How to handle duplicate entry_ids: - - 'error': Raise ValueError when entry_id already exists + duplicate_entry_behavior: How to handle duplicate record_ids: + - 'error': Raise ValueError when record_id already exists - 'overwrite': Replace existing entry with new data create_base_path: Whether to create the base path if it doesn't exist max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) @@ -87,15 +89,15 @@ def flush(self) -> None: except Exception as e: logger.error(f"Error during flush: {e}") - def flush_batch(self, source_path: tuple[str, ...]) -> None: + def flush_batch(self, record_path: tuple[str, ...]) -> None: """ Flush pending batch for a specific source path. 
Args: - source_path: Tuple of path components + record_path: Tuple of path components """ logger.debug("Flushing triggered!!") - source_key = self._get_source_key(source_path) + source_key = self._get_source_key(record_path) if ( source_key not in self._pending_batches @@ -111,11 +113,11 @@ def flush_batch(self, source_path: tuple[str, ...]) -> None: # Combine all tables in the batch combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() - table_path = self._get_table_path(source_path) + table_path = self._get_table_path(record_path) table_path.mkdir(parents=True, exist_ok=True) # Check if table exists - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: # TODO: reconsider mode="overwrite" here @@ -130,27 +132,31 @@ def flush_batch(self, source_path: tuple[str, ...]) -> None: else: if self.duplicate_entry_behavior == "overwrite": # Get entry IDs from the batch - entry_ids = combined_table.column("__entry_id").to_pylist() - unique_entry_ids = list(set(entry_ids)) + record_ids = combined_table.column( + self.RECORD_ID_COLUMN + ).to_pylist() + unique_record_ids = list(set(record_ids)) # Delete existing records with these IDs - if unique_entry_ids: - entry_ids_str = "', '".join(unique_entry_ids) - delete_predicate = f"__entry_id IN ('{entry_ids_str}')" + if unique_record_ids: + record_ids_str = "', '".join(unique_record_ids) + delete_predicate = ( + f"{self.RECORD_ID_COLUMN} IN ('{record_ids_str}')" + ) try: delta_table.delete(delete_predicate) logger.debug( - f"Deleted {len(unique_entry_ids)} existing records from {source_key}" + f"Deleted {len(unique_record_ids)} existing records from {source_key}" ) except Exception as e: logger.debug( f"No existing records to delete from {source_key}: {e}" ) - # otherwise, only insert if same entry_id does not exist yet + # otherwise, only insert if same record_id does not exist yet delta_table.merge( source=combined_table, - predicate="target.__entry_id = source.__entry_id", + predicate=f"target.{self.RECORD_ID_COLUMN} = source.{self.RECORD_ID_COLUMN}", source_alias="source", target_alias="target", ).when_not_matched_insert_all().execute() @@ -174,9 +180,9 @@ def flush_all_batches(self) -> None: # TODO: capture and re-raise exceptions at the end for source_key in source_keys: - source_path = tuple(source_key.split("/")) + record_path = tuple(source_key.split("/")) try: - self.flush_batch(source_path) + self.flush_batch(record_path) except Exception as e: logger.error(f"Error flushing batch for {source_key}: {e}") @@ -184,27 +190,27 @@ def __del__(self): """Cleanup when object is destroyed.""" self.flush() - def _validate_source_path(self, source_path: tuple[str, ...]) -> None: + def _validate_record_path(self, record_path: tuple[str, ...]) -> None: # TODO: consider removing this as path creation can be tried directly """ Validate source path components. 
Args: - source_path: Tuple of path components + record_path: Tuple of path components Raises: ValueError: If path is invalid """ - if not source_path: + if not record_path: raise ValueError("Source path cannot be empty") - if len(source_path) > self.max_hierarchy_depth: + if len(record_path) > self.max_hierarchy_depth: raise ValueError( - f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" + f"Source path depth {len(record_path)} exceeds maximum {self.max_hierarchy_depth}" ) # Validate path components - for i, component in enumerate(source_path): + for i, component in enumerate(record_path): if not component or not isinstance(component, str): raise ValueError( f"Source path component {i} is invalid: {repr(component)}" @@ -217,31 +223,31 @@ def _validate_source_path(self, source_path: tuple[str, ...]) -> None: f"Source path component contains invalid characters: {repr(component)}" ) - def _get_source_key(self, source_path: tuple[str, ...]) -> str: + def _get_source_key(self, record_path: tuple[str, ...]) -> str: """Generate cache key for source storage.""" - return "/".join(source_path) + return "/".join(record_path) - def _get_table_path(self, source_path: tuple[str, ...]) -> Path: + def _get_table_path(self, record_path: tuple[str, ...]) -> Path: """Get the filesystem path for a given source path.""" path = self.base_path - for subpath in source_path: + for subpath in record_path: path = path / subpath return path def _get_existing_delta_table( - self, source_path: tuple[str, ...] + self, record_path: tuple[str, ...] ) -> DeltaTable | None: """ Get or create a Delta table, handling schema initialization properly. Args: - source_path: Tuple of path components + record_path: Tuple of path components Returns: DeltaTable instance or None if table doesn't exist """ - source_key = self._get_source_key(source_path) - table_path = self._get_table_path(source_path) + source_key = self._get_source_key(record_path) + table_path = self._get_table_path(record_path) # Check cache first if dt := self._delta_table_cache.get(source_key): @@ -263,75 +269,79 @@ def _get_existing_delta_table( del self._delta_table_cache[source_key] return None - def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: - """Ensure the table has an __entry_id column.""" - if "__entry_id" not in arrow_data.column_names: - # Add entry_id column at the beginning - key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) - arrow_data = arrow_data.add_column(0, "__entry_id", key_array) + def _ensure_record_id_column( + self, arrow_data: pa.Table, record_id: str + ) -> pa.Table: + """Ensure the table has an record id column.""" + if self.RECORD_ID_COLUMN not in arrow_data.column_names: + # Add record_id column at the beginning + key_array = pa.array([record_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) return arrow_data - def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: - """Remove the __entry_id column if it exists.""" - if "__entry_id" in arrow_data.column_names: + def _remove_record_id_column(self, arrow_data: pa.Table) -> pa.Table: + """Remove the record id column if it exists.""" + if self.RECORD_ID_COLUMN in arrow_data.column_names: column_names = arrow_data.column_names indices_to_keep = [ - i for i, name in enumerate(column_names) if name != "__entry_id" + i + for i, name in enumerate(column_names) + if name != self.RECORD_ID_COLUMN ] arrow_data = 
arrow_data.select(indices_to_keep) return arrow_data - def _handle_entry_id_column( - self, arrow_data: pa.Table, add_entry_id_column: bool | str = False + def _handle_record_id_column( + self, arrow_data: pa.Table, record_id_column: str | None = None ) -> pa.Table: """ - Handle entry_id column based on add_entry_id_column parameter. + Handle record_id column based on add_record_id_column parameter. Args: - arrow_data: Arrow table with __entry_id column - add_entry_id_column: Control entry ID column inclusion: - - False: Remove __entry_id column - - True: Keep __entry_id column as is - - str: Rename __entry_id column to custom name - """ - if add_entry_id_column is False: - # Remove the __entry_id column - return self._remove_entry_id_column(arrow_data) - elif isinstance(add_entry_id_column, str): - # Rename __entry_id to custom name - if "__entry_id" in arrow_data.column_names: - schema = arrow_data.schema - new_names = [ - add_entry_id_column if name == "__entry_id" else name - for name in schema.names - ] - return arrow_data.rename_columns(new_names) - # If add_entry_id_column is True, keep __entry_id as is - return arrow_data + arrow_data: Arrow table with record id column + record_id_column: Control entry ID column inclusion: - def _create_entry_id_filter(self, entry_id: str) -> list: + """ + if not record_id_column: + # Remove the record id column + return self._remove_record_id_column(arrow_data) + + # Rename record id column + if self.RECORD_ID_COLUMN in arrow_data.column_names: + schema = arrow_data.schema + new_names = [ + record_id_column if name == self.RECORD_ID_COLUMN else name + for name in schema.names + ] + return arrow_data.rename_columns(new_names) + else: + raise ValueError( + f"Record ID column '{self.RECORD_ID_COLUMN}' not found in the table and cannot be renamed." + ) + + def _create_record_id_filter(self, record_id: str) -> list: """ Create a proper filter expression for Delta Lake. Args: - entry_id: The entry ID to filter by + record_id: The entry ID to filter by Returns: List containing the filter expression for Delta Lake """ - return [("__entry_id", "=", entry_id)] + return [(self.RECORD_ID_COLUMN, "=", record_id)] - def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: + def _create_record_ids_filter(self, record_ids: list[str]) -> list: """ Create a proper filter expression for multiple entry IDs. 
Args: - entry_ids: List of entry IDs to filter by + record_ids: List of entry IDs to filter by Returns: List containing the filter expression for Delta Lake """ - return [("__entry_id", "in", entry_ids)] + return [(self.RECORD_ID_COLUMN, "in", record_ids)] def _read_table_with_filter( self, @@ -349,7 +359,7 @@ def _read_table_with_filter( Arrow table with preserved schema """ # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading - dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + dataset = delta_table.to_pyarrow_dataset(as_large_types=True) if filters: # Apply filters at dataset level for better performance import pyarrow.compute as pc @@ -379,36 +389,36 @@ def _read_table_with_filter( return dataset.to_table() - def record_data( + def add_record( self, record_path: tuple[str, ...], - entry_id: str, + record_id: str, data: pa.Table, - force_flush: bool = False, ignore_duplicates: bool | None = None, + force_flush: bool = False, ) -> pa.Table: - self._validate_source_path(record_path) + self._validate_record_path(record_path) source_key = self._get_source_key(record_path) # Check for existing entry if ignore_duplicates is None: ignore_duplicates = self.duplicate_entry_behavior != "error" if not ignore_duplicates: - pending_table = self._pending_batches[source_key].get(entry_id, None) + pending_table = self._pending_batches[source_key].get(record_id, None) if pending_table is not None: raise ValueError( - f"Entry '{entry_id}' already exists in pending batch for {source_key}. " + f"Entry '{record_id}' already exists in pending batch for {source_key}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." ) - existing_record = self.get_recorded_data(record_path, entry_id, flush=False) + existing_record = self.get_record_by_id(record_path, record_id, flush=False) if existing_record is not None: raise ValueError( - f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. " + f"Entry '{record_id}' already exists in {'/'.join(record_path)}. " f"Use duplicate_entry_behavior='overwrite' to allow updates." 
) - # Add entry_id column to the data - data_with_entry_id = self._ensure_entry_id_column(data, entry_id) + # Add record_id column to the data + data_with_record_id = self._ensure_record_id_column(data, record_id) if force_flush: # Write immediately @@ -419,25 +429,25 @@ def record_data( if delta_table is None: # Create new table - save original schema first - write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") + write_deltalake(str(table_path), data_with_record_id, mode="overwrite") logger.debug(f"Created new Delta table for {source_key}") else: if self.duplicate_entry_behavior == "overwrite": try: delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" ) logger.debug( - f"Deleted existing record {entry_id} from {source_key}" + f"Deleted existing record {record_id} from {source_key}" ) except Exception as e: logger.debug( - f"No existing record to delete for {entry_id}: {e}" + f"No existing record to delete for {record_id}: {e}" ) write_deltalake( table_path, - data_with_entry_id, + data_with_record_id, mode="append", schema_mode="merge", ) @@ -446,28 +456,41 @@ def record_data( self._delta_table_cache[source_key] = DeltaTable(str(table_path)) else: # Add to the batch for later flushing - self._pending_batches[source_key][entry_id] = data_with_entry_id + self._pending_batches[source_key][record_id] = data_with_record_id batch_size = len(self._pending_batches[source_key]) # Check if we need to flush if batch_size >= self.batch_size: self.flush_batch(record_path) - logger.debug(f"Added record {entry_id} to {source_key}") + logger.debug(f"Added record {record_id} to {source_key}") return data - def get_recorded_data( + def add_records( self, record_path: tuple[str, ...], - entry_id: str, + records: pa.Table, + record_id_column: str | None = None, + ignore_duplicates: bool | None = None, + ) -> list[str]: + raise NotImplementedError( + "add_records is not implemented in BasicDeltaTableArrowStore yet. " + "Use add_record for single record insertion." + ) + + def get_record_by_id( + self, + record_path: tuple[str, ...], + record_id: str, + record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: """ - Get a specific record by entry_id with schema preservation. + Get a specific record by record_id with schema preservation. 
Args: - source_path: Tuple of path components - entry_id: Unique identifier for the record + record_path: Tuple of path components + record_id: Unique identifier for the record Returns: Arrow table for the record or None if not found @@ -475,14 +498,14 @@ def get_recorded_data( if flush: self.flush_batch(record_path) - self._validate_source_path(record_path) + self._validate_record_path(record_path) - # check if entry_id is found in pending batches + # check if record_id is found in pending batches source_key = self._get_source_key(record_path) - if entry_id in self._pending_batches[source_key]: + if record_id in self._pending_batches[source_key]: # Return the pending record after removing the entry id column - return self._remove_entry_id_column( - self._pending_batches[source_key][entry_id] + return self._remove_record_id_column( + self._pending_batches[source_key][record_id] ) delta_table = self._get_existing_delta_table(record_path) @@ -491,25 +514,25 @@ def get_recorded_data( try: # Use schema-preserving read - filter_expr = self._create_entry_id_filter(entry_id) + filter_expr = self._create_record_id_filter(record_id) result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None - # Remove the __entry_id column before returning - return self._remove_entry_id_column(result) + # Handle (remove/rename) the record id column before returning + return self._handle_record_id_column(result, record_id_column) except Exception as e: logger.error( - f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" + f"Error getting record {record_id} from {'/'.join(record_path)}: {e}" ) raise e def get_all_records( self, record_path: tuple[str, ...], - add_entry_id_column: bool | str = False, + record_id_column: str | None = None, retrieve_pending: bool = True, flush: bool = False, ) -> pa.Table | None: @@ -517,11 +540,8 @@ def get_all_records( Retrieve all records for a given source path as a single table with schema preservation. 
Args: - source_path: Tuple of path components - add_entry_id_column: Control entry ID column inclusion: - - False: Don't include entry ID column (default) - - True: Include entry ID column as "__entry_id" - - str: Include entry ID column with custom name + record_path: Tuple of path components + record_id_column: If not None or empty, record id is returned in the result with the specified column name Returns: Arrow table containing all records with original schema, or None if no records found @@ -530,16 +550,16 @@ def get_all_records( if flush: self.flush_batch(record_path) - self._validate_source_path(record_path) + self._validate_record_path(record_path) collected_tables = [] if retrieve_pending: # Check if there are pending records in the batch - for entry_id, arrow_table in self._pending_batches[ + for record_id, arrow_table in self._pending_batches[ self._get_source_key(record_path) ].items(): collected_tables.append( - self._ensure_entry_id_column(arrow_table, entry_id) + self._ensure_record_id_column(arrow_table, record_id) ) delta_table = self._get_existing_delta_table(record_path) @@ -558,44 +578,25 @@ def get_all_records( if collected_tables: total_table = pa.concat_tables(collected_tables) - # Handle entry_id column based on parameter - return self._handle_entry_id_column(total_table, add_entry_id_column) + # Handle record_id column based on parameter + return self._handle_record_id_column(total_table, record_id_column) return None - # def get_all_records_as_polars( - # self, source_path: tuple[str, ...], flush: bool = True - # ) -> pl.LazyFrame | None: - # """ - # Retrieve all records for a given source path as a single Polars LazyFrame. - - # Args: - # source_path: Tuple of path components - - # Returns: - # Polars LazyFrame containing all records, or None if no records found - # """ - # all_records = self.get_all_records(source_path, flush=flush) - # if all_records is None: - # return None - # # TODO: take care of converting semantics to Python objects - # return pl.LazyFrame(all_records.as_table()) - def get_records_by_ids( self, - source_path: tuple[str, ...], - entry_ids: list[str] | pl.Series | pa.Array, - add_entry_id_column: bool | str = False, - preserve_input_order: bool = False, + record_path: tuple[str, ...], + record_ids: list[str] | pl.Series | pa.Array, + record_id_column: str | None = None, flush: bool = False, ) -> pa.Table | None: """ Retrieve records by entry IDs as a single table with schema preservation. 
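A minimal usage sketch of the renamed get_records_by_ids, assuming `store` is an already constructed BasicDeltaTableArrowStore and that records exist under the given path (the path and ids below are illustrative, not taken from the patch):

    import polars as pl
    import pyarrow as pa

    path = ("my_source", "run_1")  # hypothetical record path

    # The id container may be a plain list, a Polars Series, or a
    # PyArrow (Chunked)Array; all are normalized to a list of strings.
    by_list = store.get_records_by_ids(path, ["id-1", "id-2"])
    by_series = store.get_records_by_ids(path, pl.Series(["id-1", "id-2"]))
    by_chunked = store.get_records_by_ids(path, pa.chunked_array([["id-1", "id-2"]]))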
Args: - source_path: Tuple of path components - entry_ids: Entry IDs to retrieve - add_entry_id_column: Control entry ID column inclusion + record_path: Tuple of path components + record_ids: Entry IDs to retrieve + add_record_id_column: Control entry ID column inclusion preserve_input_order: If True, return results in input order with nulls for missing Returns: @@ -603,99 +604,49 @@ def get_records_by_ids( """ if flush: - self.flush_batch(source_path) + self.flush_batch(record_path) - self._validate_source_path(source_path) + self._validate_record_path(record_path) # Convert input to list of strings for consistency - if isinstance(entry_ids, list): - if not entry_ids: + if isinstance(record_ids, list): + if not record_ids: return None - entry_ids_list = entry_ids - elif isinstance(entry_ids, pl.Series): - if len(entry_ids) == 0: + record_ids_list = record_ids + elif isinstance(record_ids, pl.Series): + if len(record_ids) == 0: return None - entry_ids_list = entry_ids.to_list() - elif isinstance(entry_ids, pa.Array): - if len(entry_ids) == 0: + record_ids_list = record_ids.to_list() + elif isinstance(record_ids, (pa.Array, pa.ChunkedArray)): + if len(record_ids) == 0: return None - entry_ids_list = entry_ids.to_pylist() + record_ids_list = record_ids.to_pylist() else: raise TypeError( - f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" + f"record_ids must be list[str], pl.Series, or pa.Array, got {type(record_ids)}" ) - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: return None try: # Use schema-preserving read with filters - filter_expr = self._create_entry_ids_filter(entry_ids_list) + filter_expr = self._create_record_ids_filter(record_ids_list) result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: return None - if preserve_input_order: - raise NotImplementedError("Preserve input order is not yet implemented") - # Need to reorder results and add nulls for missing entries - import pandas as pd - - df = result.to_pandas() - df = df.set_index("__entry_id") - - # Create a DataFrame with the desired order, filling missing with NaN - ordered_df = df.reindex(entry_ids_list) - - # Convert back to Arrow - result = pa.Table.from_pandas(ordered_df.reset_index()) - - # Handle entry_id column based on parameter - return self._handle_entry_id_column(result, add_entry_id_column) + # Handle record_id column based on parameter + return self._handle_record_id_column(result, record_id_column) except Exception as e: logger.error( - f"Error getting records by IDs from {'/'.join(source_path)}: {e}" + f"Error getting records by IDs from {'/'.join(record_path)}: {e}" ) return None - # def get_records_by_ids_as_polars( - # self, - # source_path: tuple[str, ...], - # entry_ids: list[str] | pl.Series | pa.Array, - # add_entry_id_column: bool | str = False, - # preserve_input_order: bool = False, - # flush: bool = False, - # ) -> pl.LazyFrame | None: - # """ - # Retrieve records by entry IDs as a single Polars LazyFrame. 
- - # Args: - # source_path: Tuple of path components - # entry_ids: Entry IDs to retrieve - # add_entry_id_column: Control entry ID column inclusion - # preserve_input_order: If True, return results in input order with nulls for missing - - # Returns: - # Polars LazyFrame containing all found records, or None if no records found - # """ - # arrow_result = self.get_records_by_ids( - # source_path, - # entry_ids, - # add_entry_id_column, - # preserve_input_order, - # flush=flush, - # ) - - # if arrow_result is None: - # return None - - # # Convert to Polars LazyFrame - # return pl.LazyFrame(arrow_result) - - # Additional utility methods - def get_pending_batch_info(self) -> dict[str, int]: """ Get information about pending batches. @@ -738,23 +689,23 @@ def _scan_directory(current_path: Path, path_components: tuple[str, ...]): _scan_directory(self.base_path, ()) return sources - def delete_source(self, source_path: tuple[str, ...]) -> bool: + def delete_source(self, record_path: tuple[str, ...]) -> bool: """ Delete an entire source (all records for a source path). Args: - source_path: Tuple of path components + record_path: Tuple of path components Returns: True if source was deleted, False if it didn't exist """ - self._validate_source_path(source_path) + self._validate_record_path(record_path) # Flush any pending batches first - self.flush_batch(source_path) + self.flush_batch(record_path) - table_path = self._get_table_path(source_path) - source_key = self._get_source_key(source_path) + table_path = self._get_table_path(record_path) + source_key = self._get_source_key(record_path) if not table_path.exists(): return False @@ -776,64 +727,64 @@ def delete_source(self, source_path: tuple[str, ...]) -> bool: logger.error(f"Error deleting source {source_key}: {e}") return False - def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: + def delete_record(self, record_path: tuple[str, ...], record_id: str) -> bool: """ Delete a specific record. 
Args: - source_path: Tuple of path components - entry_id: ID of the record to delete + record_path: Tuple of path components + record_id: ID of the record to delete Returns: True if record was deleted, False if it didn't exist """ - self._validate_source_path(source_path) + self._validate_record_path(record_path) # Flush any pending batches first - self.flush_batch(source_path) + self.flush_batch(record_path) - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: return False try: # Check if record exists using proper filter - filter_expr = self._create_entry_id_filter(entry_id) + filter_expr = self._create_record_id_filter(record_id) existing = self._read_table_with_filter(delta_table, filters=filter_expr) if len(existing) == 0: return False # Delete the record using SQL-style predicate (this is correct for delete operations) delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" + f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" ) # Update cache - source_key = self._get_source_key(source_path) + source_key = self._get_source_key(record_path) self._delta_table_cache[source_key] = delta_table - logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") + logger.debug(f"Deleted record {record_id} from {'/'.join(record_path)}") return True except Exception as e: logger.error( - f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" + f"Error deleting record {record_id} from {'/'.join(record_path)}: {e}" ) return False - def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: + def get_table_info(self, record_path: tuple[str, ...]) -> dict[str, Any] | None: """ Get metadata information about a Delta table. 
Args: - source_path: Tuple of path components + record_path: Tuple of path components Returns: Dictionary with table metadata, or None if table doesn't exist """ - self._validate_source_path(source_path) + self._validate_record_path(record_path) - delta_table = self._get_existing_delta_table(source_path) + delta_table = self._get_existing_delta_table(record_path) if delta_table is None: return None @@ -841,15 +792,15 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: # Get basic info schema = delta_table.schema() history = delta_table.history() - source_key = self._get_source_key(source_path) + source_key = self._get_source_key(record_path) # Add pending batch info pending_info = self.get_pending_batch_info() pending_count = pending_info.get(source_key, 0) return { - "path": str(self._get_table_path(source_path)), - "source_path": source_path, + "path": str(self._get_table_path(record_path)), + "record_path": record_path, "schema": schema, "version": delta_table.version(), "num_files": len(delta_table.files()), @@ -859,5 +810,5 @@ def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: } except Exception as e: - logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}") + logger.error(f"Error getting table info for {'/'.join(record_path)}: {e}") return None diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py index 118b110..817c249 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -1,6 +1,6 @@ from orcapod.types.semantic_types import PythonArrowConverter from orcapod.types.schemas import PythonSchema, SemanticSchema -from orcapod.types import typespec_utils as tsutils +from orcapod.types import TypeSpec, typespec_utils as tsutils from typing import Any, Self from collections.abc import Mapping @@ -29,9 +29,11 @@ def __init__( ): self._converter_lut = converter_lut - def from_python_to_arrow_schema(self, python_schema: PythonSchema) -> pa.Schema: + def from_python_to_arrow_schema(self, python_schema: TypeSpec) -> pa.Schema: """Convert a Python schema to an Arrow schema""" - return python_schema.to_arrow_schema(converters=self._converter_lut) + return PythonSchema(python_schema).to_arrow_schema( + converters=self._converter_lut + ) def from_arrow_to_python_schema(self, arrow_schema: pa.Schema) -> PythonSchema: """Convert an Arrow schema to a Python schema""" @@ -40,7 +42,7 @@ def from_arrow_to_python_schema(self, arrow_schema: pa.Schema) -> PythonSchema: ) def from_python_to_arrow( - self, python_data: Mapping[str, Any], python_schema: PythonSchema | None = None + self, python_data: Mapping[str, Any], python_schema: TypeSpec | None = None ) -> pa.Table: """Convert a dictionary of Python values to Arrow arrays""" if python_schema is None: @@ -85,3 +87,22 @@ def from_arrow_to_python(self, arrow_data: pa.Table) -> list[dict[str, Any]]: def as_dict(self) -> dict[str, PythonArrowConverter]: """Return the converter lookup table as a dictionary.""" return self._converter_lut.copy() + + def join(self, other: Self, strict: bool = False) -> Self: + """Join two SemanticConverters by merging their converter lookup tables.""" + if not isinstance(other, SemanticConverter): + raise TypeError("Can only join with another SemanticConverter.") + + new_converter_lut = self._converter_lut.copy() + for key, converter in other._converter_lut.items(): + if key in new_converter_lut: + if strict: + raise ValueError( + f"Key '{key}' already exists in 
the converter lookup table. Cannot overwrite in strict mode." + ) + logger.warning( + f"Key '{key}' already exists in the converter lookup table. Overwriting with new converter." + ) + new_converter_lut[key] = converter + + return self.__class__(new_converter_lut) diff --git a/src/orcapod/types/semantic_types.py b/src/orcapod/types/semantic_types.py index 169da69..258617a 100644 --- a/src/orcapod/types/semantic_types.py +++ b/src/orcapod/types/semantic_types.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from pathlib import Path import pyarrow as pa + from collections.abc import Collection @@ -344,7 +345,7 @@ def to_canonical_from_arrow(self, value: pa.Array) -> list[T]: def from_canonical_to_arrow( self, value: T, target_type: pa.DataType | None = None - ) -> Any: + ) -> pa.Array: """Convert from canonical to Arrow representation using explicit Arrow DataType""" if target_type is None: @@ -438,7 +439,45 @@ def get_semantic_type_for_python_type( self, python_type: type ) -> SemanticType | None: """Get a semantic type by Python type""" - return self._python_to_semantic_lut.get(python_type) + + # check if it's directly registered + semantic_type = self._python_to_semantic_lut.get(python_type) + if semantic_type is None: + # check if it's a subclass + for ( + registered_type, + registered_semantic_type, + ) in self._python_to_semantic_lut.items(): + if issubclass(python_type, registered_type): + return registered_semantic_type + return semantic_type + + def get_arrow_type_for_semantic_type( + self, semantic_type_name: str + ) -> pa.DataType | None: + """Get the default Arrow DataType for a semantic type by name""" + semantic_type = self._semantic_type_lut.get(semantic_type_name) + if semantic_type: + return semantic_type.get_default_arrow_type() + return None + + def get_arrow_type_for_python_type( + self, python_type: type + ) -> tuple[str | None, pa.DataType] | None: + """Get the default Arrow DataType for a Python type""" + semantic_type = self.get_semantic_type_for_python_type(python_type) + if semantic_type: + return semantic_type.name, semantic_type.get_default_arrow_type() + return None + + def from_python_to_arrow(self, python_value: Any) -> tuple[str | None, Any]: + """Convert a Python value to Arrow-targetting representation using the semantic type registry""" + semantic_type = self.get_semantic_type_for_python_type(type(python_value)) + if semantic_type: + return semantic_type.name, semantic_type.convert_python_to_arrow( + python_value + ) + return None, python_value def get_semantic_type(self, name: str) -> SemanticType | None: """Get a semantic type by name""" @@ -448,11 +487,10 @@ def list_semantic_types(self) -> list[SemanticType]: """Get all registered semantic types""" return list(self._semantic_type_lut.values()) - def supports_python_type(self, python_type: type) -> bool: - """Check if registry supports the given Python type""" + def registered_with_semantic_type(self, python_type: type) -> bool: + """Check if registry has the Python type registered with a semantic type""" return python_type in self._python_to_semantic_lut - # Python-specific registry methods def supports_semantic_and_arrow_type( self, semantic_type_name: str, arrow_type: pa.DataType ) -> bool: diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py index 9f66654..609a6a0 100644 --- a/src/orcapod/types/typespec_utils.py +++ b/src/orcapod/types/typespec_utils.py @@ -214,7 +214,9 @@ def extract_function_typespecs( return param_info, inferred_output_types -def 
get_typespec_from_dict(data: Mapping, typespec: TypeSpec | None = None) -> TypeSpec: +def get_typespec_from_dict( + data: Mapping, typespec: TypeSpec | None = None, default=str +) -> TypeSpec: """ Returns a TypeSpec for the given dictionary. The TypeSpec is a mapping from field name to Python type. If typespec is provided, then @@ -222,7 +224,10 @@ def get_typespec_from_dict(data: Mapping, typespec: TypeSpec | None = None) -> T """ if typespec is None: typespec = {} - return {key: typespec.get(key, type(value)) for key, value in data.items()} + return { + key: typespec.get(key, type(value) if value is not None else default) + for key, value in data.items() + } def get_compatible_type(type1: Any, type2: Any) -> Any: diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 5237eb3..0947499 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -1,7 +1,6 @@ # TODO: move this to a separate module from collections import defaultdict -from matplotlib.pylab import f import pyarrow as pa from collections.abc import Mapping, Collection from typing import Any @@ -168,6 +167,7 @@ def prepare_prefixed_columns( | Mapping[str, Any | None] | Mapping[str, Mapping[str, Any | None]], exclude_columns: Collection[str] = (), + exclude_prefixes: Collection[str] = (), ) -> tuple[pa.Table, dict[str, pa.Table]]: """ """ all_prefix_info = {} @@ -209,7 +209,12 @@ def prepare_prefixed_columns( prefixed_column_names = defaultdict(list) prefixed_columns = defaultdict(list) - target_column_names = [c for c in data_column_names if c not in exclude_columns] + target_column_names = [ + c + for c in data_column_names + if not any(c.startswith(prefix) for prefix in exclude_prefixes) + and c not in exclude_columns + ] for prefix, value_lut in all_prefix_info.items(): target_prefixed_column_names = prefixed_column_names[prefix] From ebe401a22c4c6b79039c5cdafb3003876392a02a Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 22 Jul 2025 07:37:45 +0000 Subject: [PATCH 117/224] fix: handling of schema when merging tables --- src/orcapod/data/datagrams/arrow_datagram.py | 216 ++++++------------ src/orcapod/data/datagrams/base.py | 29 +-- src/orcapod/data/pods.py | 2 +- src/orcapod/data/trackers.py | 3 +- .../pipeline/{nodes.py => legacy_nodes.py} | 0 .../{pipeline.py => legacy_pipeline.py} | 0 src/orcapod/types/semantic_converter.py | 10 + src/orcapod/utils/arrow_utils.py | 27 ++- 8 files changed, 98 insertions(+), 189 deletions(-) rename src/orcapod/pipeline/{nodes.py => legacy_nodes.py} (100%) rename src/orcapod/pipeline/{pipeline.py => legacy_pipeline.py} (100%) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 5ed5307..5eb158c 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -1,14 +1,14 @@ import logging from collections.abc import Collection, Iterator, Mapping -from typing import Any, Self +from typing import Self import pyarrow as pa -from orcapod.data.system_constants import orcapod_constants as constants from orcapod.data.context import ( DataContext, ) from orcapod.data.datagrams.base import BaseDatagram +from orcapod.data.system_constants import orcapod_constants as constants from orcapod.types import schemas, typespec_utils from orcapod.types.core import DataValue from orcapod.types.semantic_converter import SemanticConverter @@ -125,31 +125,6 @@ def __init__( self._cached_meta_python_schema: schemas.PythonSchema | None = None self._cached_content_hash: str | None = None - def _core_info(self) -> dict[str, Any]: - core_info = { - "data_table": self._data_table, - "meta_table": self._meta_table, - "data_context_table": self._data_context_table, - "semantic_converter": self._semantic_converter, - "cached_python_schema": self._cached_python_schema, - "cached_python_dict": self._cached_python_dict, - "cached_meta_python_schema": self._cached_meta_python_schema, - "cached_content_hash": self._cached_content_hash, - } - return core_info - - def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: - new_copy = object.__new__(self.__class__) - new_copy._data_table = core_info["data_table"] - new_copy._meta_table = core_info["meta_table"] - new_copy._data_context_table = core_info["data_context_table"] - new_copy._semantic_converter = core_info["semantic_converter"] - new_copy._cached_python_schema = core_info["cached_python_schema"] - new_copy._cached_python_dict = core_info["cached_python_dict"] - new_copy._cached_meta_python_schema = core_info["cached_meta_python_schema"] - new_copy._cached_content_hash = core_info["cached_content_hash"] - return new_copy - # 1. 
Core Properties (Identity & Structure) @property def meta_columns(self) -> tuple[str, ...]: @@ -492,6 +467,8 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: k = constants.META_PREFIX + k prefixed_updates[k] = v + new_datagram = self.copy(include_cache=False) + # Start with existing meta data meta_dict = {} if self._meta_table is not None: @@ -501,18 +478,10 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: meta_dict.update(prefixed_updates) # Create new meta table - new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None - - # Combine all tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, + new_datagram._meta_table = ( + pa.Table.from_pylist([meta_dict]) if meta_dict else None ) + return new_datagram def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: """ @@ -541,26 +510,10 @@ def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" ) - # Filter meta columns - remaining_cols = [ - col for col in self._meta_table.column_names if col not in prefixed_keys - ] + new_datagram = self.copy(include_cache=False) + new_datagram._meta_table = self._meta_table.drop_columns(prefixed_keys) - # Create new meta table - new_meta_table = ( - self._meta_table.select(remaining_cols) if remaining_cols else None - ) - - # Combine tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + return new_datagram # 6. 
Data Column Operations def select(self, *column_names: str) -> Self: @@ -579,18 +532,10 @@ def select(self, *column_names: str) -> Self: if missing_cols: raise ValueError(f"Columns not found: {missing_cols}") - new_data_table = self._data_table.select(list(column_names)) + new_datagram = self.copy(include_cache=False) + new_datagram._data_table = new_datagram._data_table.select(column_names) - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + return new_datagram def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: """ @@ -610,27 +555,12 @@ def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: raise KeyError( f"Following columns do not exist and cannot be dropped: {sorted(missing)}" ) + column_names = tuple(c for c in column_names if self._data_table.columns) - # Filter data columns - remaining_cols = [ - col for col in self._data_table.column_names if col not in column_names - ] - - if not remaining_cols: - raise ValueError("Cannot drop all data columns") - - new_data_table = self._data_table.select(remaining_cols) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + new_datagram = self.copy(include_cache=False) + new_datagram._data_table = self._data_table.drop_columns(column_names) + # TODO: consider dropping extra semantic columns if they are no longer needed + return new_datagram def rename(self, column_mapping: Mapping[str, str]) -> Self: """ @@ -644,30 +574,22 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: New ArrowDatagram instance with renamed data columns """ # Create new schema with renamed fields, preserving original types - new_fields = [] - for field in self._data_table.schema: - old_name = field.name - new_name = column_mapping.get(old_name, old_name) - new_field = pa.field(new_name, field.type) - new_fields.append(new_field) - - # Create new data table with renamed columns - new_schema = pa.schema(new_fields) - new_data_table = self._data_table.rename_columns( - [column_mapping.get(name, name) for name in self._data_table.column_names] - ).cast(new_schema) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - return self.__class__( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, + if not column_mapping: + return self + + new_names = [column_mapping.get(k, k) for k in self._data_table.column_names] + + new_datagram = self.copy(include_cache=False) + new_datagram._data_table = new_datagram._data_table.rename_columns(new_names) + + # apply the same rename to the converters + new_datagram._semantic_converter = self._semantic_converter.rename( + column_mapping ) + return new_datagram + def update(self, **updates: DataValue) -> Self: """ Create a new ArrowDatagram with specific column values updated. 
@@ -699,23 +621,19 @@ def update(self, **updates: DataValue) -> Self: f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}" ) + new_datagram = self.copy(include_cache=False) + updates_typespec = schemas.PythonSchema( {k: v for k, v in self.types().items() if k in updates} ) - update_table = self._semantic_converter.from_python_to_arrow( updates, updates_typespec ) - all_tables = [self._data_table.drop_columns(list(updates.keys())), update_table] - - if self._meta_table is not None: - all_tables.append(self._meta_table) + new_datagram._data_table = arrow_utils.hstack_tables( + self._data_table.drop_columns(list(updates.keys())), update_table + ).select(self._data_table.column_names) # adjsut the order to match original - return self.__class__( - table=arrow_utils.hstack_tables(*all_tables), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) + return new_datagram def with_columns( self, @@ -742,7 +660,7 @@ def with_columns( if not updates: return self - # Error if any column already exists + # Error if any of the columns already exists existing_overlaps = set(updates.keys()) & set(self._data_table.column_names) if existing_overlaps: raise ValueError( @@ -750,7 +668,11 @@ def with_columns( f"Use update() to modify existing columns." ) + # create a copy and perform in-place updates + new_datagram = self.copy() + # TODO: consider simplifying this conversion logic + # prepare update's table typespec = typespec_utils.get_typespec_from_dict(updates, column_types) updates_converter = SemanticConverter.from_semantic_schema( @@ -761,21 +683,16 @@ def with_columns( # TODO: cleanup the handling of typespec python schema and various conversion points new_data_table = updates_converter.from_python_to_arrow(updates, typespec) - # Combine with meta table for reconstruction - all_tables = [self._data_table, new_data_table] - if self._meta_table is not None: - all_tables.append(self._meta_table) - - combined_table = arrow_utils.hstack_tables(*all_tables) + # perform in-place update + new_datagram._data_table = arrow_utils.hstack_tables( + new_datagram._data_table, new_data_table + ) # prepare the joined converter - total_converter = self._semantic_converter.join(updates_converter) - - return self.__class__( - table=combined_table, - semantic_converter=total_converter, - data_context=self._data_context, + new_datagram._semantic_converter = self._semantic_converter.join( + updates_converter ) + return new_datagram # 7. Context Operations def with_context_key(self, new_context_key: str) -> Self: @@ -789,6 +706,7 @@ def with_context_key(self, new_context_key: str) -> Self: Returns: New ArrowDatagram instance with new context """ + # TODO: consider if there is a more efficient way to handle context # Combine all tables for reconstruction combined_table = self._data_table if self._meta_table is not None: @@ -801,23 +719,25 @@ def with_context_key(self, new_context_key: str) -> Self: ) # 8. 
Utility Operations - def copy(self) -> Self: + def copy(self, include_cache: bool = True) -> Self: """Return a copy of the datagram.""" - # Combine all tables for reconstruction - combined_table = self._data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - new_datagram = self.__class__( - combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_python_schema = self._cached_python_schema - new_datagram._cached_python_dict = self._cached_python_dict - new_datagram._cached_content_hash = self._cached_content_hash + new_datagram = super().copy() + + new_datagram._data_table = self._data_table + new_datagram._meta_table = self._meta_table + new_datagram._data_context = self._data_context + new_datagram._semantic_converter = self._semantic_converter + + if include_cache: + new_datagram._cached_python_schema = self._cached_python_schema + new_datagram._cached_python_dict = self._cached_python_dict + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_meta_python_schema = self._cached_meta_python_schema + else: + new_datagram._cached_python_schema = None + new_datagram._cached_python_dict = None + new_datagram._cached_content_hash = None + new_datagram._cached_meta_python_schema = None return new_datagram diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index 0ec1501..f253995 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -269,33 +269,6 @@ def with_context_key(self, new_context_key: str) -> Self: ... # 8. Utility Operations - @abstractmethod def copy(self) -> Self: """Create a shallow copy of the datagram.""" - ... - - @abstractmethod - def _core_info(self) -> dict[str, Any]: - """ - Return core information about the datagram. - This is meant to be used for internal purposes only and is not part of the public API. - It provides necessary information to create an efficient copy of the datagram - and in a manner that works across inheritance hierarchies. - - Returns: - Dictionary with all information necessary to recreate the datagram in a copy. - """ - ... - - @abstractmethod - def _create_from_core_info(self, core_info: dict[str, Any]) -> Self: - """ - Create a new datagram instance from core information. - - Args: - core_info: Dictionary with core information about the datagram - - Returns: - New datagram instance - """ - ... + return object.__new__(self.__class__) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index f22b9fe..8662903 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -366,7 +366,7 @@ def identity_structure(self, *streams: dp.Stream) -> Any: class WrappedPod(ActivatablePodBase): """ - A wrapper for a pod that allows it to be used as a kernel. + A wrapper for an existing pod, allowing for additional functionality or modifications without changing the original pod. This class is meant to serve as a base class for other pods that need to wrap existing pods. 
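A brief sketch of the copy-then-mutate pattern the datagram operations in this commit now follow: copy(include_cache=False) yields a cache-free shallow copy whose Arrow tables are then reassigned directly instead of rebuilding through __init__ (the method name below is hypothetical, not part of the patch):

    # Hypothetical helper following the same pattern as select()/drop() above.
    def keep_only(self, *column_names: str):
        new_datagram = self.copy(include_cache=False)  # shared tables, cleared caches
        new_datagram._data_table = self._data_table.select(list(column_names))
        return new_datagram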
""" diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 5ad2a55..0f6ef94 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,8 +1,9 @@ from orcapod.protocols import data_protocols as dp, hashing_protocols as hp from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict +from collections.abc import Generator from abc import ABC, abstractmethod -from typing import Any, ContextManager, Generator +from typing import Any from contextlib import contextmanager diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/legacy_nodes.py similarity index 100% rename from src/orcapod/pipeline/nodes.py rename to src/orcapod/pipeline/legacy_nodes.py diff --git a/src/orcapod/pipeline/pipeline.py b/src/orcapod/pipeline/legacy_pipeline.py similarity index 100% rename from src/orcapod/pipeline/pipeline.py rename to src/orcapod/pipeline/legacy_pipeline.py diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py index 817c249..889d8a2 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -106,3 +106,13 @@ def join(self, other: Self, strict: bool = False) -> Self: new_converter_lut[key] = converter return self.__class__(new_converter_lut) + + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """Rename columns in the converter lookup table.""" + new_converter_lut = {} + new_converter_lut = { + column_mapping.get(key, key): converter + for key, converter in self._converter_lut.items() + } + + return self.__class__(new_converter_lut) diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 0947499..700fa3e 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -43,20 +43,25 @@ def hstack_tables(*tables: pa.Table) -> pa.Table: "All tables must have the same number of rows for horizontal stacking." ) - # create combined column names - all_column_names = [] - all_columns = [] + # create combined schema + all_fields = [] all_names = set() - for i, table in enumerate(tables): - if overlap := set(table.column_names).intersection(all_names): - raise ValueError( - f"Duplicate column names {overlap} found when stacking table at index {i}: {table}" - ) - all_names.update(table.column_names) - all_column_names += table.column_names + for table in tables: + for field in table.schema: + if field.name in all_names: + raise ValueError( + f"Duplicate column name '{field.name}' found in input tables." + ) + all_fields.append(field) + all_names.add(field.name) + combined_schmea = pa.schema(all_fields) + + # create combined columns + all_columns = [] + for table in tables: all_columns += table.columns - return pa.Table.from_arrays(all_columns, names=all_column_names) + return pa.Table.from_arrays(all_columns, schema=combined_schmea) def check_arrow_schema_compatibility( From 2b9e15f2fe425c47d5a16cb1a9baa372b8bbd29c Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 22 Jul 2025 21:59:58 +0000 Subject: [PATCH 118/224] refactor: clean up unused imports and move old code into renamed module --- src/orcapod/data/base.py | 3 - src/orcapod/data/context.py | 28 +- src/orcapod/data/datagram_store.py | 890 --------- src/orcapod/data/old_datagrams.py | 2281 ---------------------- src/orcapod/data/operators.py | 9 +- src/orcapod/errors.py | 5 + src/orcapod/hashing/versioned_hashers.py | 1 - src/orcapod/pipeline/__init__.py | 8 +- src/orcapod/pipeline/legacy_pipeline.py | 2 +- src/orcapod/protocols/store_protocols.py | 4 + src/orcapod/protocols/types.py | 51 - 11 files changed, 30 insertions(+), 3252 deletions(-) delete mode 100644 src/orcapod/data/datagram_store.py delete mode 100644 src/orcapod/data/old_datagrams.py create mode 100644 src/orcapod/errors.py delete mode 100644 src/orcapod/protocols/types.py diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index f8788e1..dec4f06 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -1,9 +1,6 @@ -from abc import ABC, abstractmethod from typing import Any from orcapod.protocols import hashing_protocols as hp -from orcapod.types import TypeSpec from orcapod.hashing.defaults import get_default_object_hasher -import pyarrow as pa import logging diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py index 85261d2..20bc43a 100644 --- a/src/orcapod/data/context.py +++ b/src/orcapod/data/context.py @@ -1,4 +1,3 @@ -from typing import Self from orcapod.types.semantic_types import SemanticTypeRegistry from orcapod.types import default_registry from orcapod.protocols import hashing_protocols as hp @@ -22,27 +21,19 @@ def resolve_data_context(data_context: "str | DataContext | None") -> "DataConte return orcapod_system_data_context_manager.resolve_context(data_context) -default_data_context = DataContext( - "std:v0.1.0:default", - default_registry, - get_default_arrow_hasher(), - get_default_object_hasher(), -) - - class DataContextManager(dict[str, DataContext]): - def register_context(self, DataContext): + def register_context(self, data_context: DataContext): """ Register a new DataContext instance. Args: - DataContext: The DataContext instance to register. + data_context: The DataContext instance to register. """ - if DataContext.context_key in self: + if data_context.context_key in self: raise ValueError( - f"DataContext with key {DataContext.context_key} already exists." + f"DataContext with key {data_context.context_key} already exists." 
) - self[DataContext.context_key] = DataContext + self[data_context.context_key] = data_context def resolve_context(self, context_info: str | DataContext | None) -> DataContext: if isinstance(context_info, DataContext): @@ -56,5 +47,14 @@ def resolve_context(self, context_info: str | DataContext | None) -> DataContext raise ValueError(f"DataContext with key {context_info} not found.") + +default_data_context = DataContext( + "std:v0.1.0:default", + default_registry, + get_default_arrow_hasher(), + get_default_object_hasher(), +) + + orcapod_system_data_context_manager = DataContextManager() orcapod_system_data_context_manager.register_context(default_data_context) diff --git a/src/orcapod/data/datagram_store.py b/src/orcapod/data/datagram_store.py deleted file mode 100644 index 72d082c..0000000 --- a/src/orcapod/data/datagram_store.py +++ /dev/null @@ -1,890 +0,0 @@ -# class DatagramStore(Protocol): -# def record_datagram( -# self, -# record_path: tuple[str, ...], -# datagram: dp.Datagram, -# ignore_duplicates: bool = False, -# ) -> str | None: ... - -# def record_stream( -# self, -# record_path: tuple[str, ...], -# stream: dp.Stream, -# ignore_duplicates: bool = False, -# ) -> None: ... - -# def get_recorded_datagram( -# self, -# record_path: tuple[str, ...], -# record_id: str, -# ) -> dp.Datagram | None: ... - -# def get_all_records(self, record_path: tuple[str, ...]) -> dp.Stream | None: -# """Retrieve all records for a given path as a stream.""" -# ... - -# def get_all_records_as_polars( -# self, record_path: tuple[str, ...] -# ) -> pl.DataFrame | None: -# """Retrieve all records for a given path as a Polars stream.""" -# ... - -# def get_records_by_ids( -# self, -# record_path: tuple[str, ...], -# entry_ids: Collection[str], -# add_entry_id_column: bool | str = False, -# preseve_input_order: bool = False, -# ) -> dp.Stream: ... - - -import pyarrow as pa -import pyarrow.compute as pc -import pyarrow.dataset as ds -import polars as pl -from pathlib import Path -from typing import Any -import logging -from deltalake import DeltaTable, write_deltalake -from deltalake.exceptions import TableNotFoundError -from collections import defaultdict -from orcapod.data.datagrams import ArrowDatagram, SemanticTypeRegistry -from orcapod.data.streams import ImmutableTableStream -from orcapod.hashing import get_default_arrow_hasher -from orcapod.hashing.types import ArrowHasher -from orcapod.protocols import data_protocols as dp -from orcapod.types import default_registry - - -# Module-level logger -logger = logging.getLogger(__name__) - - -class DeltaTableArrowStore: - """ - Delta Table-based Arrow data store with flexible hierarchical path support and schema preservation. - - Uses tuple-based source paths for robust parameter handling: - - ("source_name", "source_id") -> source_name/source_id/ - - ("org", "project", "dataset") -> org/project/dataset/ - - ("year", "month", "day", "experiment") -> year/month/day/experiment/ - """ - - def __init__( - self, - base_path: str | Path, - duplicate_entry_behavior: str = "error", - create_base_path: bool = True, - max_hierarchy_depth: int = 10, - batch_size: int = 100, - ): - """ - Initialize the DeltaTableArrowDataStore. 
- - Args: - base_path: Base directory path where Delta tables will be stored - duplicate_entry_behavior: How to handle duplicate entry_ids: - - 'error': Raise ValueError when entry_id already exists - - 'overwrite': Replace existing entry with new data - create_base_path: Whether to create the base path if it doesn't exist - max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) - batch_size: Number of records to batch before writing to Delta table - auto_flush_interval: Time in seconds to auto-flush pending batches (0 to disable) - """ - # Validate duplicate behavior - if duplicate_entry_behavior not in ["error", "overwrite"]: - raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") - - self.duplicate_entry_behavior = duplicate_entry_behavior - self.base_path = Path(base_path) - self.max_hierarchy_depth = max_hierarchy_depth - self.batch_size = batch_size - - if create_base_path: - self.base_path.mkdir(parents=True, exist_ok=True) - elif not self.base_path.exists(): - raise ValueError( - f"Base path {self.base_path} does not exist and create_base_path=False" - ) - - # Cache for Delta tables to avoid repeated initialization - self._delta_table_cache: dict[str, DeltaTable] = {} - - # Batch management - self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) - - logger.info( - f"Initialized DeltaTableArrowDataStore at {self.base_path} " - f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " - f"batch_size={batch_size}, as" - ) - - def flush(self) -> None: - """ - Flush all pending batches immediately. - - This method is called to ensure all pending data is written to the Delta tables. - """ - try: - self.flush_all_batches() - except Exception as e: - logger.error(f"Error during flush: {e}") - - def flush_batch(self, source_path: tuple[str, ...]) -> None: - """ - Flush pending batch for a specific source path. 
- - Args: - source_path: Tuple of path components - """ - logger.debug("Flushing triggered!!") - source_key = self._get_source_key(source_path) - - if ( - source_key not in self._pending_batches - or not self._pending_batches[source_key] - ): - return - - # Get all pending records - pending_tables = self._pending_batches[source_key] - self._pending_batches[source_key] = {} - - try: - # Combine all tables in the batch - combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() - - table_path = self._get_table_path(source_path) - table_path.mkdir(parents=True, exist_ok=True) - - # Check if table exists - delta_table = self._get_existing_delta_table(source_path) - - if delta_table is None: - # TODO: reconsider mode="overwrite" here - write_deltalake( - table_path, - combined_table, - mode="overwrite", - ) - logger.debug( - f"Created new Delta table for {source_key} with {len(combined_table)} records" - ) - else: - if self.duplicate_entry_behavior == "overwrite": - # Get entry IDs from the batch - entry_ids = combined_table.column("__entry_id").to_pylist() - unique_entry_ids = list(set(entry_ids)) - - # Delete existing records with these IDs - if unique_entry_ids: - entry_ids_str = "', '".join(unique_entry_ids) - delete_predicate = f"__entry_id IN ('{entry_ids_str}')" - try: - delta_table.delete(delete_predicate) - logger.debug( - f"Deleted {len(unique_entry_ids)} existing records from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing records to delete from {source_key}: {e}" - ) - - # otherwise, only insert if same entry_id does not exist yet - delta_table.merge( - source=combined_table, - predicate="target.__entry_id = source.__entry_id", - source_alias="source", - target_alias="target", - ).when_not_matched_insert_all().execute() - - logger.debug( - f"Appended batch of {len(combined_table)} records to {source_key}" - ) - - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - # Put the tables back in the pending queue - self._pending_batches[source_key] = pending_tables - raise - - def flush_all_batches(self) -> None: - """Flush all pending batches.""" - source_keys = list(self._pending_batches.keys()) - - # TODO: capture and re-raise exceptions at the end - for source_key in source_keys: - source_path = tuple(source_key.split("/")) - try: - self.flush_batch(source_path) - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - - def __del__(self): - """Cleanup when object is destroyed.""" - self.flush() - - def _validate_source_path(self, source_path: tuple[str, ...]) -> None: - # TODO: consider removing this as path creation can be tried directly - """ - Validate source path components. 
- - Args: - source_path: Tuple of path components - - Raises: - ValueError: If path is invalid - """ - if not source_path: - raise ValueError("Source path cannot be empty") - - if len(source_path) > self.max_hierarchy_depth: - raise ValueError( - f"Source path depth {len(source_path)} exceeds maximum {self.max_hierarchy_depth}" - ) - - # Validate path components - for i, component in enumerate(source_path): - if not component or not isinstance(component, str): - raise ValueError( - f"Source path component {i} is invalid: {repr(component)}" - ) - - # Check for filesystem-unsafe characters - unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] - if any(char in component for char in unsafe_chars): - raise ValueError( - f"Source path component contains invalid characters: {repr(component)}" - ) - - def _get_source_key(self, source_path: tuple[str, ...]) -> str: - """Generate cache key for source storage.""" - return "/".join(source_path) - - def _get_table_path(self, source_path: tuple[str, ...]) -> Path: - """Get the filesystem path for a given source path.""" - path = self.base_path - for subpath in source_path: - path = path / subpath - return path - - def _get_existing_delta_table( - self, source_path: tuple[str, ...] - ) -> DeltaTable | None: - """ - Get or create a Delta table, handling schema initialization properly. - - Args: - source_path: Tuple of path components - - Returns: - DeltaTable instance or None if table doesn't exist - """ - source_key = self._get_source_key(source_path) - table_path = self._get_table_path(source_path) - - # Check cache first - if dt := self._delta_table_cache.get(source_key): - return dt - - try: - # Try to load existing table - delta_table = DeltaTable(str(table_path)) - self._delta_table_cache[source_key] = delta_table - logger.debug(f"Loaded existing Delta table for {source_key}") - return delta_table - except TableNotFoundError: - # Table doesn't exist - return None - except Exception as e: - logger.error(f"Error loading Delta table for {source_key}: {e}") - # Try to clear any corrupted cache and retry once - if source_key in self._delta_table_cache: - del self._delta_table_cache[source_key] - return None - - def _ensure_entry_id_column(self, arrow_data: pa.Table, entry_id: str) -> pa.Table: - """Ensure the table has an __entry_id column.""" - if "__entry_id" not in arrow_data.column_names: - # Add entry_id column at the beginning - key_array = pa.array([entry_id] * len(arrow_data), type=pa.large_string()) - arrow_data = arrow_data.add_column(0, "__entry_id", key_array) - return arrow_data - - def _remove_entry_id_column(self, arrow_data: pa.Table) -> pa.Table: - """Remove the __entry_id column if it exists.""" - if "__entry_id" in arrow_data.column_names: - column_names = arrow_data.column_names - indices_to_keep = [ - i for i, name in enumerate(column_names) if name != "__entry_id" - ] - arrow_data = arrow_data.select(indices_to_keep) - return arrow_data - - def _handle_entry_id_column( - self, arrow_data: pa.Table, add_entry_id_column: bool | str = False - ) -> pa.Table: - """ - Handle entry_id column based on add_entry_id_column parameter. 
- - Args: - arrow_data: Arrow table with __entry_id column - add_entry_id_column: Control entry ID column inclusion: - - False: Remove __entry_id column - - True: Keep __entry_id column as is - - str: Rename __entry_id column to custom name - """ - if add_entry_id_column is False: - # Remove the __entry_id column - return self._remove_entry_id_column(arrow_data) - elif isinstance(add_entry_id_column, str): - # Rename __entry_id to custom name - if "__entry_id" in arrow_data.column_names: - schema = arrow_data.schema - new_names = [ - add_entry_id_column if name == "__entry_id" else name - for name in schema.names - ] - return arrow_data.rename_columns(new_names) - # If add_entry_id_column is True, keep __entry_id as is - return arrow_data - - def _create_entry_id_filter(self, entry_id: str) -> list: - """ - Create a proper filter expression for Delta Lake. - - Args: - entry_id: The entry ID to filter by - - Returns: - List containing the filter expression for Delta Lake - """ - return [("__entry_id", "=", entry_id)] - - def _create_entry_ids_filter(self, entry_ids: list[str]) -> list: - """ - Create a proper filter expression for multiple entry IDs. - - Args: - entry_ids: List of entry IDs to filter by - - Returns: - List containing the filter expression for Delta Lake - """ - return [("__entry_id", "in", entry_ids)] - - def _read_table_with_filter( - self, - delta_table: DeltaTable, - filters: list | None = None, - ) -> pa.Table: - """ - Read table using to_pyarrow_dataset with original schema preservation. - - Args: - delta_table: The Delta table to read from - filters: Optional filters to apply - - Returns: - Arrow table with preserved schema - """ - # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading - dataset: ds.Dataset = delta_table.to_pyarrow_dataset(as_large_types=True) - if filters: - # Apply filters at dataset level for better performance - import pyarrow.compute as pc - - filter_expr = None - for filt in filters: - if len(filt) == 3: - col, op, val = filt - if op == "=": - expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore - elif op == "in": - expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore - else: - logger.warning( - f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." - ) - # Fallback to table-level filtering - return dataset.to_table()(filters=filters) - - if filter_expr is None: - filter_expr = expr - else: - filter_expr = pc.and_(filter_expr, expr) # type: ignore - - if filter_expr is not None: - return dataset.to_table(filter=filter_expr) - - return dataset.to_table() - - def record_data( - self, - record_path: tuple[str, ...], - entry_id: str, - data: pa.Table, - force_flush: bool = False, - error_on_duplicate: bool | None = None, - ) -> pa.Table: - self._validate_source_path(record_path) - source_key = self._get_source_key(record_path) - - # Check for existing entry - if error_on_duplicate is None: - error_on_duplicate = self.duplicate_entry_behavior == "error" - if error_on_duplicate: - pending_table = self._pending_batches[source_key].get(entry_id, None) - if pending_table is not None: - raise ValueError( - f"Entry '{entry_id}' already exists in pending batch for {source_key}. " - f"Use duplicate_entry_behavior='overwrite' to allow updates." - ) - existing_record = self.get_recorded_data(record_path, entry_id, flush=False) - if existing_record is not None: - raise ValueError( - f"Entry '{entry_id}' already exists in {'/'.join(record_path)}. 
" - f"Use duplicate_entry_behavior='overwrite' to allow updates." - ) - - # Add entry_id column to the data - data_with_entry_id = self._ensure_entry_id_column(data, entry_id) - - if force_flush: - # Write immediately - table_path = self._get_table_path(record_path) - table_path.mkdir(parents=True, exist_ok=True) - - delta_table = self._get_existing_delta_table(record_path) - - if delta_table is None: - # Create new table - save original schema first - write_deltalake(str(table_path), data_with_entry_id, mode="overwrite") - logger.debug(f"Created new Delta table for {source_key}") - else: - if self.duplicate_entry_behavior == "overwrite": - try: - delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" - ) - logger.debug( - f"Deleted existing record {entry_id} from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing record to delete for {entry_id}: {e}" - ) - - write_deltalake( - table_path, - data_with_entry_id, - mode="append", - schema_mode="merge", - ) - - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - else: - # Add to the batch for later flushing - self._pending_batches[source_key][entry_id] = data_with_entry_id - batch_size = len(self._pending_batches[source_key]) - - # Check if we need to flush - if batch_size >= self.batch_size: - self.flush_batch(record_path) - - logger.debug(f"Added record {entry_id} to {source_key}") - return data - - def get_recorded_data( - self, - record_path: tuple[str, ...], - entry_id: str, - flush: bool = False, - ) -> pa.Table | None: - """ - Get a specific record by entry_id with schema preservation. - - Args: - source_path: Tuple of path components - entry_id: Unique identifier for the record - - Returns: - Arrow table for the record or None if not found - """ - - if flush: - self.flush_batch(record_path) - self._validate_source_path(record_path) - - # check if entry_id is found in pending batches - source_key = self._get_source_key(record_path) - if entry_id in self._pending_batches[source_key]: - # Return the pending record directly - return self._pending_batches[source_key][entry_id] - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is None: - return None - - try: - # Use schema-preserving read - filter_expr = self._create_entry_id_filter(entry_id) - result = self._read_table_with_filter(delta_table, filters=filter_expr) - - if len(result) == 0: - return None - - # Remove the __entry_id column before returning - return self._remove_entry_id_column(result) - - except Exception as e: - logger.error( - f"Error getting record {entry_id} from {'/'.join(record_path)}: {e}" - ) - raise e - - def get_all_records( - self, - record_path: tuple[str, ...], - add_entry_id_column: bool | str = False, - retrieve_pending: bool = True, - flush: bool = False, - ) -> pa.Table | None: - """ - Retrieve all records for a given source path as a single table with schema preservation. - - Args: - source_path: Tuple of path components - add_entry_id_column: Control entry ID column inclusion: - - False: Don't include entry ID column (default) - - True: Include entry ID column as "__entry_id" - - str: Include entry ID column with custom name - - Returns: - Arrow table containing all records with original schema, or None if no records found - """ - # TODO: this currently reads everything into memory and then return. 
Consider implementation that performs everything lazily - - if flush: - self.flush_batch(record_path) - self._validate_source_path(record_path) - - collected_tables = [] - if retrieve_pending: - # Check if there are pending records in the batch - for entry_id, arrow_table in self._pending_batches[ - self._get_source_key(record_path) - ].items(): - collected_tables.append( - self._ensure_entry_id_column(arrow_table, entry_id) - ) - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is not None: - try: - # Use filter-based read - result = self._read_table_with_filter(delta_table) - - if len(result) != 0: - collected_tables.append(result) - - except Exception as e: - logger.error( - f"Error getting all records from {'/'.join(record_path)}: {e}" - ) - if collected_tables: - total_table = pa.concat_tables(collected_tables) - - # Handle entry_id column based on parameter - return self._handle_entry_id_column(total_table, add_entry_id_column) - - return None - - # def get_all_records_as_polars( - # self, source_path: tuple[str, ...], flush: bool = True - # ) -> pl.LazyFrame | None: - # """ - # Retrieve all records for a given source path as a single Polars LazyFrame. - - # Args: - # source_path: Tuple of path components - - # Returns: - # Polars LazyFrame containing all records, or None if no records found - # """ - # all_records = self.get_all_records(source_path, flush=flush) - # if all_records is None: - # return None - # # TODO: take care of converting semantics to Python objects - # return pl.LazyFrame(all_records.as_table()) - - def get_records_by_ids( - self, - source_path: tuple[str, ...], - entry_ids: list[str] | pl.Series | pa.Array, - add_entry_id_column: bool | str = False, - preserve_input_order: bool = False, - flush: bool = False, - ) -> pa.Table | None: - """ - Retrieve records by entry IDs as a single table with schema preservation. 
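For orientation, a usage sketch of the read/write surface shown above and documented below. `store` stands in for a configured instance of this Delta-Lake-backed store, and the path components and entry ids are made up:

    import pyarrow as pa

    path = ("experiments", "run_outputs")
    store.record_data(path, "run-001", pa.table({"score": [0.93]}))
    store.record_data(path, "run-002", pa.table({"score": [0.88]}), force_flush=True)

    # Read everything back (flushing any pending batch first), or just a subset by entry id,
    # exposing the internal __entry_id column under a friendlier name.
    everything = store.get_all_records(path, flush=True)
    subset = store.get_records_by_ids(
        path, ["run-001", "run-002"], add_entry_id_column="run_id"
    )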
- - Args: - source_path: Tuple of path components - entry_ids: Entry IDs to retrieve - add_entry_id_column: Control entry ID column inclusion - preserve_input_order: If True, return results in input order with nulls for missing - - Returns: - Arrow table containing all found records with original schema, or None if no records found - """ - - if flush: - self.flush_batch(source_path) - - self._validate_source_path(source_path) - - # Convert input to list of strings for consistency - if isinstance(entry_ids, list): - if not entry_ids: - return None - entry_ids_list = entry_ids - elif isinstance(entry_ids, pl.Series): - if len(entry_ids) == 0: - return None - entry_ids_list = entry_ids.to_list() - elif isinstance(entry_ids, pa.Array): - if len(entry_ids) == 0: - return None - entry_ids_list = entry_ids.to_pylist() - else: - raise TypeError( - f"entry_ids must be list[str], pl.Series, or pa.Array, got {type(entry_ids)}" - ) - - delta_table = self._get_existing_delta_table(source_path) - if delta_table is None: - return None - - try: - # Use schema-preserving read with filters - filter_expr = self._create_entry_ids_filter(entry_ids_list) - result = self._read_table_with_filter(delta_table, filters=filter_expr) - - if len(result) == 0: - return None - - if preserve_input_order: - raise NotImplementedError("Preserve input order is not yet implemented") - # Need to reorder results and add nulls for missing entries - import pandas as pd - - df = result.to_pandas() - df = df.set_index("__entry_id") - - # Create a DataFrame with the desired order, filling missing with NaN - ordered_df = df.reindex(entry_ids_list) - - # Convert back to Arrow - result = pa.Table.from_pandas(ordered_df.reset_index()) - - # Handle entry_id column based on parameter - return self._handle_entry_id_column(result, add_entry_id_column) - - except Exception as e: - logger.error( - f"Error getting records by IDs from {'/'.join(source_path)}: {e}" - ) - return None - - # def get_records_by_ids_as_polars( - # self, - # source_path: tuple[str, ...], - # entry_ids: list[str] | pl.Series | pa.Array, - # add_entry_id_column: bool | str = False, - # preserve_input_order: bool = False, - # flush: bool = False, - # ) -> pl.LazyFrame | None: - # """ - # Retrieve records by entry IDs as a single Polars LazyFrame. - - # Args: - # source_path: Tuple of path components - # entry_ids: Entry IDs to retrieve - # add_entry_id_column: Control entry ID column inclusion - # preserve_input_order: If True, return results in input order with nulls for missing - - # Returns: - # Polars LazyFrame containing all found records, or None if no records found - # """ - # arrow_result = self.get_records_by_ids( - # source_path, - # entry_ids, - # add_entry_id_column, - # preserve_input_order, - # flush=flush, - # ) - - # if arrow_result is None: - # return None - - # # Convert to Polars LazyFrame - # return pl.LazyFrame(arrow_result) - - # Additional utility methods - def list_sources(self) -> list[tuple[str, ...]]: - """ - List all available source paths. 
- - Returns: - List of source path tuples - """ - sources = [] - - def _scan_directory(current_path: Path, path_components: tuple[str, ...]): - """Recursively scan for Delta tables.""" - for item in current_path.iterdir(): - if not item.is_dir(): - continue - - new_path_components = path_components + (item.name,) - - # Check if this directory contains a Delta table - try: - DeltaTable(str(item)) - sources.append(new_path_components) - except TableNotFoundError: - # Not a Delta table, continue scanning subdirectories - if len(new_path_components) < self.max_hierarchy_depth: - _scan_directory(item, new_path_components) - - _scan_directory(self.base_path, ()) - return sources - - def delete_source(self, source_path: tuple[str, ...]) -> bool: - """ - Delete an entire source (all records for a source path). - - Args: - source_path: Tuple of path components - - Returns: - True if source was deleted, False if it didn't exist - """ - self._validate_source_path(source_path) - - # Flush any pending batches first - self.flush_batch(source_path) - - table_path = self._get_table_path(source_path) - source_key = self._get_source_key(source_path) - - if not table_path.exists(): - return False - - try: - # Remove from caches - if source_key in self._delta_table_cache: - del self._delta_table_cache[source_key] - - # Remove directory - import shutil - - shutil.rmtree(table_path) - - logger.info(f"Deleted source {source_key}") - return True - - except Exception as e: - logger.error(f"Error deleting source {source_key}: {e}") - return False - - def delete_record(self, source_path: tuple[str, ...], entry_id: str) -> bool: - """ - Delete a specific record. - - Args: - source_path: Tuple of path components - entry_id: ID of the record to delete - - Returns: - True if record was deleted, False if it didn't exist - """ - self._validate_source_path(source_path) - - # Flush any pending batches first - self.flush_batch(source_path) - - delta_table = self._get_existing_delta_table(source_path) - if delta_table is None: - return False - - try: - # Check if record exists using proper filter - filter_expr = self._create_entry_id_filter(entry_id) - existing = self._read_table_with_filter(delta_table, filters=filter_expr) - if len(existing) == 0: - return False - - # Delete the record using SQL-style predicate (this is correct for delete operations) - delta_table.delete( - f"__entry_id = '{entry_id.replace(chr(39), chr(39) + chr(39))}'" - ) - - # Update cache - source_key = self._get_source_key(source_path) - self._delta_table_cache[source_key] = delta_table - - logger.debug(f"Deleted record {entry_id} from {'/'.join(source_path)}") - return True - - except Exception as e: - logger.error( - f"Error deleting record {entry_id} from {'/'.join(source_path)}: {e}" - ) - return False - - def get_table_info(self, source_path: tuple[str, ...]) -> dict[str, Any] | None: - """ - Get metadata information about a Delta table. 
-
-        Args:
-            source_path: Tuple of path components
-
-        Returns:
-            Dictionary with table metadata, or None if table doesn't exist
-        """
-        self._validate_source_path(source_path)
-
-        delta_table = self._get_existing_delta_table(source_path)
-        if delta_table is None:
-            return None
-
-        try:
-            # Get basic info
-            schema = delta_table.schema()
-            history = delta_table.history()
-            source_key = self._get_source_key(source_path)
-
-            # Add pending batch info
-            pending_info = self.get_pending_batch_info()
-            pending_count = pending_info.get(source_key, 0)
-
-            return {
-                "path": str(self._get_table_path(source_path)),
-                "source_path": source_path,
-                "schema": schema,
-                "version": delta_table.version(),
-                "num_files": len(delta_table.files()),
-                "history_length": len(history),
-                "latest_commit": history[0] if history else None,
-                "pending_records": pending_count,
-            }
-
-        except Exception as e:
-            logger.error(f"Error getting table info for {'/'.join(source_path)}: {e}")
-            return None
diff --git a/src/orcapod/data/old_datagrams.py b/src/orcapod/data/old_datagrams.py
deleted file mode 100644
index a0386c8..0000000
--- a/src/orcapod/data/old_datagrams.py
+++ /dev/null
@@ -1,2281 +0,0 @@
-"""
-Data structures and utilities for working with datagrams in OrcaPod.
-
-This module provides classes and functions for handling packet-like data structures
-that can represent data in various formats (Python dicts, Arrow tables, etc.) while
-maintaining type information, source metadata, and semantic type conversion capability.
-
-Key classes:
-- SemanticConverter: Converts between different data representations. Intended for internal use.
-- DictDatagram: Immutable dict-based data structure
-- PythonDictPacket: Python dict-based packet with source info
-- ArrowPacket: Arrow table-based packet implementation
-- PythonDictTag/ArrowTag: Tag implementations for data identification
-
-The module also provides utilities for schema validation, table operations,
-and type conversions between semantic stores, Python stores, and Arrow tables.
-"""
-
-from hmac import new
-import logging
-from abc import ABC, abstractmethod
-from collections.abc import Collection, Iterator, Mapping
-from types import new_class
-from typing import Self, TypeAlias, cast
-
-from matplotlib.pyplot import arrow
-import pyarrow as pa
-
-from orcapod.data.system_constants import orcapod_constants as constants
-from orcapod.data.context import (
-    DataContext,
-)
-from orcapod.protocols import data_protocols as dp
-from orcapod.protocols import hashing_protocols as hp
-from orcapod.types import TypeSpec, schemas, typespec_utils
-from orcapod.types import typespec_utils as tsutils
-from orcapod.types.core import DataValue
-from orcapod.types.semantic_converter import SemanticConverter
-from orcapod.utils import arrow_utils
-
-logger = logging.getLogger(__name__)
-
-# A convenience packet-like type that defines a value that can be
-# converted to a packet. It's broader than Packet and a simple mapping
-# from string keys to DataValue (e.g., int, float, str) can be regarded
-# as PacketLike, allowing for more flexible interfaces.
-# Anything that requires Packet-like data but without the strict features
-# of a Packet should accept PacketLike.
-# One should be careful when using PacketLike as a return type as it does not
-# enforce the typespec or source_info, which are important for packet integrity.
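To make the intent of the PacketLike alias defined just below concrete, here is a small self-contained sketch. The DataValue union is a simplified stand-in (the real definition lives in orcapod.types.core), and the function and values are illustrative only:

    from collections.abc import Mapping
    from typing import TypeAlias

    DataValue: TypeAlias = int | float | str | bool | None  # simplified stand-in
    PacketLike: TypeAlias = Mapping[str, DataValue]

    def describe(packet: PacketLike) -> str:
        # Read-only consumers can accept any plain mapping rather than a full Packet,
        # at the cost of losing the typespec and source_info guarantees.
        return ", ".join(f"{k}={v!r}" for k, v in packet.items())

    print(describe({"user_id": 123, "name": "Alice"}))  # user_id=123, name='Alice'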
-PacketLike: TypeAlias = Mapping[str, DataValue] - -PythonStore: TypeAlias = Mapping[str, DataValue] - - -class ImmutableDict(Mapping[str, DataValue]): - """ - An immutable dictionary-like container for DataValues. - - Provides a read-only view of a dictionary mapping strings to DataValues, - implementing the Mapping protocol for compatibility with dict-like operations. - - Initialize with data from a mapping. - Args: - data: Source mapping to copy data from - """ - - def __init__(self, data: Mapping[str, DataValue]): - self._data = dict(data) - - def __getitem__(self, key: str) -> DataValue: - return self._data[key] - - def __iter__(self): - return iter(self._data) - - def __len__(self) -> int: - return len(self._data) - - def __repr__(self) -> str: - return self._data.__repr__() - - def __str__(self) -> str: - return self._data.__str__() - - def __or__(self, other: Mapping[str, DataValue]) -> Self: - """ - Create a new ImmutableDict by merging with another mapping. - - Args: - other: Another mapping to merge with - - Returns: - A new ImmutableDict containing the combined data - """ - return self.__class__(self._data | dict(other)) - - -def contains_prefix_from(column: str, prefixes: Collection[str]) -> bool: - """ - Check if a column name matches any of the given prefixes. - - Args: - column: Column name to check - prefixes: Collection of prefixes to match against - - Returns: - True if the column starts with any of the prefixes, False otherwise - """ - for prefix in prefixes: - if column.startswith(prefix): - return True - return False - - -class BaseDatagram(ABC): - """ - Abstract base class for immutable datagram implementations. - - Provides shared functionality and enforces consistent interface across - different storage backends (dict, Arrow table, etc.). Concrete subclasses - must implement the abstract methods to handle their specific storage format. - - The base class only manages the data context key string - how that key - is interpreted and used is left to concrete implementations. - """ - - def __init__(self, data_context: DataContext | str | None = None) -> None: - """ - Initialize base datagram with data context. - - Args: - data_context: Context for semantic interpretation. Can be a string key - or a DataContext object, or None for default. - """ - self._data_context = DataContext.resolve_data_context(data_context) - - # 1. Core Properties (Identity & Structure) - @property - def data_context_key(self) -> str: - """Return the data context key.""" - return self._data_context.context_key - - @property - @abstractmethod - def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names.""" - ... - - # 2. Dict-like Interface (Data Access) - @abstractmethod - def __getitem__(self, key: str) -> DataValue: - """Get data column value by key.""" - ... - - @abstractmethod - def __contains__(self, key: str) -> bool: - """Check if data column exists.""" - ... - - @abstractmethod - def __iter__(self) -> Iterator[str]: - """Iterate over data column names.""" - ... - - @abstractmethod - def get(self, key: str, default: DataValue = None) -> DataValue: - """Get data column value with default.""" - ... - - # 3. Structural Information - @abstractmethod - def keys( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> tuple[str, ...]: - """Return tuple of column names.""" - ... 
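A brief usage note on the ImmutableDict helper defined earlier in this module; the values are illustrative and the snippet assumes the class as shown above:

    base = ImmutableDict({"user_id": 123, "name": "Alice"})
    merged = base | {"name": "Alice Smith", "active": True}

    assert base["name"] == "Alice"           # the original mapping is untouched
    assert merged["name"] == "Alice Smith"   # | returns a new ImmutableDict with the update applied
    # No __setitem__ is defined, so base["name"] = "Bob" raises TypeError.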
- - @abstractmethod - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> TypeSpec: - """Return type specification for the datagram.""" - ... - - @abstractmethod - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Schema: - """Return the PyArrow schema for this datagram.""" - ... - - @abstractmethod - def content_hash(self) -> str: - """Calculate and return content hash of the datagram.""" - ... - - # 4. Format Conversions (Export) - @abstractmethod - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, DataValue]: - """Return dictionary representation of the datagram.""" - ... - - @abstractmethod - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Table: - """Convert the datagram to an Arrow table.""" - ... - - # 5. Meta Column Operations - @abstractmethod - def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: - """Get a meta column value.""" - ... - - @abstractmethod - def with_meta_columns(self, **updates: DataValue) -> Self: - """Create new datagram with updated meta columns.""" - ... - - @abstractmethod - def drop_meta_columns(self, *keys: str) -> Self: - """Create new datagram with specified meta columns removed.""" - ... - - # 6. Data Column Operations - @abstractmethod - def select(self, *column_names: str) -> Self: - """Create new datagram with only specified data columns.""" - ... - - @abstractmethod - def drop(self, *column_names: str) -> Self: - """Create new datagram with specified data columns removed.""" - ... - - @abstractmethod - def rename(self, column_mapping: Mapping[str, str]) -> Self: - """Create new datagram with data columns renamed.""" - ... - - @abstractmethod - def update(self, **updates: DataValue) -> Self: - """Create new datagram with existing column values updated.""" - ... - - @abstractmethod - def with_columns( - self, - column_types: Mapping[str, type] | None = None, - **updates: DataValue, - ) -> Self: - """Create new datagram with additional data columns.""" - ... - - # 7. Context Operations - @abstractmethod - def with_context_key(self, new_context_key: str) -> Self: - """Create new datagram with different data context.""" - ... - - # 8. Utility Operations - @abstractmethod - def copy(self) -> Self: - """Create a shallow copy of the datagram.""" - ... - - -class DictDatagram(BaseDatagram): - """ - Immutable datagram implementation using dictionary as storage backend. - - This implementation uses composition (not inheritance from Mapping) to maintain - control over the interface while leveraging dictionary efficiency for data access. - Provides clean separation between data, meta, and context components. - - The underlying data is split into separate components: - - Data dict: Primary business data columns - - Meta dict: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes - - Context: Data context information with {orcapod.CONTEXT_KEY} - - Future Packet subclass will also handle: - - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes - - When exposing to external tools, semantic types are encoded as - `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). - - All operations return new instances, preserving immutability. - - Example: - >>> data = {{ - ... "user_id": 123, - ... "name": "Alice", - ... 
"__pipeline_version": "v2.1.0", - ... "{orcapod.CONTEXT_KEY}": "financial_v1" - ... }} - >>> datagram = DictDatagram(data) - >>> updated = datagram.update(name="Alice Smith") - """ - - def __init__( - self, - data: Mapping[str, DataValue], - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - """ - Initialize DictDatagram from dictionary data. - - Args: - data: Source data mapping containing all column data. - typespec: Optional type specification for fields. - semantic_converter: Optional converter for semantic type handling. - If None, will be created based on data context and inferred types. - data_context: Data context for semantic type resolution. - If None and data contains context column, will extract from data. - - Note: - The input data is automatically split into data, meta, and context - components based on column naming conventions. - """ - # Parse through data and extract different column types - data_columns = {} - meta_columns = {} - extracted_context = None - - for k, v in data.items(): - if k == constants.CONTEXT_KEY: - # Extract data context but keep it separate from meta data - if data_context is None: - extracted_context = v - # Don't store context in meta_data - it's managed separately - elif k.startswith(constants.META_PREFIX): - # Double underscore = meta metadata - meta_columns[k] = v - else: - # Everything else = user data (including _source_ and semantic types) - data_columns[k] = v - - # Initialize base class with data context - final_context = data_context or cast(str, extracted_context) - super().__init__(final_context) - - # Store data and meta components separately (immutable) - self._data = dict(data_columns) - self._meta_data = dict(meta_columns) - - # Combine provided typespec info with inferred typespec from content - # If the column value is None and no type spec is provided, defaults to str. - self._data_python_schema = schemas.PythonSchema( - tsutils.get_typespec_from_dict( - self._data, - typespec, - ) - ) - - # Create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - self._data_python_schema.to_semantic_schema( - semantic_type_registry=self._data_context.semantic_type_registry - ), - ) - self.semantic_converter = semantic_converter - - # Create schema for meta data - self._meta_python_schema = schemas.PythonSchema( - tsutils.get_typespec_from_dict( - self._meta_data, - typespec=typespec, - ) - ) - - # Initialize caches - self._cached_data_table: pa.Table | None = None - self._cached_meta_table: pa.Table | None = None - self._cached_content_hash: str | None = None - self._cached_data_arrow_schema: pa.Schema | None = None - self._cached_meta_arrow_schema: pa.Schema | None = None - - # 1. Core Properties (Identity & Structure) - @property - def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names.""" - return tuple(self._meta_data.keys()) - - # 2. 
Dict-like Interface (Data Access) - def __getitem__(self, key: str) -> DataValue: - """Get data column value by key.""" - if key not in self._data: - raise KeyError(f"Data column '{key}' not found") - return self._data[key] - - def __contains__(self, key: str) -> bool: - """Check if data column exists.""" - return key in self._data - - def __iter__(self) -> Iterator[str]: - """Iterate over data column names.""" - return iter(self._data) - - def get(self, key: str, default: DataValue = None) -> DataValue: - """Get data column value with default.""" - return self._data.get(key, default) - - # 3. Structural Information - def keys( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> tuple[str, ...]: - """Return tuple of column names.""" - # Start with data columns - result_keys = list(self._data.keys()) - - # Add context if requested - if include_context: - result_keys.append(constants.CONTEXT_KEY) - - # Add meta columns if requested - if include_meta_columns: - if include_meta_columns is True: - result_keys.extend(self.meta_columns) - elif isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - filtered_meta_cols = [ - col - for col in self.meta_columns - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - result_keys.extend(filtered_meta_cols) - - return tuple(result_keys) - - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> schemas.PythonSchema: - """ - Return Python schema for the datagram. - - Args: - include_meta_columns: Whether to include meta column types. - - True: include all meta column types - - Collection[str]: include meta column types matching these prefixes - - False: exclude meta column types - include_context: Whether to include context type - - Returns: - Python schema - """ - # Start with data schema - schema = dict(self._data_python_schema) - - # Add context if requested - if include_context: - schema[constants.CONTEXT_KEY] = str - - # Add meta schema if requested - if include_meta_columns and self._meta_data: - if include_meta_columns is True: - schema.update(self._meta_python_schema) - elif isinstance(include_meta_columns, Collection): - filtered_meta_schema = { - k: v - for k, v in self._meta_python_schema.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - schema.update(filtered_meta_schema) - - return schemas.PythonSchema(schema) - - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_meta_columns: Whether to include meta columns in the schema. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - # Build data schema (cached) - if self._cached_data_arrow_schema is None: - self._cached_data_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self._data_python_schema - ) - ) - - all_schemas = [self._cached_data_arrow_schema] - - # Add context schema if requested - if include_context: - context_schema = pa.schema([pa.field(constants.CONTEXT_KEY, pa.string())]) - all_schemas.append(context_schema) - - # Add meta schema if requested - if include_meta_columns and self._meta_data: - if self._cached_meta_arrow_schema is None: - self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self._meta_python_schema - ) - ) - - assert self._cached_meta_arrow_schema is not None, ( - "Meta Arrow schema should be initialized by now" - ) - - if include_meta_columns is True: - meta_schema = self._cached_meta_arrow_schema - elif isinstance(include_meta_columns, Collection): - # Filter meta schema by prefix matching - matched_fields = [ - field - for field in self._cached_meta_arrow_schema - if any( - field.name.startswith(prefix) for prefix in include_meta_columns - ) - ] - if matched_fields: - meta_schema = pa.schema(matched_fields) - else: - meta_schema = None - else: - meta_schema = None - - if meta_schema is not None: - all_schemas.append(meta_schema) - - return arrow_utils.join_arrow_schemas(*all_schemas) - - def content_hash(self) -> str: - """ - Calculate and return content hash of the datagram. - Only includes data columns, not meta columns or context. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_meta_columns=False, include_context=False), - prefix_hasher_id=True, - ) - return self._cached_content_hash - - # 4. Format Conversions (Export) - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, DataValue]: - """ - Return dictionary representation of the datagram. - - Args: - include_meta_columns: Whether to include meta columns. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context key - - Returns: - Dictionary representation - """ - result_dict = dict(self._data) # Start with user data - - # Add context if requested - if include_context: - result_dict[constants.CONTEXT_KEY] = self._data_context.context_key - - # Add meta columns if requested - if include_meta_columns and self._meta_data: - if include_meta_columns is True: - # Include all meta columns - result_dict.update(self._meta_data) - elif isinstance(include_meta_columns, Collection): - # Include only meta columns matching prefixes - filtered_meta_data = { - k: v - for k, v in self._meta_data.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - result_dict.update(filtered_meta_data) - - return result_dict - - def _get_meta_arrow_table(self) -> pa.Table: - if self._cached_meta_table is None: - arrow_schema = self._get_meta_arrow_schema() - self._cached_meta_table = pa.Table.from_pylist( - [self._meta_data], - schema=arrow_schema, - ) - assert self._cached_meta_table is not None, ( - "Meta Arrow table should be initialized by now" - ) - return self._cached_meta_table - - def _get_meta_arrow_schema(self) -> pa.Schema: - if self._cached_meta_arrow_schema is None: - self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( - self._meta_python_schema - ) - ) - assert self._cached_meta_arrow_schema is not None, ( - "Meta Arrow schema should be initialized by now" - ) - return self._cached_meta_arrow_schema - - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Table: - """ - Convert the datagram to an Arrow table. - - Args: - include_meta_columns: Whether to include meta columns. - - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include the context column - - Returns: - Arrow table representation - """ - # Build data table (cached) - if self._cached_data_table is None: - self._cached_data_table = self.semantic_converter.from_python_to_arrow( - self._data, - self._data_python_schema, - ) - assert self._cached_data_table is not None, ( - "Data Arrow table should be initialized by now" - ) - result_table = self._cached_data_table - - # Add context if requested - if include_context: - result_table = result_table.append_column( - constants.CONTEXT_KEY, - pa.array([self._data_context.context_key], type=pa.large_string()), - ) - - # Add meta columns if requested - meta_table = None - if include_meta_columns and self._meta_data: - meta_table = self._get_meta_arrow_table() - # Select appropriate meta columns - if isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - matched_cols = [ - col - for col in self._meta_data.keys() - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - if matched_cols: - meta_table = meta_table.select(matched_cols) - else: - meta_table = None - - # Combine tables if we have meta columns to add - if meta_table is not None: - result_table = arrow_utils.hstack_tables(result_table, meta_table) - - return result_table - - # 5. Meta Column Operations - def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: - """ - Get meta column value with optional default. 
- - Args: - key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). - default: Value to return if meta column doesn't exist. - - Returns: - Meta column value if exists, otherwise the default value. - """ - # Handle both prefixed and unprefixed keys - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - - return self._meta_data.get(key, default) - - def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": - """ - Create a new DictDatagram with updated meta columns. - Maintains immutability by returning a new instance. - - Args: - **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) - - Returns: - New DictDatagram instance - """ - # Prefix the keys and prepare updates - prefixed_updates = {} - for k, v in meta_updates.items(): - if not k.startswith(constants.META_PREFIX): - k = constants.META_PREFIX + k - prefixed_updates[k] = v - - # Start with existing meta data - new_meta_data = dict(self._meta_data) - new_meta_data.update(prefixed_updates) - - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(new_meta_data) # Meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def drop_meta_columns( - self, *keys: str, ignore_missing: bool = False - ) -> "DictDatagram": - """ - Create a new DictDatagram with specified meta columns dropped. - Maintains immutability by returning a new instance. - - Args: - *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) - ignore_missing: If True, ignore missing meta columns without raising an error. - - Raises: - KeyError: If any specified meta column to drop doesn't exist and ignore_missing=False. - - Returns: - New DictDatagram instance without specified meta columns - """ - # Normalize keys to have prefixes - prefixed_keys = set() - for key in keys: - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - prefixed_keys.add(key) - - missing_keys = prefixed_keys - set(self._meta_data.keys()) - if missing_keys and not ignore_missing: - raise KeyError( - f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" - ) - - # Filter out specified meta columns - new_meta_data = { - k: v for k, v in self._meta_data.items() if k not in prefixed_keys - } - - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(new_meta_data) # Filtered meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - # 6. Data Column Operations - def select(self, *column_names: str) -> "DictDatagram": - """ - Create a new DictDatagram with only specified data columns. - Maintains immutability by returning a new instance. 
- - Args: - *column_names: Data column names to keep - - Returns: - New DictDatagram instance with only specified data columns - """ - # Validate columns exist - missing_cols = set(column_names) - set(self._data.keys()) - if missing_cols: - raise KeyError(f"Columns not found: {missing_cols}") - - # Keep only specified data columns - new_data = {k: v for k, v in self._data.items() if k in column_names} - - # Reconstruct full data dict for new instance - full_data = new_data # Selected user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagram": - """ - Create a new DictDatagram with specified data columns dropped. - Maintains immutability by returning a new instance. - - Args: - *column_names: Data column names to drop - - Returns: - New DictDatagram instance without specified data columns - """ - # Filter out specified data columns - missing = set(column_names) - set(self._data.keys()) - if missing and not ignore_missing: - raise KeyError( - f"Following columns do not exist and cannot be dropped: {sorted(missing)}" - ) - - new_data = {k: v for k, v in self._data.items() if k not in column_names} - - if not new_data: - raise ValueError("Cannot drop all data columns") - - # Reconstruct full data dict for new instance - full_data = new_data # Filtered user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": - """ - Create a new DictDatagram with data columns renamed. - Maintains immutability by returning a new instance. - - Args: - column_mapping: Mapping from old column names to new column names - - Returns: - New DictDatagram instance with renamed data columns - """ - # Rename data columns according to mapping, preserving original types - new_data = {} - for old_name, value in self._data.items(): - new_name = column_mapping.get(old_name, old_name) - new_data[new_name] = value - - # Handle typespec updates for renamed columns - new_typespec = None - if self._data_python_schema: - existing_typespec = dict(self._data_python_schema) - - # Rename types according to column mapping - renamed_typespec = {} - for old_name, old_type in existing_typespec.items(): - new_name = column_mapping.get(old_name, old_name) - renamed_typespec[new_name] = old_type - - new_typespec = renamed_typespec - - # Reconstruct full data dict for new instance - full_data = new_data # Renamed user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - typespec=new_typespec, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def update(self, **updates: DataValue) -> "DictDatagram": - """ - Create a new DictDatagram with existing column values updated. - Maintains immutability by returning a new instance. 
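A short sketch of the update()/with_columns() split described here and in the docstring that follows; the values are invented and the snippet assumes the DictDatagram class from this module:

    dg = DictDatagram({"user_id": 123, "name": "Alice"})

    dg2 = dg.update(name="Alice Smith")      # allowed: "name" already exists
    dg3 = dg2.with_columns(active=True)      # allowed: "active" is a new column

    # dg.update(active=True)       would raise KeyError   -> use with_columns() for new columns
    # dg.with_columns(name="Bob")  would raise ValueError  -> use update() for existing columns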
- - Args: - **updates: Column names and their new values (columns must exist) - - Returns: - New DictDatagram instance with updated values - - Raises: - KeyError: If any column doesn't exist (use with_columns() to add new columns) - """ - if not updates: - return self - - # Error if any column doesn't exist - missing_columns = set(updates.keys()) - set(self._data.keys()) - if missing_columns: - raise KeyError( - f"Columns not found: {sorted(missing_columns)}. " - f"Use with_columns() to add new columns." - ) - - # Update existing columns - new_data = dict(self._data) - new_data.update(updates) - - # Reconstruct full data dict for new instance - full_data = new_data # Updated user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - semantic_converter=self.semantic_converter, # Keep existing converter - data_context=self._data_context, - ) - - def with_columns( - self, - column_types: Mapping[str, type] | None = None, - **updates: DataValue, - ) -> "DictDatagram": - """ - Create a new DictDatagram with new data columns added. - Maintains immutability by returning a new instance. - - Args: - column_updates: New data columns as a mapping - column_types: Optional type specifications for new columns - **kwargs: New data columns as keyword arguments - - Returns: - New DictDatagram instance with new data columns added - - Raises: - ValueError: If any column already exists (use update() instead) - """ - # Combine explicit updates with kwargs - - if not updates: - return self - - # Error if any column already exists - existing_overlaps = set(updates.keys()) & set(self._data.keys()) - if existing_overlaps: - raise ValueError( - f"Columns already exist: {sorted(existing_overlaps)}. " - f"Use update() to modify existing columns." - ) - - # Update user data with new columns - new_data = dict(self._data) - new_data.update(updates) - - # Create updated typespec - handle None values by defaulting to str - typespec = self.types() - if column_types is not None: - typespec.update(column_types) - - new_typespec = tsutils.get_typespec_from_dict( - new_data, - typespec=typespec, - ) - - # Reconstruct full data dict for new instance - full_data = new_data # Updated user data - full_data.update(self._meta_data) # Keep existing meta data - - return DictDatagram( - data=full_data, - typespec=new_typespec, - # semantic converter needs to be rebuilt for new columns - data_context=self._data_context, - ) - - # 7. Context Operations - def with_context_key(self, new_context_key: str) -> "DictDatagram": - """ - Create a new DictDatagram with a different data context key. - Maintains immutability by returning a new instance. - - Args: - new_context_key: New data context key string - - Returns: - New DictDatagram instance with new context - """ - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(self._meta_data) # Meta data - - return DictDatagram( - data=full_data, - data_context=new_context_key, # New context - # Note: semantic_converter will be rebuilt for new context - ) - - # 8. Utility Operations - def copy(self) -> Self: - """ - Create a shallow copy of the datagram. - - Returns a new datagram instance with the same data and cached values. - This is more efficient than reconstructing from scratch when you need - an identical datagram instance. - - Returns: - New DictDatagram instance with copied data and caches. 
- """ - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(self._meta_data) # Meta data - - new_datagram = self.__class__( - full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_data_table = self._cached_data_table - new_datagram._cached_meta_table = self._cached_meta_table - new_datagram._cached_content_hash = self._cached_content_hash - new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema - new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema - - return new_datagram - - # 9. String Representations - def __str__(self) -> str: - """ - Return user-friendly string representation. - - Shows the datagram as a simple dictionary for user-facing output, - messages, and logging. Only includes data columns for clean output. - - Returns: - Dictionary-style string representation of data columns only. - """ - return str(self._data) - - def __repr__(self) -> str: - """ - Return detailed string representation for debugging. - - Shows the datagram type and comprehensive information including - data columns, meta columns count, and context for debugging purposes. - - Returns: - Detailed representation with type and metadata information. - """ - meta_count = len(self.meta_columns) - context_key = self.data_context_key - - return ( - f"DictDatagram(" - f"data={self._data}, " - f"meta_columns={meta_count}, " - f"context='{context_key}'" - f")" - ) - - -class ArrowDatagram(BaseDatagram): - """ - Immutable datagram implementation using PyArrow Table as storage backend. - - This implementation provides high-performance columnar data operations while - maintaining the datagram interface. It efficiently handles type conversions, - semantic processing, and interoperability with Arrow-based tools. - - The underlying table is split into separate components: - - Data table: Primary business data columns - - Meta table: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes - - Context table: Data context information with {orcapod.CONTEXT_KEY} - - Future Packet subclass will also handle: - - Source info: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes - - When exposing to external tools, semantic types are encoded as - `_{semantic_type}_` prefixes (_path_config_file, _id_user_name). - - All operations return new instances, preserving immutability. - - Example: - >>> table = pa.Table.from_pydict({ - ... "user_id": [123], - ... "name": ["Alice"], - ... "__pipeline_version": ["v2.1.0"], - ... "{orcapod.CONTEXT_KEY}": ["financial_v1"] - ... }) - >>> datagram = ArrowDatagram(table) - >>> updated = datagram.update(name="Alice Smith") - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - """ - Initialize ArrowDatagram from PyArrow Table. - - Args: - table: PyArrow Table containing the data. Must have exactly one row. - semantic_converter: Optional converter for semantic type handling. - If None, will be created based on the data context and table schema. - data_context: Context key string or DataContext object. - If None and table contains context column, will extract from table. - - Raises: - ValueError: If table doesn't contain exactly one row. - - Note: - The input table is automatically split into data, meta, and context - components based on column naming conventions. 
- """ - # Validate table has exactly one row for datagram - if len(table) != 1: - raise ValueError( - "Table must contain exactly one row to be a valid datagram." - ) - - # Split table into data, meta, and context components - context_columns = [constants.CONTEXT_KEY] - meta_columns = [ - col for col in table.column_names if col.startswith(constants.META_PREFIX) - ] - - # Extract context table if present - if constants.CONTEXT_KEY in table.column_names and data_context is None: - context_table = table.select([constants.CONTEXT_KEY]) - data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] - - # Initialize base class with data context - super().__init__(data_context) - - # Split table into components - self._data_table = table.drop(context_columns + meta_columns) - self._meta_table = table.select(meta_columns) if meta_columns else None - if len(self._data_table.column_names) == 0: - raise ValueError("Data table must contain at least one data column.") - - # Create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema( - self._data_table.schema, - self._data_context.semantic_type_registry, - ) - ) - self._semantic_converter = semantic_converter - - # Create data context table - data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) - self._data_context_table = pa.Table.from_pylist( - [{constants.CONTEXT_KEY: self._data_context.context_key}], - schema=data_context_schema, - ) - - # Initialize caches - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_python_dict: dict[str, DataValue] | None = None - self._cached_meta_python_schema: schemas.PythonSchema | None = None - self._cached_content_hash: str | None = None - - # 1. Core Properties (Identity & Structure) - @property - def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names.""" - if self._meta_table is None: - return () - return tuple(self._meta_table.column_names) - - # 2. Dict-like Interface (Data Access) - def __getitem__(self, key: str) -> DataValue: - """Get data column value by key.""" - if key not in self._data_table.column_names: - raise KeyError(f"Data column '{key}' not found") - - return self._data_table[key].to_pylist()[0] - - def __contains__(self, key: str) -> bool: - """Check if data column exists.""" - return key in self._data_table.column_names - - def __iter__(self) -> Iterator[str]: - """Iterate over data column names.""" - return iter(self._data_table.column_names) - - def get(self, key: str, default: DataValue = None) -> DataValue: - """Get data column value with default.""" - if key in self._data_table.column_names: - return self.as_dict()[key] - return default - - # 3. 
Structural Information - def keys( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> tuple[str, ...]: - """Return tuple of column names.""" - # Start with data columns - result_keys = list(self._data_table.column_names) - - # Add context if requested - if include_context: - result_keys.append(constants.CONTEXT_KEY) - - # Add meta columns if requested - if include_meta_columns: - if include_meta_columns is True: - result_keys.extend(self.meta_columns) - elif isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - filtered_meta_cols = [ - col - for col in self.meta_columns - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - result_keys.extend(filtered_meta_cols) - - return tuple(result_keys) - - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> schemas.PythonSchema: - """ - Return Python schema for the datagram. - - Args: - include_meta_columns: Whether to include meta column types. - - True: include all meta column types - - Collection[str]: include meta column types matching these prefixes - - False: exclude meta column types - include_context: Whether to include context type - - Returns: - Python schema - """ - # Get data schema (cached) - if self._cached_python_schema is None: - self._cached_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema( - self._data_table.schema - ) - ) - - schema = dict(self._cached_python_schema) - - # Add context if requested - if include_context: - schema[constants.CONTEXT_KEY] = str - - # Add meta schema if requested - if include_meta_columns and self._meta_table is not None: - if self._cached_meta_python_schema is None: - self._cached_meta_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema( - self._meta_table.schema - ) - ) - meta_schema = dict(self._cached_meta_python_schema) - if include_meta_columns is True: - schema.update(meta_schema) - elif isinstance(include_meta_columns, Collection): - filtered_meta_schema = { - k: v - for k, v in meta_schema.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - schema.update(filtered_meta_schema) - - return schemas.PythonSchema(schema) - - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_meta_columns: Whether to include meta columns in the schema. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context column in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - all_schemas = [self._data_table.schema] - - # Add context schema if requested - if include_context: - # TODO: reassess the efficiency of this approach - all_schemas.append(self._data_context_table.schema) - - # Add meta schema if requested - if include_meta_columns and self._meta_table is not None: - if include_meta_columns is True: - meta_schema = self._meta_table.schema - elif isinstance(include_meta_columns, Collection): - # Filter meta schema by prefix matching - matched_fields = [ - field - for field in self._meta_table.schema - if any( - field.name.startswith(prefix) for prefix in include_meta_columns - ) - ] - if matched_fields: - meta_schema = pa.schema(matched_fields) - else: - meta_schema = None - else: - meta_schema = None - - if meta_schema is not None: - all_schemas.append(meta_schema) - - return arrow_utils.join_arrow_schemas(*all_schemas) - - def content_hash(self) -> str: - """ - Calculate and return content hash of the datagram. - Only includes data columns, not meta columns or context. - - Returns: - Hash string of the datagram content - """ - if self._cached_content_hash is None: - self._cached_content_hash = self._data_context.arrow_hasher.hash_table( - self._data_table, - prefix_hasher_id=True, - ) - return self._cached_content_hash - - # 4. Format Conversions (Export) - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, DataValue]: - """ - Return dictionary representation of the datagram. - - Args: - include_meta_columns: Whether to include meta columns. - - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include context key - - Returns: - Dictionary representation - """ - # Get data dict (cached) - if self._cached_python_dict is None: - self._cached_python_dict = self._semantic_converter.from_arrow_to_python( - self._data_table - )[0] - - result_dict = dict(self._cached_python_dict) - - # Add context if requested - if include_context: - result_dict[constants.CONTEXT_KEY] = self._data_context.context_key - - # Add meta data if requested - if include_meta_columns and self._meta_table is not None: - if include_meta_columns is True: - meta_dict = self._meta_table.to_pylist()[0] - elif isinstance(include_meta_columns, Collection): - meta_dict = self._meta_table.to_pylist()[0] - # Include only meta columns matching prefixes - meta_dict = { - k: v - for k, v in meta_dict.items() - if any(k.startswith(prefix) for prefix in include_meta_columns) - } - if meta_dict is not None: - result_dict.update(meta_dict) - - return result_dict - - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> pa.Table: - """ - Convert the datagram to an Arrow table. - - Args: - include_meta_columns: Whether to include meta columns. 
- - True: include all meta columns - - Collection[str]: include meta columns matching these prefixes - - False: exclude meta columns - include_context: Whether to include the context column - - Returns: - Arrow table representation - """ - all_tables = [self._data_table] - - # Add context if requested - if include_context: - all_tables.append(self._data_context_table) - - # Add meta columns if requested - if include_meta_columns and self._meta_table is not None: - meta_table = None - if include_meta_columns is True: - meta_table = self._meta_table - elif isinstance(include_meta_columns, Collection): - # Filter meta columns by prefix matching - matched_cols = [ - col - for col in self._meta_table.column_names - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - if matched_cols: - meta_table = self._meta_table.select(matched_cols) - else: - meta_table = None - - if meta_table is not None: - all_tables.append(meta_table) - - return arrow_utils.hstack_tables(*all_tables) - - # 5. Meta Column Operations - def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: - """ - Get a meta column value. - - Args: - key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix) - default: Default value if not found - - Returns: - Meta column value - """ - if self._meta_table is None: - return default - - # Handle both prefixed and unprefixed keys - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - - if key not in self._meta_table.column_names: - return default - - return self._meta_table[key].to_pylist()[0] - - def with_meta_columns(self, **meta_updates: DataValue) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with updated meta columns. - Maintains immutability by returning a new instance. - - Args: - **meta_updates: Meta column updates (keys will be prefixed with {orcapod.META_PREFIX} ('__') if needed) - - Returns: - New ArrowDatagram instance - """ - # Prefix the keys and prepare updates - prefixed_updates = {} - for k, v in meta_updates.items(): - if not k.startswith(constants.META_PREFIX): - k = constants.META_PREFIX + k - prefixed_updates[k] = v - - # Start with existing meta data - meta_dict = {} - if self._meta_table is not None: - meta_dict = self._meta_table.to_pylist()[0] - - # Apply updates - meta_dict.update(prefixed_updates) - - # Create new meta table - new_meta_table = pa.Table.from_pylist([meta_dict]) if meta_dict else None - - # Combine all tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def drop_meta_columns( - self, *keys: str, ignore_missing: bool = True - ) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with specified meta columns dropped. - Maintains immutability by returning a new instance. 
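To illustrate the meta-column handling shared by with_meta_columns(), get_meta_value(), and drop_meta_columns() (documented below), a sketch with made-up values; it assumes the ArrowDatagram class above with its default data context:

    import pyarrow as pa

    dg = ArrowDatagram(pa.table({"user_id": [123], "name": ["Alice"]}))

    dg2 = dg.with_meta_columns(pipeline_version="v2.1.0")    # stored internally as "__pipeline_version"
    dg2.get_meta_value("pipeline_version")                   # "v2.1.0"; the prefix is added automatically
    dg2.get_meta_value("__pipeline_version")                 # same value with the explicit prefix
    dg3 = dg2.drop_meta_columns("pipeline_version")          # meta column removed again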
- - Args: - *keys: Meta column keys to drop (with or without {orcapod.META_PREFIX} ('__') prefix) - - Returns: - New ArrowDatagram instance without specified meta columns - """ - if self._meta_table is None: - return self # No meta columns to drop - - # Normalize keys to have prefixes - prefixed_keys = set() - for key in keys: - if not key.startswith(constants.META_PREFIX): - key = constants.META_PREFIX + key - prefixed_keys.add(key) - - missing_keys = prefixed_keys - set(self._meta_table.column_names) - if missing_keys and not ignore_missing: - raise KeyError( - f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" - ) - - # Filter meta columns - remaining_cols = [ - col for col in self._meta_table.column_names if col not in prefixed_keys - ] - - # Create new meta table - new_meta_table = ( - self._meta_table.select(remaining_cols) if remaining_cols else None - ) - - # Combine tables for reconstruction - combined_table = self._data_table - if new_meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, new_meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - # 6. Data Column Operations - def select(self, *column_names: str) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with only specified data columns. - Maintains immutability by returning a new instance. - - Args: - *column_names: Data column names to keep - - Returns: - New ArrowDatagram instance with only specified data columns - """ - # Validate columns exist - missing_cols = set(column_names) - set(self._data_table.column_names) - if missing_cols: - raise ValueError(f"Columns not found: {missing_cols}") - - new_data_table = self._data_table.select(list(column_names)) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def drop(self, *column_names: str, ignore_missing: bool = False) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with specified data columns dropped. - Maintains immutability by returning a new instance. - - Args: - *column_names: Data column names to drop - - Returns: - New ArrowDatagram instance without specified data columns - """ - - # Filter out specified data columns - missing = set(column_names) - set(self._data_table.column_names) - if missing and not ignore_missing: - raise KeyError( - f"Following columns do not exist and cannot be dropped: {sorted(missing)}" - ) - - # Filter data columns - remaining_cols = [ - col for col in self._data_table.column_names if col not in column_names - ] - - if not remaining_cols: - raise ValueError("Cannot drop all data columns") - - new_data_table = self._data_table.select(remaining_cols) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def rename(self, column_mapping: Mapping[str, str]) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with data columns renamed. - Maintains immutability by returning a new instance. 
- - Args: - column_mapping: Mapping from old column names to new column names - - Returns: - New ArrowDatagram instance with renamed data columns - """ - # Create new schema with renamed fields, preserving original types - new_fields = [] - for field in self._data_table.schema: - old_name = field.name - new_name = column_mapping.get(old_name, old_name) - new_field = pa.field(new_name, field.type) - new_fields.append(new_field) - - # Create new data table with renamed columns - new_schema = pa.schema(new_fields) - new_data_table = self._data_table.rename_columns( - [column_mapping.get(name, name) for name in self._data_table.column_names] - ).cast(new_schema) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def update(self, **updates: DataValue) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with specific column values updated. - - Args: - **updates: Column names and their new values - - Returns: - New ArrowDatagram instance with updated values - - Raises: - KeyError: If any specified column doesn't exist - - Example: - # Convert relative path to absolute path - updated = datagram.update(file_path="/absolute/path/to/file.txt") - - # Update multiple values - updated = datagram.update(status="processed", file_path="/new/path") - """ - # Only update if there are columns to update - if not updates: - return self - - # Validate all columns exist - missing_cols = set(updates.keys()) - set(self._data_table.column_names) - if missing_cols: - raise KeyError( - f"Only existing columns can be updated. Following columns were not found: {sorted(missing_cols)}" - ) - - updates_typespec = schemas.PythonSchema( - {k: v for k, v in self.types().items() if k in updates} - ) - - update_table = self._semantic_converter.from_python_to_arrow( - updates, updates_typespec - ) - all_tables = [self._data_table.drop(list(updates.keys())), update_table] - - if self._meta_table is not None: - all_tables.append(self._meta_table) - - return ArrowDatagram( - table=arrow_utils.hstack_tables(*all_tables), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def with_columns( - self, - column_types: Mapping[str, type] | None = None, - **updates: DataValue, - ) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with new data columns added. - Maintains immutability by returning a new instance. - - Args: - column_updates: New data columns as a mapping - column_types: Optional type specifications for new columns - **kwargs: New data columns as keyword arguments - - Returns: - New ArrowDatagram instance with new data columns added - - Raises: - ValueError: If any column already exists (use update() instead) - """ - # Combine explicit updates with kwargs - - if not updates: - return self - - # Error if any column already exists - existing_overlaps = set(updates.keys()) & set(self._data_table.column_names) - if existing_overlaps: - raise ValueError( - f"Columns already exist: {sorted(existing_overlaps)}. " - f"Use update() to modify existing columns." 
- ) - - # TODO: consider simplifying this conversion logic - typespec = typespec_utils.get_typespec_from_dict(updates, column_types) - - updates_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_typespec( - typespec, self._data_context.semantic_type_registry - ) - ) - # TODO: cleanup the handling of typespec python schema and various conversion points - new_data_table = updates_converter.from_python_to_arrow(updates, typespec) - - # Combine with meta table for reconstruction - combined_table = new_data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - # prepare the joined converter - total_converter = self._semantic_converter.join(updates_converter) - - return ArrowDatagram( - table=combined_table, - semantic_converter=total_converter, - data_context=self._data_context, - ) - - # 7. Context Operations - def with_context_key(self, new_context_key: str) -> "ArrowDatagram": - """ - Create a new ArrowDatagram with a different data context key. - Maintains immutability by returning a new instance. - - Args: - new_context_key: New data context key string - - Returns: - New ArrowDatagram instance with new context - """ - # Combine all tables for reconstruction - combined_table = self._data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - return ArrowDatagram( - table=combined_table, - data_context=new_context_key, - # Note: semantic_converter will be rebuilt for new context - ) - - # 8. Utility Operations - def copy(self) -> Self: - """Return a copy of the datagram.""" - # Combine all tables for reconstruction - combined_table = self._data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - - new_datagram = self.__class__( - combined_table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_python_schema = self._cached_python_schema - new_datagram._cached_python_dict = self._cached_python_dict - new_datagram._cached_content_hash = self._cached_content_hash - - return new_datagram - - # 9. String Representations - def __str__(self) -> str: - """ - Return user-friendly string representation. - - Shows the datagram as a simple dictionary for user-facing output, - messages, and logging. Only includes data columns for clean output. - - Returns: - Dictionary-style string representation of data columns only. - - Example: - >>> str(datagram) - "{'user_id': 123, 'name': 'Alice'}" - >>> print(datagram) - {'user_id': 123, 'name': 'Alice'} - """ - return str(self.as_dict()) - - def __repr__(self) -> str: - """ - Return detailed string representation for debugging. - - Shows the datagram type and comprehensive information including - data columns, meta columns count, and context for debugging purposes. - - Returns: - Detailed representation with type and metadata information. - - Example: - >>> repr(datagram) - "ArrowDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')" - """ - data_dict = self.as_dict() - meta_count = len(self.meta_columns) - context_key = self.data_context_key - - return ( - f"ArrowDatagram(" - f"data={data_dict}, " - f"meta_columns={meta_count}, " - f"context='{context_key}'" - f")" - ) - - -class DictTag(DictDatagram): - """ - A simple tag implementation using Python dictionary. 
- - Represents a tag (metadata) as a dictionary that can be converted - to different representations like Arrow tables. - """ - - -class DictPacket(DictDatagram): - """ - Enhanced packet implementation with source information support. - - Extends DictDatagram to include source information tracking and - enhanced table conversion capabilities that can include or exclude - source metadata. - - Initialize packet with data and optional source information. - - Args: - data: Primary data content - source_info: Optional mapping of field names to source information - typespec: Optional type specification - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types. Defaults to system default registry. - arrow_hasher: Optional Arrow hasher. Defaults to system default arrow hasher. - """ - - def __init__( - self, - data: Mapping[str, DataValue], - source_info: Mapping[str, str | None] | None = None, - typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - # normalize the data content and remove any source info keys - data_only = { - k: v for k, v in data.items() if not k.startswith(constants.SOURCE_PREFIX) - } - contained_source_info = { - k.removeprefix(constants.SOURCE_PREFIX): v - for k, v in data.items() - if k.startswith(constants.SOURCE_PREFIX) - } - - super().__init__( - data_only, - typespec=typespec, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._source_info = {**contained_source_info, **(source_info or {})} - self._cached_source_info_table: pa.Table | None = None - self._cached_source_info_schema: pa.Schema | None = None - - @property - def _source_info_schema(self) -> pa.Schema: - if self._cached_source_info_schema is None: - self._cached_source_info_schema = pa.schema( - { - f"{constants.SOURCE_PREFIX}{k}": pa.large_string() - for k in self.keys() - } - ) - return self._cached_source_info_schema - - def as_table( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - """Convert the packet to an Arrow table.""" - table = super().as_table( - include_meta_columns=include_meta_columns, - include_context=include_context, - ) - if include_source: - if self._cached_source_info_table is None: - source_info_data = { - f"{constants.SOURCE_PREFIX}{k}": v - for k, v in self.source_info().items() - } - self._cached_source_info_table = pa.Table.from_pylist( - [source_info_data], schema=self._source_info_schema - ) - assert self._cached_source_info_table is not None, ( - "Cached source info table should not be None" - ) - # subselect the corresponding _source_info as the columns present in the data table - source_info_table = self._cached_source_info_table.select( - [ - f"{constants.SOURCE_PREFIX}{k}" - for k in table.column_names - if k in self.keys() - ] - ) - table = arrow_utils.hstack_tables(table, source_info_table) - return table - - def as_dict( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> dict[str, DataValue]: - """ - Return dictionary representation. 
- - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - dict_copy = super().as_dict( - include_meta_columns=include_meta_columns, include_context=include_context - ) - if include_source: - for key, value in self.source_info().items(): - dict_copy[f"{constants.SOURCE_PREFIX}{key}"] = value - return dict_copy - - def types( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types( - include_meta_columns=include_meta_columns, include_context=include_context - ) - if include_source: - for key in self.keys(): - schema[f"{constants.SOURCE_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema( - include_meta_columns=include_meta_columns, include_context=include_context - ) - if include_source: - return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) - return schema - - def as_datagram( - self, - include_meta_columns: bool | Collection[str] = False, - include_source: bool = False, - ) -> DictDatagram: - """ - Convert the packet to a DictDatagram. - - Args: - include_source: Whether to include source info fields - - Returns: - DictDatagram representation of the packet - """ - data = self.as_dict( - include_meta_columns=include_meta_columns, include_source=include_source - ) - typespec = self.types(include_source=include_source) - return DictDatagram( - data, - typespec=typespec, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. - - Returns: - Dictionary mapping field names to their source info - """ - return {key: self._source_info.get(key, None) for key in self.keys()} - - def copy(self) -> Self: - """Return a shallow copy of the packet.""" - instance = super().copy() - instance._source_info = self._source_info.copy() - instance._cached_source_info_table = self._cached_source_info_table - return instance - - -class ArrowTag(ArrowDatagram): - """ - A tag implementation using Arrow table backend. - - Represents a single-row Arrow table that can be converted to Python - dictionary representation while caching computed values for efficiency. - - Initialize with an Arrow table. - - Args: - table: Single-row Arrow table representing the tag - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(table) != 1: - raise ValueError( - "ArrowTag should only contain a single row, " - "as it represents a single tag." - ) - super().__init__( - table=table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - -class ArrowPacket(ArrowDatagram): - """ - Arrow table-based packet implementation with comprehensive features. 
- - A packet implementation that uses Arrow tables as the primary storage format, - providing efficient memory usage and columnar data operations while supporting - source information tracking and content hashing. - - - Initialize ArrowPacket with Arrow table and configuration. - - Args: - table: Single-row Arrow table representing the packet - source_info: Optional source information mapping - semantic_converter: Optional semantic converter - semantic_type_registry: Registry for semantic types - finger_print: Optional fingerprint for tracking - arrow_hasher: Optional Arrow hasher - post_hash_callback: Optional callback after hash calculation - skip_source_info_extraction: Whether to skip source info processing - - Raises: - ValueError: If table doesn't contain exactly one row - """ - - def __init__( - self, - data: pa.Table, - source_info: dict[str, str | None] | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, - ) -> None: - if len(data) != 1: - raise ValueError( - "ArrowPacket should only contain a single row, " - "as it represents a single packet." - ) - if source_info is None: - source_info = {} - - # normalize the table to ensure it has the expected source_info columns - data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( - data, - {constants.SOURCE_PREFIX: source_info}, - exclude_columns=[constants.CONTEXT_KEY], - ) - self._source_info_table = prefixed_tables[constants.SOURCE_INFO_PREFIX] - - super().__init__( - data_table, - semantic_converter=semantic_converter, - data_context=data_context, - ) - - self._cached_source_info: dict[str, str | None] | None = None - self._cached_python_schema: schemas.PythonSchema | None = None - self._cached_content_hash: str | None = None - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - ) -> pa.Table: - table = super().as_table(include_data_context=include_data_context) - if include_source: - # add source_info only for existing data columns - table = arrow_utils.hstack_tables( - table, - self._source_info_table.select( - [ - f"{constants.SOURCE_INFO_PREFIX}{c}" - for c in table.column_names - if c in self.keys() - ] - ), - ) - return table - - def types( - self, include_data_context: bool = False, include_source: bool = False - ) -> schemas.PythonSchema: - """Return copy of the Python schema.""" - schema = super().types(include_data_context=include_data_context) - if include_source: - for key in self.keys(): - schema[f"{constants.SOURCE_INFO_PREFIX}{key}"] = str - return schema - - def arrow_schema( - self, include_data_context: bool = False, include_source: bool = False - ) -> pa.Schema: - """ - Return the PyArrow schema for this datagram. - - Args: - include_data_context: Whether to include data context column in the schema - include_source: Whether to include source info columns in the schema - - Returns: - PyArrow schema representing the datagram's structure - """ - schema = super().arrow_schema(include_data_context=include_data_context) - if include_source: - return arrow_utils.join_arrow_schemas( - schema, self._source_info_table.schema - ) - return schema - - def as_dict( - self, include_data_context: bool = False, include_source: bool = False - ) -> dict[str, DataValue]: - """ - Convert to dictionary representation. 
- - Args: - include_source: Whether to include source info fields - - Returns: - Dictionary representation of the packet - """ - return_dict = super().as_dict(include_data_context=include_data_context) - if include_source: - return_dict.update( - { - f"{constants.SOURCE_INFO_PREFIX}{k}": v - for k, v in self.source_info().items() - } - ) - return return_dict - - def as_datagram(self, include_source: bool = False) -> ArrowDatagram: - table = self.as_table(include_source=include_source) - return ArrowDatagram( - table, - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - - def source_info(self) -> dict[str, str | None]: - """ - Return source information for all keys. - - Returns: - Copy of the dictionary mapping field names to their source info - """ - if self._cached_source_info is None: - self._cached_source_info = { - k.removeprefix(constants.SOURCE_INFO_PREFIX): v - for k, v in self._source_info_table.to_pylist()[0].items() - } - return self._cached_source_info.copy() - - def copy(self) -> Self: - # TODO: restructure copy to allow for better inheritance and expansion - new_packet = self.__class__( - self.as_table(), - self.source_info(), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_packet._cached_source_info = self._cached_source_info - new_packet._cached_python_dict = self._cached_python_dict - new_packet._cached_python_schema = self._cached_python_schema - new_packet._cached_content_hash = self._cached_content_hash - - return new_packet - - -# a batch is a tuple of a tag and a list of packets -Batch: TypeAlias = tuple[dp.Tag, Collection[dp.Packet]] -"""Type alias for a batch: a tuple containing a tag and collection of packets.""" diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index b1b3d1b..f10bb2e 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -5,19 +5,14 @@ from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs from abc import abstractmethod from typing import Any - - -class InputValidationError(Exception): - """ - Exception raised when the inputs are not valid. - This is used to indicate that the inputs do not meet the requirements of the operator. - """ +from orcapod.errors import InputValidationError class Operator(TrackedKernelBase): """ Base class for all operators. Operators are a special type of kernel that can be used to perform operations on streams. + They are defined as a callable that takes a (possibly empty) collection of streams as the input and returns a new stream as output (note that output stream is always singular). """ diff --git a/src/orcapod/errors.py b/src/orcapod/errors.py new file mode 100644 index 0000000..b1566cd --- /dev/null +++ b/src/orcapod/errors.py @@ -0,0 +1,5 @@ +class InputValidationError(Exception): + """ + Exception raised when the inputs are not valid. + This is used to indicate that the inputs do not meet the requirements of the operator. 
+ """ diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index 0cd0722..91b7931 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -2,7 +2,6 @@ from .arrow_hashers import SemanticArrowHasher from orcapod.utils.object_spec import parse_objectspec from orcapod.protocols.hashing_protocols import ObjectHasher -from typing import Any CURRENT_VERSION = "v0.1" diff --git a/src/orcapod/pipeline/__init__.py b/src/orcapod/pipeline/__init__.py index 9a99f89..9d3e0f5 100644 --- a/src/orcapod/pipeline/__init__.py +++ b/src/orcapod/pipeline/__init__.py @@ -1,5 +1,5 @@ -from .pipeline import Pipeline +# from .legacy_pipeline import Pipeline -__all__ = [ - "Pipeline", -] +# __all__ = [ +# "Pipeline", +# ] diff --git a/src/orcapod/pipeline/legacy_pipeline.py b/src/orcapod/pipeline/legacy_pipeline.py index 1fb5236..8c931f7 100644 --- a/src/orcapod/pipeline/legacy_pipeline.py +++ b/src/orcapod/pipeline/legacy_pipeline.py @@ -10,7 +10,7 @@ from orcapod.core import Invocation, Kernel, SyncStream from orcapod.core.pod import FunctionPod -from orcapod.pipeline.nodes import KernelNode, FunctionPodNode, Node +from orcapod.pipeline.legacy_nodes import KernelNode, FunctionPodNode, Node from orcapod.core.tracker import GraphTracker from orcapod.stores import ArrowDataStore diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index d51ead8..0356270 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -41,3 +41,7 @@ def get_records_by_ids( record_ids: Collection[str], record_id_column: str | None = None, ) -> pa.Table: ... + + def flush(self) -> None: + """Flush any buffered writes to the underlying storage.""" + ... diff --git a/src/orcapod/protocols/types.py b/src/orcapod/protocols/types.py deleted file mode 100644 index 73e67f1..0000000 --- a/src/orcapod/protocols/types.py +++ /dev/null @@ -1,51 +0,0 @@ -# from typing import TypeAlias -# from collections.abc import Collection, Mapping -# from pathlib import Path -# import logging -# import os - -# logger = logging.getLogger(__name__) - - -# # class TypeSpec(dict[str, DataType]): -# # def __init__(self, *args, **kwargs): -# # """ -# # TypeSpec is a mapping of parameter names to their types. -# # It can be used to define the expected types of parameters in a function or a pod. 
-# # """ -# # super().__init__(*args, **kwargs) - - -# # Convenience alias for anything pathlike -# PathLike: TypeAlias = str | os.PathLike - -# # an (optional) string or a collection of (optional) string values -# # Note that TagValue can be nested, allowing for an arbitrary depth of nested lists -# TagValue: TypeAlias = int | str | None | Collection["TagValue"] - -# # the top level tag is a mapping from string keys to values that can be a string or -# # an arbitrary depth of nested list of strings or None -# Tag: TypeAlias = Mapping[str, TagValue] - -# # a pathset is a path or an arbitrary depth of nested list of paths -# PathSet: TypeAlias = PathLike | Collection[PathLike | None] - -# # Simple data types that we support (with clear Polars correspondence) -# SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes - -# ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathLike - -# TypeSpec = dict[str, type] # Mapping of parameter names to their types - -# # Extended data values that can be stored in packets -# # Either the original PathSet or one of our supported simple data types -# DataValue: TypeAlias = ( -# PathSet -# | SupportedNativePythonData -# | None -# | Collection["DataValue"] -# | Mapping[str, "DataValue"] -# ) - - -# PacketLike = Mapping[str, DataValue] From 6f8b6909b9162b8ef7efb072d8867b6fcaa68ed4 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:00:39 +0000 Subject: [PATCH 119/224] feat: add lazyloading system --- src/orcapod/hashing/arrow_utils.py | 17 ++-- src/orcapod/utils/lazy_module.py | 155 +++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 src/orcapod/utils/lazy_module.py diff --git a/src/orcapod/hashing/arrow_utils.py b/src/orcapod/hashing/arrow_utils.py index 0d46cd7..7dc565e 100644 --- a/src/orcapod/hashing/arrow_utils.py +++ b/src/orcapod/hashing/arrow_utils.py @@ -1,12 +1,17 @@ -import pyarrow as pa import json import hashlib -from typing import Dict, Any +from typing import Any, TYPE_CHECKING from decimal import Decimal import base64 +from orcapod.utils.lazy_module import LazyModule +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") -def serialize_pyarrow_table_schema(table: pa.Table) -> str: + +def serialize_pyarrow_table_schema(table: "pa.Table") -> str: """ Serialize PyArrow table schema to JSON with Python type names and filtered metadata. @@ -29,7 +34,7 @@ def serialize_pyarrow_table_schema(table: pa.Table) -> str: return json.dumps(schema_info, separators=(",", ":"), sort_keys=True) -def serialize_pyarrow_table(table: pa.Table) -> str: +def serialize_pyarrow_table(table: "pa.Table") -> str: """ Serialize a PyArrow table to a stable JSON string with both schema and data. @@ -74,7 +79,7 @@ def serialize_pyarrow_table(table: pa.Table) -> str: ) -def get_pyarrow_table_hash(table: pa.Table) -> str: +def get_pyarrow_table_hash(table: "pa.Table") -> str: """ Get a stable SHA-256 hash of the table content. @@ -88,7 +93,7 @@ def get_pyarrow_table_hash(table: pa.Table) -> str: return hashlib.sha256(serialized.encode("utf-8")).hexdigest() -def deserialize_to_pyarrow_table(serialized_str: str) -> pa.Table: +def deserialize_to_pyarrow_table(serialized_str: str) -> "pa.Table": """ Deserialize JSON string back to a PyArrow table. 
diff --git a/src/orcapod/utils/lazy_module.py b/src/orcapod/utils/lazy_module.py new file mode 100644 index 0000000..75cf057 --- /dev/null +++ b/src/orcapod/utils/lazy_module.py @@ -0,0 +1,155 @@ +import importlib +from types import ModuleType +from typing import Any, Optional + + +class LazyModule: + """ + A wrapper that lazily loads a module only when its attributes are first accessed. + + Example: + # Instead of: import expensive_module + expensive_module = LazyModule('expensive_module') + + # Module is only loaded when you access something: + result = expensive_module.some_function() # Now it imports + """ + + def __init__(self, module_name: str, package: str | None = None): + """ + Initialize lazy module loader. + + Args: + module_name: Name of the module to import + package: Package for relative imports (same as importlib.import_module) + """ + self._module_name = module_name + self._package = package + self._module: ModuleType | None = None + self._loaded = False + + def _load_module(self) -> ModuleType: + """Load the module if not already loaded.""" + if not self._loaded: + self._module = importlib.import_module(self._module_name, self._package) + self._loaded = True + assert self._module is not None, ( + f"Module '{self._module_name}' could not be loaded. " + "This should not happen if the module exists." + ) + return self._module + + def __getattr__(self, name: str) -> Any: + """Get attribute from the wrapped module, loading it if necessary.""" + if name.startswith("_"): + # Avoid infinite recursion for internal attributes + raise AttributeError( + f"'{self.__class__.__name__}' object has no attribute '{name}'" + ) + + module = self._load_module() + return getattr(module, name) + + def __setattr__(self, name: str, value: Any) -> None: + """Set attribute on the wrapped module or on this instance.""" + if name.startswith("_") or not self._loaded: + # Set on this instance for internal attributes or before loading + super().__setattr__(name, value) + else: + # Set on the wrapped module + setattr(self._load_module(), name, value) + + def __delattr__(self, name: str) -> None: + """Delete attribute from the wrapped module.""" + if name.startswith("_"): + super().__delattr__(name) + else: + delattr(self._load_module(), name) + + def __dir__(self) -> list[str]: + """Return directory of the wrapped module.""" + if self._loaded: + return dir(self._module) + else: + # Return empty list or basic attributes before loading + return [] + + def __repr__(self) -> str: + """String representation.""" + if self._loaded: + return f"" + else: + return f"" + + def __str__(self) -> str: + """String representation.""" + return self.__repr__() + + # Support for callable modules (modules with __call__) + def __call__(self, *args, **kwargs): + """Call the module if it's callable.""" + module = self._load_module() + return module(*args, **kwargs) # type: ignore + + # Support for iteration if the module is iterable + def __iter__(self): + """Iterate over the module if it's iterable.""" + module = self._load_module() + return iter(module) # type: ignore + + def __len__(self): + """Get length of the module if it supports len().""" + module = self._load_module() + return len(module) # type: ignore + + def __getitem__(self, key): + """Get item from the module if it supports indexing.""" + module = self._load_module() + return module[key] # type: ignore + + def __setitem__(self, key, value): + """Set item on the module if it supports item assignment.""" + module = self._load_module() + module[key] = value # type: 
ignore + + def __contains__(self, item): + """Check if item is in the module if it supports 'in' operator.""" + module = self._load_module() + return item in module + + @property + def is_loaded(self) -> bool: + """Check if the module has been loaded.""" + return self._loaded + + @property + def module_name(self) -> str: + """Get the module name.""" + return self._module_name + + def force_load(self) -> ModuleType: + """Force load the module and return it.""" + return self._load_module() + + +# Convenience function for creating lazy modules +def lazy_import(module_name: str, package: Optional[str] = None) -> LazyModule: + """ + Create a lazy module loader. + + Args: + module_name: Name of the module to import + package: Package for relative imports + + Returns: + LazyModule instance that will load the module on first access + + Example: + np = lazy_import('numpy') + pd = lazy_import('pandas') + + # Modules are only imported when you use them: + array = np.array([1, 2, 3]) # numpy imported here + df = pd.DataFrame({'a': [1, 2]}) # pandas imported here + """ + return LazyModule(module_name, package) From 239a45842f809bec909177fc1bb483d532f7afcb Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:01:11 +0000 Subject: [PATCH 120/224] refactor: refine kernel and pod setup --- src/orcapod/data/kernels.py | 76 ++++++++++++++++--- src/orcapod/data/pods.py | 43 ++++++----- src/orcapod/pipeline/graph.py | 134 ++++++++++++++++++++++++++++++++++ src/orcapod/pipeline/nodes.py | 32 ++++++++ 4 files changed, 253 insertions(+), 32 deletions(-) create mode 100644 src/orcapod/pipeline/graph.py create mode 100644 src/orcapod/pipeline/nodes.py diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 58a920f..468b3f1 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -11,9 +11,6 @@ logger = logging.getLogger(__name__) -def get_tracker_manager() -> dp.TrackerManager: ... - - class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): """ Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. @@ -68,7 +65,7 @@ def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ... if self.fixed_input_streams is not None: if len(streams) != 0: raise ValueError( - f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted." + f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted at this point." ) return self.fixed_input_streams return streams @@ -86,13 +83,13 @@ def prepare_output_stream( """ return KernelStream(source=self, upstreams=streams, label=label) - def track_invocation(self, *streams: dp.Stream) -> None: + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: """ Track the invocation of the kernel with the provided streams. This is a convenience method that calls record_kernel_invocation. 
""" if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_kernel_invocation(self, streams) + self._tracker_manager.record_kernel_invocation(self, streams, label=label) def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs @@ -100,7 +97,7 @@ def __call__( processed_streams = self.pre_process_input_streams(*streams) self.validate_inputs(*processed_streams) output_stream = self.prepare_output_stream(*processed_streams, label=label) - self.track_invocation(*processed_streams) + self.track_invocation(*processed_streams, label=label) return output_stream @abstractmethod @@ -111,8 +108,13 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: Subclasses should override this method to provide the kernel with its unique behavior """ + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + processed_streams = self.pre_process_input_streams(*streams) + self.validate_inputs(*processed_streams) + return self.kernel_output_types(*processed_streams) + @abstractmethod - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... def __repr__(self): return self.__class__.__name__ @@ -122,6 +124,9 @@ def __str__(self): return f"{self.__class__.__name__}({self._label})" return self.__class__.__name__ + @abstractmethod + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: ... + def identity_structure(self, *streams: dp.Stream) -> Any: # Default implementation of identity_structure for the kernel only # concerns the kernel class and the streams if present. Subclasses of @@ -137,7 +142,54 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the # equivalence of the two by returning the same identity structure for both invocations. # This can be achieved, for example, by returning a set over the streams instead of a tuple. - logger.warning( - f"Identity structure not implemented for {self.__class__.__name__}" - ) - return (self.__class__.__name__,) + streams + if len(streams) > 0: + streams = self.pre_process_input_streams(*streams) + self.validate_inputs(*streams) + return self.kernel_identity_structure(*streams) + + +class WrappedKernel(TrackedKernelBase): + """ + A wrapper for a kernel that allows it to be used as a stream source. + This is useful for cases where you want to use a kernel as a source of data + in a pipeline or other data processing context. + """ + + def __init__(self, kernel: dp.Kernel, **kwargs) -> None: + # TODO: handle fixed input stream already set on the kernel + super().__init__(**kwargs) + self.kernel = kernel + + @property + def kernel_id(self) -> tuple[str, ...]: + return self.kernel.kernel_id + + def computed_label(self) -> str | None: + """ + Compute a label for this kernel based on its content. + If label is not explicitly set for this kernel and computed_label returns a valid value, + it will be used as label of this kernel. 
+ """ + return self.kernel.label + + def forward(self, *streams: dp.Stream) -> dp.Stream: + return self.kernel.forward(*streams) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + return self.kernel.output_types(*streams) + + def validate_inputs(self, *streams: dp.Stream) -> None: + pass + + def __repr__(self): + return f"WrappedKernel({self.kernel!r})" + + def __str__(self): + return f"WrappedKernel:{self.kernel!s}" + + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: + return self.kernel.identity_structure(*streams) + + +class CachedKernel(WrappedKernel): + pass diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 8662903..10e088e 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -2,7 +2,7 @@ import sys from abc import abstractmethod from collections.abc import Callable, Collection, Iterable, Sequence -from typing import Any, Literal, cast +from typing import Any, Literal, cast, TYPE_CHECKING from orcapod.data.datagrams import ( DictPacket, @@ -22,7 +22,12 @@ from orcapod.types import typespec_utils as tsutils from orcapod.utils import arrow_utils from orcapod.data.system_constants import orcapod_constants as constants -import pyarrow as pa +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") logger = logging.getLogger(__name__) @@ -65,14 +70,12 @@ def __init__( self._active = True self.error_handling = error_handling - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ Return the input and output typespecs for the pod. This is used to validate the input and output streams. """ - input_streams = self.pre_process_input_streams(*streams) - self.validate_inputs(*input_streams) - tag_typespec, _ = input_streams[0].types() + tag_typespec, _ = streams[0].types() return tag_typespec, self.output_packet_types() def is_active(self) -> bool: @@ -124,10 +127,15 @@ def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ... """ # if multiple streams are provided, join them # otherwise, return as is - if self.fixed_input_streams is not None and len(streams) > 0: - output_stream = self._join_streams(*self.fixed_input_streams) - if len(streams) > 0: + if self.fixed_input_streams is not None: + if len(streams) == 0: + output_stream = self._join_streams(*self.fixed_input_streams) + else: restrict_stream = self._join_streams(*streams) + raise NotImplementedError( + f"{self.__class__.__name__} does not support semijoining fixed input streams with additional streams yet. " + "Please implement this functionality in the subclass." 
+ ) # output_stream = SemiJoin()(output_stream, restrict_stream) else: if len(streams) == 0: @@ -144,9 +152,9 @@ def prepare_output_stream( output_stream.label = label return output_stream - def track_invocation(self, *streams: dp.Stream) -> None: + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_pod_invocation(self, streams) + self._tracker_manager.record_pod_invocation(self, streams, label=label) def forward(self, *streams: dp.Stream) -> PodStream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" @@ -289,8 +297,6 @@ def __str__(self) -> str: return f"FunctionPod:{func_sig}" def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | None]: - v: dp.Packet = DictPacket({}) - print(v) if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -330,7 +336,7 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) return tag, output_packet - def identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: # construct identity structure for the function # if function_info_extractor is available, use that but substitute the function_name @@ -355,11 +361,8 @@ def identity_structure(self, *streams: dp.Stream) -> Any: ) # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure - if len(streams) > 0: - # TODO: extract the common handling of input streams - processed_streams = self.pre_process_input_streams(*streams) - self.validate_inputs(*processed_streams) - id_struct += (processed_streams[0],) + if len(streams) != 0: + id_struct += (streams[0],) return id_struct @@ -416,7 +419,7 @@ def output_packet_types(self) -> TypeSpec: def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: return self.pod.call(tag, packet) - def identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure(self, *streams: dp.Stream) -> Any: return self.pod.identity_structure(*streams) def __repr__(self) -> str: diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py new file mode 100644 index 0000000..3266e3b --- /dev/null +++ b/src/orcapod/pipeline/graph.py @@ -0,0 +1,134 @@ +from orcapod.data.trackers import GraphTracker, Invocation +from orcapod.pipeline.nodes import KernelNode, PodNode +from orcapod.data.context import DataContext +from orcapod.protocols import data_protocols as dp +from orcapod.protocols import store_protocols as sp +from typing import Any +from collections.abc import Collection +from orcapod.data.streams import WrappedStream +import logging + + +logger = logging.getLogger(__name__) + + +class Pipeline(GraphTracker): + """ + Represents a pipeline in the system. + This class extends GraphTracker to manage the execution of kernels and pods in a pipeline. 
+ """ + + def __init__( + self, + name: str | tuple[str, ...], + pipeline_store: sp.ArrowDataStore, + results_store: sp.ArrowDataStore | None = None, + tracker_manager: dp.TrackerManager | None = None, + data_context: str | DataContext | None = None, + auto_compile: bool = True, + ): + super().__init__(tracker_manager=tracker_manager, data_context=data_context) + if not isinstance(name, tuple): + name = (name,) + self.name = name + self.pipeline_store_path_prefix = self.name + self.results_store_path_prefix = () + if results_store is None: + if pipeline_store is None: + raise ValueError( + "Either pipeline_store or results_store must be provided" + ) + results_store = pipeline_store + self.results_store_path_prefix = self.name + ("_results",) + self.pipeline_store = pipeline_store + self.results_store = results_store + self.nodes = {} + self.auto_compile = auto_compile + self._dirty = False + self._ordered_nodes = [] # Track order of invocations + + def __exit__(self, exc_type=None, exc_value=None, traceback=None): + """ + Exit the pipeline context, ensuring all nodes are properly closed. + """ + super().__exit__(exc_type, exc_value, traceback) + if self.auto_compile: + self.compile() + + def flush(self) -> None: + self.pipeline_store.flush() + self.results_store.flush() + + def record_kernel_invocation( + self, + kernel: dp.Kernel, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + super().record_kernel_invocation(kernel, upstreams, label) + self._dirty = True + + def record_pod_invocation( + self, + pod: dp.Pod, + upstreams: tuple[dp.Stream, ...], + label: str | None = None, + ) -> None: + super().record_pod_invocation(pod, upstreams, label) + self._dirty = True + + def compile(self) -> None: + import networkx as nx + + invocation_to_stream_lut = {} + G = self.generate_graph() + for invocation in nx.topological_sort(G): + input_streams = [ + invocation_to_stream_lut[parent] for parent in invocation.parents() + ] + node = self.wrap_invocation(invocation, new_input_streams=input_streams) + invocation_to_stream_lut[invocation] = node() + self.nodes[node.label] = node + + def wrap_invocation( + self, + invocation: Invocation, + new_input_streams: Collection[dp.Stream], + ) -> dp.Kernel: + if invocation in self.invocation_to_pod_lut: + pod = self.invocation_to_pod_lut[invocation] + node = PodNode( + pod=pod, fixed_input_streams=new_input_streams, label=invocation.label + ) + else: + node = KernelNode( + kernel=invocation.kernel, + fixed_input_streams=new_input_streams, + label=invocation.label, + ) + return node + + def __getattr__(self, item: str) -> Any: + """Allow direct access to pipeline attributes.""" + if item in self.nodes: + return self.nodes[item] + raise AttributeError(f"Pipeline has no attribute '{item}'") + + def __dir__(self) -> list[str]: + """Return a list of attributes and methods of the pipeline.""" + return list(super().__dir__()) + list(self.nodes.keys()) + + def rename(self, old_name: str, new_name: str) -> None: + """ + Rename a node in the pipeline. + This will update the label and the internal mapping. 
+ """ + if old_name not in self.nodes: + raise KeyError(f"Node '{old_name}' does not exist in the pipeline.") + if new_name in self.nodes: + raise KeyError(f"Node '{new_name}' already exists in the pipeline.") + node = self.nodes[old_name] + del self.nodes[old_name] + node.label = new_name + self.nodes[new_name] = node + logger.info(f"Node '{old_name}' renamed to '{new_name}'") diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py new file mode 100644 index 0000000..13347f6 --- /dev/null +++ b/src/orcapod/pipeline/nodes.py @@ -0,0 +1,32 @@ +from orcapod.data.kernels import WrappedKernel +from orcapod.data.pods import WrappedPod +from orcapod.protocols import data_protocols as dp + + +class KernelNode(WrappedKernel): + """ + A node in the pipeline that represents a kernel. + This node can be used to execute the kernel and process data streams. + """ + + def __init__(self, kernel: dp.Kernel, **kwargs) -> None: + super().__init__(kernel=kernel, **kwargs) + self.kernel = kernel + + def __repr__(self): + return f"KernelNode(kernel={self.kernel!r})" + + def __str__(self): + return f"KernelNode:{self.kernel!s}" + + +class PodNode(WrappedPod): + def __init__(self, pod: dp.Pod, **kwargs) -> None: + super().__init__(pod=pod, **kwargs) + self.pod = pod + + def __repr__(self): + return f"PodNode(pod={self.pod!r})" + + def __str__(self): + return f"PodNode:{self.pod!s}" From 643de2b97b661efde4737d68fd369d210b6406c8 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:01:28 +0000 Subject: [PATCH 121/224] refactor: refine tracker system --- src/orcapod/data/trackers.py | 114 ++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 41 deletions(-) diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 0f6ef94..3cf42a9 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,11 +1,16 @@ +from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.data.context import DataContext from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict from collections.abc import Generator from abc import ABC, abstractmethod -from typing import Any +from typing import Any, TYPE_CHECKING from contextlib import contextmanager +if TYPE_CHECKING: + import networkx as nx + class BasicTrackerManager: def __init__(self) -> None: @@ -115,35 +120,74 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) -class Invocation: +class StubKernel: + def __init__(self, stream: dp.Stream, label: str | None = None) -> None: + """ + A placeholder kernel that does nothing. + This is used to represent a kernel that has no computation. + """ + self.label = label or stream.label + self.stream = stream + + def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: + """ + Forward the stream through the stub kernel. + This is a no-op and simply returns the stream. + """ + return self.stream + + def __call__(self, *args: Any, **kwargs: Any) -> dp.Stream: + return self.forward(*args, **kwargs) + + def identity_structure(self, *streams: dp.Stream) -> Any: + # FIXME: using label as a stop-gap for identity structure + return self.label + + def __hash__(self) -> int: + # TODO: resolve the logic around identity structure on a stream / stub kernel + """ + Hash the StubKernel based on its label and stream. + This is used to uniquely identify the StubKernel in the tracker. 
+ """ + identity_structure = self.identity_structure() + if identity_structure is None: + return hash(self.stream) + return identity_structure + + +class Invocation(LabeledContentIdentifiableBase): def __init__( self, kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...], + upstreams: tuple[dp.Stream, ...] = (), label: str | None = None, ) -> None: """ Represents an invocation of a kernel with its upstream streams. This is used to track the computational graph and the invocations of kernels. """ + super().__init__(label=label) self.kernel = kernel self.upstreams = upstreams - self._label = label def parents(self) -> tuple["Invocation", ...]: parent_invoctions = [] for stream in self.upstreams: if stream.source is not None: parent_invoctions.append(Invocation(stream.source, stream.upstreams)) + else: + source = StubKernel(stream) + parent_invoctions.append(Invocation(source)) + return tuple(parent_invoctions) - @property - def label(self) -> str | None: + def computed_label(self) -> str | None: """ - Return the label of the kernel invocation. - This is used to identify the invocation in the tracker. + Compute a label for this invocation based on its kernel and upstreams. + If label is not explicitly set for this invocation and computed_label returns a valid value, + it will be used as label of this invocation. """ - return self._label or self.kernel.label or self.kernel.__class__.__name__ + return self.kernel.label def identity_structure(self) -> Any: """ @@ -152,6 +196,9 @@ def identity_structure(self) -> Any: """ return self.kernel.identity_structure(*self.upstreams) + def __repr__(self) -> str: + return f"Invocation(kernel={self.kernel}, upstreams={self.upstreams}, label={self.label})" + class GraphTracker(AutoRegisteringContextBasedTracker): """ @@ -164,41 +211,28 @@ class GraphTracker(AutoRegisteringContextBasedTracker): def __init__( self, tracker_manager: dp.TrackerManager | None = None, - object_hasher: hp.ObjectHasher | None = None, + data_context: str | DataContext | None = None, ) -> None: super().__init__(tracker_manager=tracker_manager) - if object_hasher is None: - object_hasher = get_default_object_hasher() - self.object_hasher = object_hasher + self._data_context = DataContext.resolve_data_context(data_context) + # Dictionary to map kernels to the streams they have invoked # This is used to track the computational graph and the invocations of kernels + self.kernel_invocations: set[Invocation] = set() + self.invocation_to_pod_lut: dict[Invocation, dp.Pod] = {} self.id_to_invocation_lut: dict[str, Invocation] = {} self.id_to_label_lut: dict[str, list[str]] = defaultdict(list) self.id_to_pod_lut: dict[str, dp.Pod] = {} - def record(self, stream: dp.Stream) -> None: - assert stream.source is not None, ( - "Stream must have a source kernel when recording." 
- ) - stream_list = self.kernel_to_invoked_stream_lut[stream.source] - if stream not in stream_list: - stream_list.append(stream) - - def _record_kernel_and_get_id( + def _record_kernel_and_get_invocation( self, kernel: dp.Kernel, upstreams: tuple[dp.Stream, ...], label: str | None = None, - ) -> str: + ) -> Invocation: invocation = Invocation(kernel, upstreams, label=label) - invocation_id = self.object_hasher.hash_to_hex(invocation) - if invocation_id not in self.id_to_invocation_lut: - self.id_to_invocation_lut[invocation_id] = invocation - label = label or kernel.label or kernel.__class__.__name__ - existing_labels = self.id_to_label_lut[invocation_id] - if label not in existing_labels: - existing_labels.append(label) - return invocation_id + self.kernel_invocations.add(invocation) + return invocation def record_kernel_invocation( self, @@ -210,7 +244,7 @@ def record_kernel_invocation( Record the output stream of a kernel invocation in the tracker. This is used to track the computational graph and the invocations of kernels. """ - self._record_kernel_and_get_id(kernel, upstreams, label) + self._record_kernel_and_get_invocation(kernel, upstreams, label) def record_pod_invocation( self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None @@ -218,8 +252,8 @@ def record_pod_invocation( """ Record the output stream of a pod invocation in the tracker. """ - invocation_id = self._record_kernel_and_get_id(pod, upstreams, label) - self.id_to_pod_lut[invocation_id] = pod + invocation = self._record_kernel_and_get_invocation(pod, upstreams, label) + self.invocation_to_pod_lut[invocation] = pod def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: """ @@ -229,18 +263,16 @@ def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: self.kernel_to_invoked_stream_lut = defaultdict(list) return recorded_streams - def generate_graph(self): + def generate_graph(self) -> "nx.DiGraph": import networkx as nx G = nx.DiGraph() # Add edges for each invocation - for _, streams in self.kernel_to_invoked_stream_lut.items(): - for stream in streams: - if stream not in G: - G.add_node(stream) - for upstream in stream.upstreams: - G.add_edge(upstream, stream) + for invocation in self.kernel_invocations: + G.add_node(invocation) + for upstream_invocation in invocation.parents(): + G.add_edge(upstream_invocation, invocation) return G # def generate_namemap(self) -> dict[Invocation, str]: From 7f8e3d4d491397db5f6a582ff935cc95e9574ad9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:01:38 +0000 Subject: [PATCH 122/224] feat: add wrapped stream --- src/orcapod/data/streams.py | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index b8ce85d..d0ecce3 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -655,3 +655,56 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: else: for i in range(len(self._cached_output_packets)): yield self._cached_output_packets[i] + + +class WrappedStream(StreamBase): + def __init__( + self, + stream: dp.Stream, + source: dp.Kernel, + input_streams: tuple[dp.Stream, ...], + label: str | None = None, + **kwargs, + ) -> None: + super().__init__(source=source, upstreams=input_streams, label=label, **kwargs) + self._stream = stream + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. 
+ """ + return self._stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + return self._stream.types() + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + """ + Returns the underlying table representation of the stream. + This is useful for converting the stream to a table format. + """ + return self._stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """ + Iterates over the packets in the stream. + Each packet is represented as a tuple of (Tag, Packet). + """ + return self._stream.iter_packets() + + def identity_structure(self) -> Any: + return self._stream.identity_structure() From e0916aad3185284324899c90d25a41d7cb7288bd Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 22 Jul 2025 22:02:09 +0000 Subject: [PATCH 123/224] refactor: use hasher id consistently --- src/orcapod/hashing/arrow_hashers.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 695ffe8..264caad 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -49,11 +49,11 @@ class SemanticArrowHasher: def __init__( self, - hasher_id: str, hash_algorithm: str = "sha256", + semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, chunk_size: int = 8192, + hasher_id: str | None = None, handle_missing: str = "error", - semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, serialization_method: str = "logical", # TODO: consider passing options for serialization method ): @@ -64,6 +64,8 @@ def __init__( chunk_size: Size of chunks to read files in bytes handle_missing: How to handle missing files ('error', 'skip', 'null_hash') """ + if hasher_id is None: + hasher_id = f"semantic_arrow_hasher:{hash_algorithm}:{serialization_method}" self._hasher_id = hasher_id self.chunk_size = chunk_size self.handle_missing = handle_missing @@ -90,7 +92,8 @@ def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: else: raise KeyError(f"No hasher registered for semantic type '{semantic_type}'") - def get_hasher_id(self) -> str: + @property + def hasher_id(self) -> str: return self._hasher_id def register_semantic_hasher(self, semantic_type: str, hasher: SemanticTypeHasher): @@ -113,9 +116,9 @@ def _get_semantic_type(self, field: pa.Field) -> str | None: def _create_hash_column( self, original_column: pa.Array, - hash_algorithm: str, hash_bytes: bytes, original_field: pa.Field, + hash_algorithm: str | None = None, ) -> tuple[pa.Array, pa.Field]: """Create a new column containing the hash bytes.""" # Create array of hash bytes (one hash value repeated for each row) @@ -128,7 +131,7 @@ def _create_hash_column( "semantic_type", "unknown" ) new_metadata["semantic_type"] = "hash" - new_metadata["hash_algorithm"] = hash_algorithm_id + new_metadata["hash_algorithm"] = hash_algorithm or self.hasher_id new_field = pa.field( original_field.name, @@ -156,7 +159,7 @@ def _process_table_columns(self, table: pa.Table) -> pa.Table: # Replace column with hash hash_column, hash_field = self._create_hash_column( - column, hasher.hasher_id, 
hash_bytes, field + column, hash_bytes, field ) new_columns.append(hash_column) new_fields.append(hash_field) @@ -226,7 +229,7 @@ def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: hash_str = hasher.hexdigest() if prefix_hasher_id: - hash_str = f"{self.get_hasher_id()}@{hash_str}" + hash_str = f"{self.hasher_id}@{hash_str}" return hash_str From 1d14975becdd729e4d20cc783309d2d02b41acec Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 23 Jul 2025 06:07:24 +0000 Subject: [PATCH 124/224] refactor: remove fixed stream from kernel and clean up cached pod --- src/orcapod/data/kernels.py | 12 --- src/orcapod/data/pods.py | 173 +++++++++--------------------------- 2 files changed, 43 insertions(+), 142 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 468b3f1..5392cb4 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -25,7 +25,6 @@ class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): def __init__( self, - fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, data_context: str | DataContext | None = None, skip_tracking: bool = False, @@ -39,7 +38,6 @@ def __init__( self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER - self.fixed_input_streams = fixed_input_streams @property def data_context(self) -> DataContext: @@ -62,12 +60,6 @@ def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ... pre-processing step will be tracked separately from the main computation in forward. By default, it returns the input streams unchanged. """ - if self.fixed_input_streams is not None: - if len(streams) != 0: - raise ValueError( - f"{self.__class__.__name__} has fixed input streams. Additional streams cannot be accepted at this point." - ) - return self.fixed_input_streams return streams @abstractmethod @@ -189,7 +181,3 @@ def __str__(self): def kernel_identity_structure(self, *streams: dp.Stream) -> Any: return self.kernel.identity_structure(*streams) - - -class CachedKernel(WrappedKernel): - pass diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 10e088e..bae7c9b 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -1,3 +1,4 @@ +from datetime import datetime, timezone import logging import sys from abc import abstractmethod @@ -61,12 +62,11 @@ def call( def __init__( self, - fixed_input_streams: tuple[dp.Stream, ...] 
| None = None, error_handling: error_handling_options = "raise", label: str | None = None, **kwargs, ) -> None: - super().__init__(fixed_input_streams=fixed_input_streams, label=label, **kwargs) + super().__init__(label=label, **kwargs) self._active = True self.error_handling = error_handling @@ -90,21 +90,6 @@ def set_active(self, active: bool) -> None: """ self._active = active - def validate_inputs(self, *streams: dp.Stream) -> None: - if len(streams) != 1: - raise ValueError( - f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" - ) - input_stream = streams[0] - _, incoming_packet_types = input_stream.types() - if not tsutils.check_typespec_compatibility( - incoming_packet_types, self.input_packet_types() - ): - # TODO: use custom exception type for better error handling - raise ValueError( - f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" - ) - @staticmethod def _join_streams(*streams: dp.Stream) -> dp.Stream: if not streams: @@ -120,31 +105,33 @@ def _join_streams(*streams: dp.Stream) -> dp.Stream: def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ - Prepare the incoming streams for execution in the pod. If fixed_input_streams are present, - they will be used as the input streams and the newly provided streams would be used to - restrict (semijoin) the fixed streams. - Otherwise, the join of the provided streams will be returned. + Prepare the incoming streams for execution in the pod. At least one stream must be present. + If more than one stream is present, the join of the provided streams will be returned. """ # if multiple streams are provided, join them # otherwise, return as is - if self.fixed_input_streams is not None: - if len(streams) == 0: - output_stream = self._join_streams(*self.fixed_input_streams) - else: - restrict_stream = self._join_streams(*streams) - raise NotImplementedError( - f"{self.__class__.__name__} does not support semijoining fixed input streams with additional streams yet. " - "Please implement this functionality in the subclass." 
- ) - # output_stream = SemiJoin()(output_stream, restrict_stream) - else: - if len(streams) == 0: - raise ValueError( - f"{self.__class__.__name__} expects at least one input stream" - ) - output_stream = self._join_streams(*streams) + if len(streams) == 0: + raise ValueError( + f"{self.__class__.__name__} expects at least one input stream" + ) + output_stream = self._join_streams(*streams) return (output_stream,) + def validate_inputs(self, *streams: dp.Stream) -> None: + if len(streams) != 1: + raise ValueError( + f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" + ) + input_stream = streams[0] + _, incoming_packet_types = input_stream.types() + if not tsutils.check_typespec_compatibility( + incoming_packet_types, self.input_packet_types() + ): + # TODO: use custom exception type for better error handling + raise ValueError( + f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" + ) + def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None ) -> dp.LiveStream: @@ -152,10 +139,6 @@ def prepare_output_stream( output_stream.label = label return output_stream - def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: - if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_pod_invocation(self, streams, label=label) - def forward(self, *streams: dp.Stream) -> PodStream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" input_stream = streams[0] @@ -166,6 +149,10 @@ def forward(self, *streams: dp.Stream) -> PodStream: error_handling=cast(error_handling_options, self.error_handling), ) + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_pod_invocation(self, streams, label=label) + def function_pod( output_keys: str | Collection[str] | None = None, @@ -376,7 +363,6 @@ class WrappedPod(ActivatablePodBase): def __init__( self, pod: dp.Pod, - fixed_input_streams: tuple[dp.Stream, ...] | None = None, label: str | None = None, data_context: str | DataContext | None = None, **kwargs, @@ -384,7 +370,6 @@ def __init__( if data_context is None: data_context = pod.data_context_key super().__init__( - fixed_input_streams=fixed_input_streams, label=label, data_context=data_context, **kwargs, @@ -437,19 +422,18 @@ class CachedPod(WrappedPod): # name of the column in the tag store that contains the packet hash PACKET_HASH_COLUMN = f"{constants.META_PREFIX}packet_hash" + DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" def __init__( self, pod: dp.Pod, result_store: ArrowDataStore, - pipeline_store: ArrowDataStore | None, record_path_prefix: tuple[str, ...] 
= (), **kwargs, ): super().__init__(pod, **kwargs) self.record_path_prefix = record_path_prefix self.result_store = result_store - self.pipeline_store = pipeline_store # unset data_context native to the object self.pod_hash = self.data_context.object_hasher.hash_to_hex( @@ -468,66 +452,27 @@ def call( self, tag: dp.Tag, packet: dp.Packet, + skip_record_check: bool = False, skip_recording: bool = False, + overwrite_existing: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: - output_packet = self.get_recorded_output_packet(packet) + output_packet = None + if not skip_record_check: + output_packet = self.get_recorded_output_packet(packet) if output_packet is None: tag, output_packet = self.pod.call(tag, packet) if output_packet is not None and not skip_recording: - self.record_packet(packet, output_packet) + self.record_packet( + packet, output_packet, overwrite_existing=overwrite_existing + ) - if output_packet is not None: - self.add_pipeline_record(tag, input_packet=packet) return tag, output_packet - def add_pipeline_record(self, tag: dp.Tag, input_packet: dp.Packet) -> None: - if self.pipeline_store is None: - # no pipeline store configured, skip recording - return - # combine dp.Tag with packet content hash to compute entry hash - tag_with_hash = tag.as_table().append_column( - self.PACKET_HASH_COLUMN, - pa.array([input_packet.content_hash()], type=pa.large_string()), - ) - entry_id = self.data_context.arrow_hasher.hash_table( - tag_with_hash, prefix_hasher_id=True - ) - - existing_record = self.pipeline_store.get_record_by_id( - self.record_path, - entry_id, - ) - - if existing_record is not None: - # if the record already exists, return it - return - - # no record matching, so construct the full record - - input_packet_info = ( - input_packet.as_table( - include_source=True, - ) - .append_column( - f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", - pa.array([input_packet.data_context_key], type=pa.large_string()), - ) - .drop(input_packet.keys()) - ) - - combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) - - self.pipeline_store.add_record( - self.record_path, - entry_id, - combined_record, - ignore_duplicates=False, - ) - def record_packet( self, input_packet: dp.Packet, output_packet: dp.Packet, + overwrite_existing: bool = False, ignore_duplicates: bool = False, ) -> dp.Packet: """ @@ -539,6 +484,7 @@ def record_packet( self.record_path, input_packet.content_hash(), data_table, + overwrite_existing=overwrite_existing, ignore_duplicates=ignore_duplicates, ) if result_flag is None: @@ -560,41 +506,8 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non if result_table is None: return None - return ArrowPacket(result_table) - - def _get_all_records(self) -> "pa.Table | None": - results = self.result_store.get_all_records( - self.record_path, record_id_column=self.PACKET_HASH_COLUMN - ) - - if self.pipeline_store is None: - raise ValueError( - "Pipeline store is not configured, cannot retrieve tag info" - ) - taginfo = self.pipeline_store.get_all_records( - self.record_path, - ) - - if results is None or taginfo is None: - return None - - tag_columns = [ - c - for c in taginfo.column_names - if not c.startswith(constants.META_PREFIX) - and not c.startswith(constants.SOURCE_PREFIX) - ] - - packet_columns = [ - c for c in results.column_names if c != self.PACKET_HASH_COLUMN - ] - - # TODO: do not hardcode the join keys - joined_info = taginfo.join( - results, - self.PACKET_HASH_COLUMN, - join_type="inner", + # note 
that data context will be loaded from the result store + return ArrowPacket( + result_table, + meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, ) - - joined_info = joined_info.select([*tag_columns, *packet_columns]) - return joined_info From dbb70f953492a5098f870f1bd0fa47049ea37b02 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 23 Jul 2025 06:08:04 +0000 Subject: [PATCH 125/224] refactor: consistent copy logic and ability to specify meta info in constructor --- src/orcapod/data/datagrams/arrow_datagram.py | 39 +++++++++++- .../data/datagrams/arrow_tag_packet.py | 29 +++++---- src/orcapod/data/datagrams/base.py | 4 +- src/orcapod/data/datagrams/dict_datagram.py | 59 ++++++++++--------- src/orcapod/data/datagrams/dict_tag_packet.py | 12 ++-- 5 files changed, 91 insertions(+), 52 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 5eb158c..5ceb3cb 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -52,6 +52,7 @@ class ArrowDatagram(BaseDatagram): def __init__( self, table: pa.Table, + meta_info: Mapping[str, DataValue] | None = None, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -84,9 +85,6 @@ def __init__( if constants.CONTEXT_KEY in table.column_names else [] ) - meta_columns = [ - col for col in table.column_names if col.startswith(constants.META_PREFIX) - ] # Extract context table if present if constants.CONTEXT_KEY in table.column_names and data_context is None: @@ -96,9 +94,13 @@ def __init__( # Initialize base class with data context super().__init__(data_context) + meta_columns = [ + col for col in table.column_names if col.startswith(constants.META_PREFIX) + ] # Split table into components self._data_table = table.drop_columns(context_columns + meta_columns) self._meta_table = table.select(meta_columns) if meta_columns else None + if len(self._data_table.column_names) == 0: raise ValueError("Data table must contain at least one data column.") @@ -112,6 +114,35 @@ def __init__( ) self._semantic_converter = semantic_converter + # process supplemented meta info if provided + if meta_info is not None: + # make sure it has the expected prefixes + meta_info = { + ( + f"{constants.META_PREFIX}{k}" + if not k.startswith(constants.META_PREFIX) + else k + ): v + for k, v in meta_info.items() + } + # Note that meta information cannot contain semantic types + typespec = typespec_utils.get_typespec_from_dict(meta_info) + new_meta_table = self._semantic_converter.from_python_to_arrow( + meta_info, typespec + ) + if self._meta_table is None: + self._meta_table = new_meta_table + else: + # drop any column that will be overwritten by the new meta table + keep_meta_columns = [ + c + for c in self._meta_table.column_names + if c not in new_meta_table.column_names + ] + self._meta_table = arrow_utils.hstack_tables( + self._meta_table.select(keep_meta_columns), new_meta_table + ) + # Create data context table data_context_schema = pa.schema({constants.CONTEXT_KEY: pa.large_string()}) self._data_context_table = pa.Table.from_pylist( @@ -477,6 +508,8 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: # Apply updates meta_dict.update(prefixed_updates) + # TODO: properly handle case where meta data is None (it'll get inferred as NoneType) + # Create new meta table new_datagram._meta_table = ( pa.Table.from_pylist([meta_dict]) if meta_dict else None diff --git 
a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index f776365..503b83e 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -1,5 +1,6 @@ +from hmac import new import logging -from collections.abc import Collection +from collections.abc import Collection, Mapping from typing import Self @@ -81,7 +82,8 @@ class ArrowPacket(ArrowDatagram): def __init__( self, table: pa.Table, - source_info: dict[str, str | None] | None = None, + meta_info: Mapping[str, DataValue] | None = None, + source_info: Mapping[str, str | None] | None = None, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -94,19 +96,21 @@ def __init__( source_info = {} # normalize the table to ensure it has the expected source_info columns + # TODO: use simpler function to ensure source_info columns data_table, prefixed_tables = arrow_utils.prepare_prefixed_columns( table, {constants.SOURCE_PREFIX: source_info}, exclude_columns=[constants.CONTEXT_KEY], exclude_prefixes=[constants.META_PREFIX], ) - self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] super().__init__( data_table, + meta_info=meta_info, semantic_converter=semantic_converter, data_context=data_context, ) + self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] self._cached_source_info: dict[str, str | None] | None = None self._cached_python_schema: schemas.PythonSchema | None = None @@ -252,17 +256,12 @@ def source_info(self) -> dict[str, str | None]: } return self._cached_source_info.copy() - def copy(self) -> Self: - # TODO: restructure copy to allow for better inheritance and expansion - new_packet = self.__class__( - self.as_table(), - self.source_info(), - semantic_converter=self._semantic_converter, - data_context=self._data_context, - ) - new_packet._cached_source_info = self._cached_source_info - new_packet._cached_python_dict = self._cached_python_dict - new_packet._cached_python_schema = self._cached_python_schema - new_packet._cached_content_hash = self._cached_content_hash + # 8. Utility Operations + def copy(self, include_cache: bool = True) -> Self: + """Return a copy of the datagram.""" + new_packet = super().copy(include_cache=include_cache) + + if include_cache: + new_packet._cached_source_info = self._cached_source_info return new_packet diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index f253995..9f6d4a8 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -271,4 +271,6 @@ def with_context_key(self, new_context_key: str) -> Self: # 8. 
Utility Operations def copy(self) -> Self: """Create a shallow copy of the datagram.""" - return object.__new__(self.__class__) + new_datagram = object.__new__(self.__class__) + new_datagram._data_context = self._data_context + return new_datagram diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 5ebd926..9f7088f 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -1,3 +1,4 @@ +from curses import meta import logging from collections.abc import Collection, Iterator, Mapping from typing import Self, cast @@ -54,6 +55,7 @@ def __init__( self, data: Mapping[str, DataValue], typespec: TypeSpec | None = None, + meta_info: Mapping[str, DataValue] | None = None, semantic_converter: SemanticConverter | None = None, data_context: str | DataContext | None = None, ) -> None: @@ -96,7 +98,9 @@ def __init__( # Store data and meta components separately (immutable) self._data = dict(data_columns) - self._meta_data = dict(meta_columns) + if meta_info is not None: + meta_columns.update(meta_info) + self._meta_data = meta_columns # Combine provided typespec info with inferred typespec from content # If the column value is None and no type spec is provided, defaults to str. @@ -114,7 +118,7 @@ def __init__( semantic_type_registry=self._data_context.semantic_type_registry ), ) - self.semantic_converter = semantic_converter + self._semantic_converter = semantic_converter # Create schema for meta data self._meta_python_schema = schemas.PythonSchema( @@ -256,7 +260,7 @@ def arrow_schema( # Build data schema (cached) if self._cached_data_arrow_schema is None: self._cached_data_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( + self._semantic_converter.from_python_to_arrow_schema( self._data_python_schema ) ) @@ -272,7 +276,7 @@ def arrow_schema( if include_meta_columns and self._meta_data: if self._cached_meta_arrow_schema is None: self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( + self._semantic_converter.from_python_to_arrow_schema( self._meta_python_schema ) ) @@ -379,7 +383,7 @@ def _get_meta_arrow_table(self) -> pa.Table: def _get_meta_arrow_schema(self) -> pa.Schema: if self._cached_meta_arrow_schema is None: self._cached_meta_arrow_schema = ( - self.semantic_converter.from_python_to_arrow_schema( + self._semantic_converter.from_python_to_arrow_schema( self._meta_python_schema ) ) @@ -412,7 +416,7 @@ def as_table( # Build data table (cached) if self._cached_data_table is None: - self._cached_data_table = self.semantic_converter.from_python_to_arrow( + self._cached_data_table = self._semantic_converter.from_python_to_arrow( self._data, self._data_python_schema, ) @@ -497,7 +501,7 @@ def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -542,7 +546,7 @@ def drop_meta_columns( return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -572,7 +576,7 @@ def select(self, *column_names: str) -> "DictDatagram": return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -605,7 +609,7 @@ def drop(self, *column_names: str, ignore_missing: bool 
= False) -> "DictDatagra return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -646,7 +650,7 @@ def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": return DictDatagram( data=full_data, typespec=new_typespec, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -685,7 +689,7 @@ def update(self, **updates: DataValue) -> "DictDatagram": return DictDatagram( data=full_data, - semantic_converter=self.semantic_converter, # Keep existing converter + semantic_converter=self._semantic_converter, # Keep existing converter data_context=self._data_context, ) @@ -770,7 +774,7 @@ def with_context_key(self, new_context_key: str) -> "DictDatagram": ) # 8. Utility Operations - def copy(self) -> Self: + def copy(self, include_cache: bool = True) -> Self: """ Create a shallow copy of the datagram. @@ -781,22 +785,19 @@ def copy(self) -> Self: Returns: New DictDatagram instance with copied data and caches. """ - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(self._meta_data) # Meta data - - new_datagram = self.__class__( - full_data, - semantic_converter=self.semantic_converter, - data_context=self._data_context, - ) - - # Copy caches - new_datagram._cached_data_table = self._cached_data_table - new_datagram._cached_meta_table = self._cached_meta_table - new_datagram._cached_content_hash = self._cached_content_hash - new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema - new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + new_datagram = super().copy() + new_datagram._data = self._data.copy() + new_datagram._meta_data = self._meta_data.copy() + new_datagram._data_python_schema = self._data_python_schema.copy() + new_datagram._semantic_converter = self._semantic_converter + new_datagram._meta_python_schema = self._meta_python_schema.copy() + + if include_cache: + new_datagram._cached_data_table = self._cached_data_table + new_datagram._cached_meta_table = self._cached_meta_table + new_datagram._cached_content_hash = self._cached_content_hash + new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema + new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema return new_datagram diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index 92bf6aa..ea9b7fa 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -1,6 +1,7 @@ import logging from collections.abc import Collection, Mapping from typing import Self +from xml.etree.ElementInclude import include import pyarrow as pa @@ -46,6 +47,7 @@ class DictPacket(DictDatagram): def __init__( self, data: Mapping[str, DataValue], + meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, typespec: TypeSpec | None = None, semantic_converter: SemanticConverter | None = None, @@ -64,6 +66,7 @@ def __init__( super().__init__( data_only, typespec=typespec, + meta_info=meta_info, semantic_converter=semantic_converter, data_context=data_context, ) @@ -235,7 +238,7 @@ def as_datagram( return DictDatagram( data, typespec=typespec, - semantic_converter=self.semantic_converter, + semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -248,9 +251,10 @@ def 
source_info(self) -> dict[str, str | None]: """ return {key: self._source_info.get(key, None) for key in self.keys()} - def copy(self) -> Self: + def copy(self, include_cache: bool = True) -> Self: """Return a shallow copy of the packet.""" - instance = super().copy() + instance = super().copy(include_cache=include_cache) instance._source_info = self._source_info.copy() - instance._cached_source_info_table = self._cached_source_info_table + if include_cache: + instance._cached_source_info_table = self._cached_source_info_table return instance From 390e99da1fd96ee088012ea8859145fe7ff317b9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 23 Jul 2025 06:09:29 +0000 Subject: [PATCH 126/224] feat: clean implementation of pipeline nodes --- src/orcapod/pipeline/graph.py | 8 +- src/orcapod/pipeline/nodes.py | 203 ++++++++++++++++++++++- src/orcapod/protocols/store_protocols.py | 2 + src/orcapod/stores/delta_lake_stores.py | 2 + 4 files changed, 207 insertions(+), 8 deletions(-) diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 3266e3b..0a371f5 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -98,12 +98,16 @@ def wrap_invocation( if invocation in self.invocation_to_pod_lut: pod = self.invocation_to_pod_lut[invocation] node = PodNode( - pod=pod, fixed_input_streams=new_input_streams, label=invocation.label + pod=pod, + input_streams=new_input_streams, + pipeline_store=self.pipeline_store, + label=invocation.label, ) else: node = KernelNode( kernel=invocation.kernel, - fixed_input_streams=new_input_streams, + input_streams=new_input_streams, + pipeline_store=self.pipeline_store, label=invocation.label, ) return node diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 13347f6..d9e34da 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,6 +1,18 @@ +from ast import Not +from collections.abc import Collection from orcapod.data.kernels import WrappedKernel -from orcapod.data.pods import WrappedPod +from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp +from orcapod.data.streams import PodStream +from orcapod.utils.lazy_module import LazyModule +from typing import TYPE_CHECKING +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.utils import arrow_utils + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") class KernelNode(WrappedKernel): @@ -9,9 +21,36 @@ class KernelNode(WrappedKernel): This node can be used to execute the kernel and process data streams. """ - def __init__(self, kernel: dp.Kernel, **kwargs) -> None: + def __init__( + self, + kernel: dp.Kernel, + input_streams: Collection[dp.Stream], + pipeline_store: ArrowDataStore, + **kwargs, + ) -> None: super().__init__(kernel=kernel, **kwargs) self.kernel = kernel + self.input_streams = tuple(input_streams) + self.pipeline_store = pipeline_store + self._cached_stream: dp.Stream | None = None + + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + if len(streams) > 0: + raise NotImplementedError( + "At this moment, PodNode does not yet support handling additional input streams." + ) + return super().pre_process_input_streams(*self.input_streams) + + def forward(self, *args, **kwargs) -> dp.Stream: + """ + Forward the data through the kernel and return a PodStream. + This method can be overridden to customize the forwarding behavior. 
+ """ + if self._cached_stream is None: + # TODO: reconsider this logic -- if we were to allow semijoin with inputs in the future + # this caching needs to be done more carefully + self._cached_stream = self.kernel.forward(*args, **kwargs) + return self._cached_stream def __repr__(self): return f"KernelNode(kernel={self.kernel!r})" @@ -20,13 +59,165 @@ def __str__(self): return f"KernelNode:{self.kernel!s}" -class PodNode(WrappedPod): - def __init__(self, pod: dp.Pod, **kwargs) -> None: - super().__init__(pod=pod, **kwargs) - self.pod = pod +class PodNode(CachedPod): + PIPELINE_RESULT_PATH = ("_results",) + + def __init__( + self, + pod: dp.Pod, + input_streams: Collection[dp.Stream], + pipeline_store: ArrowDataStore, + result_store: ArrowDataStore | None = None, + record_path_prefix: tuple[str, ...] = (), + **kwargs, + ) -> None: + self.pipeline_path_prefix = record_path_prefix + if result_store is None: + record_path_prefix += self.PIPELINE_RESULT_PATH + result_store = pipeline_store + super().__init__( + pod=pod, + result_store=result_store, + record_path_prefix=record_path_prefix, + **kwargs, + ) + self.pipeline_store = pipeline_store + self.input_streams = tuple(input_streams) + self._cached_stream: dp.LiveStream | None = None + + @property + def pipeline_path(self) -> tuple[str, ...]: + """ + Return the path to the pipeline run records. + This is used to store the run-associated tag info. + """ + return self.pipeline_path_prefix + self.kernel_id def __repr__(self): return f"PodNode(pod={self.pod!r})" def __str__(self): return f"PodNode:{self.pod!s}" + + def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + if len(streams) > 0: + raise NotImplementedError( + "At this moment, PodNode does not yet support handling additional input streams." + ) + return super().pre_process_input_streams(*self.input_streams) + + def __call__(self, *args, **kwargs) -> dp.LiveStream: + """ + Forward the data through the pod and return a PodStream. + This method can be overridden to customize the forwarding behavior. 
+ """ + if self._cached_stream is None: + self._cached_stream = super().__call__(*args, **kwargs) + return self._cached_stream + + def call( + self, + tag: dp.Tag, + packet: dp.Packet, + skip_record_check: bool = False, + skip_recording: bool = False, + overwrite_existing: bool = False, + ) -> tuple[dp.Tag, dp.Packet | None]: + tag, output_packet = super().call( + tag, + packet, + skip_record_check=skip_record_check, + skip_recording=skip_recording, + overwrite_existing=overwrite_existing, + ) + if output_packet is not None: + retrieved = ( + output_packet.get_meta_value(self.DATA_RETRIEVED_FLAG) is not None + ) + # add pipeline record if the output packet is not None + self.add_pipeline_record(tag, packet, retrieved=retrieved) + return tag, output_packet + + def add_pipeline_record( + self, tag: dp.Tag, input_packet: dp.Packet, retrieved: bool | None = None + ) -> None: + # combine dp.Tag with packet content hash to compute entry hash + tag_with_hash = tag.as_table().append_column( + self.PACKET_HASH_COLUMN, + pa.array([input_packet.content_hash()], type=pa.large_string()), + ) + entry_id = self.data_context.arrow_hasher.hash_table( + tag_with_hash, prefix_hasher_id=True + ) + + existing_record = self.pipeline_store.get_record_by_id( + self.pipeline_path, + entry_id, + ) + + if existing_record is not None: + # if the record already exists, return it + return + + # no record matching, so construct the full record + + input_packet_info = ( + input_packet.as_table( + include_source=True, + ) + .append_column( + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + .append_column( + self.DATA_RETRIEVED_FLAG, + pa.array([retrieved], type=pa.bool_()), + ) + .drop(input_packet.keys()) + ) + + combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) + + self.pipeline_store.add_record( + self.pipeline_path, + entry_id, + combined_record, + ignore_duplicates=False, + ) + + def _get_all_records(self) -> "pa.Table | None": + results = self.result_store.get_all_records( + self.record_path, record_id_column=self.PACKET_HASH_COLUMN + ) + + if self.pipeline_store is None: + raise ValueError( + "Pipeline store is not configured, cannot retrieve tag info" + ) + taginfo = self.pipeline_store.get_all_records( + self.record_path, + ) + + if results is None or taginfo is None: + return None + + tag_columns = [ + c + for c in taginfo.column_names + if not c.startswith(constants.META_PREFIX) + and not c.startswith(constants.SOURCE_PREFIX) + ] + + packet_columns = [ + c for c in results.column_names if c != self.PACKET_HASH_COLUMN + ] + + # TODO: do not hardcode the join keys + joined_info = taginfo.join( + results, + self.PACKET_HASH_COLUMN, + join_type="inner", + ) + + joined_info = joined_info.select([*tag_columns, *packet_columns]) + return joined_info diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index 0356270..4940033 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -10,6 +10,7 @@ def add_record( record_id: str, data: pa.Table, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, ) -> str | None: ... def add_records( @@ -18,6 +19,7 @@ def add_records( records: pa.Table, record_id_column: str | None = None, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, ) -> list[str]: ... 
def get_record_by_id( diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 218c0e0..8dc6a1d 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -395,6 +395,7 @@ def add_record( record_id: str, data: pa.Table, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, force_flush: bool = False, ) -> pa.Table: self._validate_record_path(record_path) @@ -472,6 +473,7 @@ def add_records( records: pa.Table, record_id_column: str | None = None, ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, ) -> list[str]: raise NotImplementedError( "add_records is not implemented in BasicDeltaTableArrowStore yet. " From 2f7946c53743ab58c09806e0b52d1e24d80271de Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 24 Jul 2025 08:59:01 +0000 Subject: [PATCH 127/224] refactor: rename pre-kernel step to be more explicit --- src/orcapod/data/kernels.py | 30 ++++++++++-------- src/orcapod/data/pods.py | 62 +++++++++++++++++-------------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 5392cb4..1cda423 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from collections.abc import Collection from typing import Any from orcapod.protocols import data_protocols as dp import logging @@ -52,13 +53,13 @@ def data_context_key(self) -> str: @abstractmethod def kernel_id(self) -> tuple[str, ...]: ... - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing on the input streams before the main computation. This is useful if you need to modify the input streams or perform any other operations before the main computation. Critically, any Kernel/Pod invocations in the - pre-processing step will be tracked separately from the main computation in forward. - By default, it returns the input streams unchanged. + pre-processing step will be tracked outside of the computation in the kernel. + Default implementation is a no-op, returning the input streams unchanged. """ return streams @@ -86,7 +87,7 @@ def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> Non def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs ) -> dp.LiveStream: - processed_streams = self.pre_process_input_streams(*streams) + processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) output_stream = self.prepare_output_stream(*processed_streams, label=label) self.track_invocation(*processed_streams, label=label) @@ -101,7 +102,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: """ def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - processed_streams = self.pre_process_input_streams(*streams) + processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) return self.kernel_output_types(*processed_streams) @@ -117,9 +118,11 @@ def __str__(self): return self.__class__.__name__ @abstractmethod - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: ... + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: ... 
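# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of PATCH 127. With the signature
# change above, a concrete kernel now reports its identity over an optional
# collection of streams. For a kernel whose result does not depend on argument
# order (K(x, y) == K(y, x)), the comments in identity_structure just below
# suggest returning a set over the streams; a hypothetical subclass
# ("SymmetricMerge", other abstract methods omitted, streams assumed hashable)
# could express that like this:
# ---------------------------------------------------------------------------
from collections.abc import Collection
from typing import Any

from orcapod.data.kernels import TrackedKernelBase


class SymmetricMerge(TrackedKernelBase):
    def kernel_identity_structure(
        self, streams: Collection[Any] | None = None
    ) -> Any:
        structure: tuple[Any, ...] = (self.__class__.__name__,)
        if streams is not None:
            # frozenset makes the structure order-insensitive across inputs,
            # so K(x, y) and K(y, x) hash to the same identity
            structure = structure + (frozenset(streams),)
        return structure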
- def identity_structure(self, *streams: dp.Stream) -> Any: + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: # Default implementation of identity_structure for the kernel only # concerns the kernel class and the streams if present. Subclasses of # Kernels should override this method to provide a more meaningful @@ -134,10 +137,9 @@ def identity_structure(self, *streams: dp.Stream) -> Any: # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the # equivalence of the two by returning the same identity structure for both invocations. # This can be achieved, for example, by returning a set over the streams instead of a tuple. - if len(streams) > 0: - streams = self.pre_process_input_streams(*streams) - self.validate_inputs(*streams) - return self.kernel_identity_structure(*streams) + if streams is not None: + streams = self.pre_kernel_processing(*streams) + return self.kernel_identity_structure(streams) class WrappedKernel(TrackedKernelBase): @@ -179,5 +181,7 @@ def __repr__(self): def __str__(self): return f"WrappedKernel:{self.kernel!s}" - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: - return self.kernel.identity_structure(*streams) + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return self.kernel.identity_structure(streams) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index bae7c9b..a66cfc6 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -10,9 +10,9 @@ ArrowPacket, ) from orcapod.data.context import DataContext -from orcapod.data.kernels import TrackedKernelBase +from orcapod.data.kernels import KernelStream, TrackedKernelBase from orcapod.data.operators import Join -from orcapod.data.streams import PodStream +from orcapod.data.streams import LazyPodResultStream, PodStream from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp @@ -55,11 +55,6 @@ def output_packet_types(self) -> TypeSpec: """ ... - @abstractmethod - def call( - self, tag: dp.Tag, packet: dp.Packet - ) -> tuple[dp.Tag, dp.Packet | None]: ... - def __init__( self, error_handling: error_handling_options = "raise", @@ -103,17 +98,16 @@ def _join_streams(*streams: dp.Stream) -> dp.Stream: joined_stream = Join()(joined_stream, next_stream) return joined_stream - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Prepare the incoming streams for execution in the pod. At least one stream must be present. If more than one stream is present, the join of the provided streams will be returned. 
""" # if multiple streams are provided, join them # otherwise, return as is - if len(streams) == 0: - raise ValueError( - f"{self.__class__.__name__} expects at least one input stream" - ) + if len(streams) <= 1: + return streams + output_stream = self._join_streams(*streams) return (output_stream,) @@ -134,20 +128,17 @@ def validate_inputs(self, *streams: dp.Stream) -> None: def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None - ) -> dp.LiveStream: - output_stream = self.forward(*streams) - output_stream.label = label - return output_stream + ) -> KernelStream: + return KernelStream(source=self, upstreams=streams, label=label) - def forward(self, *streams: dp.Stream) -> PodStream: + def forward(self, *streams: dp.Stream) -> dp.Stream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" - input_stream = streams[0] + return LazyPodResultStream(pod=self, prepared_stream=streams[0]) - return PodStream( - self, - input_stream, - error_handling=cast(error_handling_options, self.error_handling), - ) + @abstractmethod + def call( + self, tag: dp.Tag, packet: dp.Packet + ) -> tuple[dp.Tag, dp.Packet | None]: ... def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: if not self._skip_tracking and self._tracker_manager is not None: @@ -252,10 +243,7 @@ def __init__( @property def kernel_id(self) -> tuple[str, ...]: - return ( - self.function_name, - self.data_context.object_hasher.hash_to_hex(self), - ) + return (self.function_name,) def input_packet_types(self) -> PythonSchema: """ @@ -323,7 +311,9 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) return tag, output_packet - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: # construct identity structure for the function # if function_info_extractor is available, use that but substitute the function_name @@ -348,8 +338,8 @@ def kernel_identity_structure(self, *streams: dp.Stream) -> Any: ) # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure - if len(streams) != 0: - id_struct += (streams[0],) + if streams is not None and len(streams) != 0: + id_struct += tuple(streams) return id_struct @@ -358,6 +348,7 @@ class WrappedPod(ActivatablePodBase): """ A wrapper for an existing pod, allowing for additional functionality or modifications without changing the original pod. This class is meant to serve as a base class for other pods that need to wrap existing pods. + Note that only the call logic is pass through to the wrapped pod, but the forward logic is not. """ def __init__( @@ -401,11 +392,16 @@ def output_packet_types(self) -> TypeSpec: """ return self.pod.output_packet_types() + def validate_inputs(self, *streams: dp.Stream) -> None: + self.pod.validate_inputs(*streams) + def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: return self.pod.call(tag, packet) - def kernel_identity_structure(self, *streams: dp.Stream) -> Any: - return self.pod.identity_structure(*streams) + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return self.pod.identity_structure(streams) def __repr__(self) -> str: return f"WrappedPod({self.pod!r})" @@ -446,7 +442,7 @@ def record_path(self) -> tuple[str, ...]: Return the path to the record in the result store. 
This is used to store the results of the pod. """ - return self.record_path_prefix + self.kernel_id + return self.record_path_prefix + self.kernel_id + (self.pod_hash,) def call( self, From 735299da642c381761cc10d98145d6fd4a6ea137 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 24 Jul 2025 09:40:47 +0000 Subject: [PATCH 128/224] refactor: extract node base class --- src/orcapod/data/trackers.py | 6 +- src/orcapod/pipeline/nodes.py | 189 ++++++++++++++++++++++++---------- 2 files changed, 138 insertions(+), 57 deletions(-) diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 3cf42a9..799334e 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -3,7 +3,7 @@ from orcapod.data.context import DataContext from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict -from collections.abc import Generator +from collections.abc import Generator, Collection from abc import ABC, abstractmethod from typing import Any, TYPE_CHECKING from contextlib import contextmanager @@ -139,7 +139,7 @@ def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: def __call__(self, *args: Any, **kwargs: Any) -> dp.Stream: return self.forward(*args, **kwargs) - def identity_structure(self, *streams: dp.Stream) -> Any: + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: # FIXME: using label as a stop-gap for identity structure return self.label @@ -194,7 +194,7 @@ def identity_structure(self) -> Any: Return a structure that represents the identity of this invocation. This is used to uniquely identify the invocation in the tracker. """ - return self.kernel.identity_structure(*self.upstreams) + return self.kernel.identity_structure(self.upstreams) def __repr__(self) -> str: return f"Invocation(kernel={self.kernel}, upstreams={self.upstreams}, label={self.label})" diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index d9e34da..7372175 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,11 +1,13 @@ from ast import Not -from collections.abc import Collection -from orcapod.data.kernels import WrappedKernel +from collections.abc import Collection, Iterator +from datetime import datetime +from orcapod.data.kernels import WrappedKernel, TrackedKernelBase from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp from orcapod.data.streams import PodStream +from orcapod.types import TypeSpec from orcapod.utils.lazy_module import LazyModule -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from orcapod.data.system_constants import orcapod_constants as constants from orcapod.utils import arrow_utils @@ -15,53 +17,144 @@ pa = LazyModule("pyarrow") -class KernelNode(WrappedKernel): +class Node( + TrackedKernelBase, +): """ - A node in the pipeline that represents a kernel. - This node can be used to execute the kernel and process data streams. + Mixin class for pipeline nodes """ def __init__( self, - kernel: dp.Kernel, input_streams: Collection[dp.Stream], pipeline_store: ArrowDataStore, + pipeline_path_prefix: tuple[str, ...] 
= (), **kwargs, - ) -> None: - super().__init__(kernel=kernel, **kwargs) - self.kernel = kernel + ): + super().__init__(**kwargs) + self._cached_stream: dp.LiveStream | None = None self.input_streams = tuple(input_streams) self.pipeline_store = pipeline_store - self._cached_stream: dp.Stream | None = None + self.pipeline_path_prefix = pipeline_path_prefix + # compute invocation hash - note that empty () is passed into identity_structure to signify + # identity structure of invocation with no input streams + self.invocation_hash = self.data_context.object_hasher.hash_to_hex( + self.identity_structure(()), prefix_hasher_id=True + ) + + @property + def pipeline_path(self) -> tuple[str, ...]: + """ + Return the path to the pipeline run records. + This is used to store the run-associated tag info. + """ + return self.pipeline_path_prefix + self.kernel_id + (self.invocation_hash,) - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def validate_inputs(self, *processed_streams: dp.Stream) -> None: + pass + + def forward(self, *streams: dp.Stream) -> dp.Stream: if len(streams) > 0: raise NotImplementedError( - "At this moment, PodNode does not yet support handling additional input streams." + "At this moment, Node does not yet support handling additional input streams." ) - return super().pre_process_input_streams(*self.input_streams) + # TODO: re-evaluate the use here + # super().validate_inputs(*self.input_streams) + return super().forward(*self.input_streams) - def forward(self, *args, **kwargs) -> dp.Stream: - """ - Forward the data through the kernel and return a PodStream. - This method can be overridden to customize the forwarding behavior. - """ + def __call__(self, *args, **kwargs) -> dp.LiveStream: if self._cached_stream is None: - # TODO: reconsider this logic -- if we were to allow semijoin with inputs in the future - # this caching needs to be done more carefully - self._cached_stream = self.kernel.forward(*args, **kwargs) + self._cached_stream = super().__call__(*args, **kwargs) return self._cached_stream + # properties and methods to act as a dp.Stream + @property + def source(self) -> dp.Kernel | None: + return self + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + return () + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + return self().keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + return self().types() + + @property + def last_modified(self) -> datetime | None: + return self().last_modified + + @property + def is_current(self) -> bool: + return self().is_current + + def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self().__iter__() + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self().iter_packets() + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> "pa.Table": + return self().as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + return self().flow() + + +class KernelNode(Node, WrappedKernel): + """ + A node in the pipeline that represents a kernel. + This node can be used to execute the kernel and process data streams. + """ + + def __init__( + self, + kernel: dp.Kernel, + input_streams: Collection[dp.Stream], + pipeline_store: ArrowDataStore, + pipeline_path_prefix: tuple[str, ...] 
= (), + **kwargs, + ) -> None: + super().__init__( + kernel=kernel, + input_streams=input_streams, + pipeline_store=pipeline_store, + pipeline_path_prefix=pipeline_path_prefix, + **kwargs, + ) + def __repr__(self): return f"KernelNode(kernel={self.kernel!r})" def __str__(self): return f"KernelNode:{self.kernel!s}" + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + """ + Return the identity structure of the node. + This is used to compute the invocation hash. + """ + # construct identity structure from the node's information and the + # contained kernel + if streams is not None and len(streams) > 0: + raise NotImplementedError( + "At this moment, Node does not yet support handling additional input streams." + ) + return self.kernel.identity_structure(self.input_streams) -class PodNode(CachedPod): - PIPELINE_RESULT_PATH = ("_results",) +class PodNode(Node, CachedPod): def __init__( self, pod: dp.Pod, @@ -69,29 +162,20 @@ def __init__( pipeline_store: ArrowDataStore, result_store: ArrowDataStore | None = None, record_path_prefix: tuple[str, ...] = (), + pipeline_path_prefix: tuple[str, ...] = (), **kwargs, ) -> None: - self.pipeline_path_prefix = record_path_prefix - if result_store is None: - record_path_prefix += self.PIPELINE_RESULT_PATH - result_store = pipeline_store super().__init__( pod=pod, result_store=result_store, record_path_prefix=record_path_prefix, + input_streams=input_streams, + pipeline_store=pipeline_store, + pipeline_path_prefix=pipeline_path_prefix, **kwargs, ) self.pipeline_store = pipeline_store - self.input_streams = tuple(input_streams) - self._cached_stream: dp.LiveStream | None = None - - @property - def pipeline_path(self) -> tuple[str, ...]: - """ - Return the path to the pipeline run records. - This is used to store the run-associated tag info. - """ - return self.pipeline_path_prefix + self.kernel_id + # self.input_streams = tuple(input_streams) def __repr__(self): return f"PodNode(pod={self.pod!r})" @@ -99,22 +183,6 @@ def __repr__(self): def __str__(self): return f"PodNode:{self.pod!s}" - def pre_process_input_streams(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: - if len(streams) > 0: - raise NotImplementedError( - "At this moment, PodNode does not yet support handling additional input streams." - ) - return super().pre_process_input_streams(*self.input_streams) - - def __call__(self, *args, **kwargs) -> dp.LiveStream: - """ - Forward the data through the pod and return a PodStream. - This method can be overridden to customize the forwarding behavior. - """ - if self._cached_stream is None: - self._cached_stream = super().__call__(*args, **kwargs) - return self._cached_stream - def call( self, tag: dp.Tag, @@ -221,3 +289,16 @@ def _get_all_records(self) -> "pa.Table | None": joined_info = joined_info.select([*tag_columns, *packet_columns]) return joined_info + + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + """ + Return the identity structure of the node. + This is used to compute the invocation hash. + """ + # construct identity structure from the node's information and the + # contained kernel + if streams is not None and len(streams) > 0: + raise NotImplementedError( + "At this moment, Node does not yet support handling additional input streams." + ) + return self.pod.identity_structure(self.input_streams) From 6780ace2aff9e35a7502280eb258043707a05fed Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 24 Jul 2025 09:42:05 +0000 Subject: [PATCH 129/224] refactor: import cleanup and additional todos --- src/orcapod/data/datagrams/arrow_datagram.py | 25 + src/orcapod/data/streams.py | 569 ++++++++++++++----- src/orcapod/pipeline/graph.py | 4 + src/orcapod/protocols/data_protocols.py | 79 ++- src/orcapod/stores/delta_lake_stores.py | 2 +- src/orcapod/types/semantic_converter.py | 17 + src/orcapod/types/semantic_types.py | 16 + 7 files changed, 565 insertions(+), 147 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 5ceb3cb..c29cf58 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -456,6 +456,31 @@ def as_table( return arrow_utils.hstack_tables(*all_tables) + def as_arrow_compatible_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation compatible with Arrow. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation compatible with Arrow + """ + return self.as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ).to_pylist()[0] + # 5. Meta Column Operations def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: """ diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index d0ecce3..f0178d5 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,14 +1,12 @@ import logging +from pathlib import Path import warnings from abc import ABC, abstractmethod from collections.abc import Collection, Iterator from datetime import datetime, timezone from itertools import repeat -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal -import pyarrow as pa - -from orcapod.data.system_constants import orcapod_constants as constants from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.context import DataContext from orcapod.data.datagrams import ( @@ -16,10 +14,17 @@ ArrowTag, DictTag, ) +from orcapod.data.system_constants import orcapod_constants as constants from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec, schemas from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") # TODO: consider using this instead of making copy of dicts # from types import MappingProxyType @@ -48,6 +53,8 @@ def __init__( self._upstreams = upstreams self._last_modified: datetime | None = None self._set_modified_time() + # note that this is not necessary for Stream protocol, but is provided + # for convenience to resolve semantic types and other context-specific information self._data_context = DataContext.resolve_data_context(data_context) @property @@ -168,143 +175,10 @@ def identity_structure(self) -> Any: """ if self.source is not None: # if the stream is generated by an operation, use the identity structure from the invocation - return self.source.identity_structure(*self.upstreams) + return 
self.source.identity_structure(self.upstreams) return super().identity_structure() -class KernelStream(StreamBase): - """ - Recomputable stream that wraps a streams produced by a kernel to provide - an abstraction over the stream, taking the stream's source and upstreams as the basis of - recomputing the stream. - - This stream is used to represent the output of a kernel invocation. - """ - - def __init__( - self, - output_stream: dp.Stream | None = None, - source: dp.Kernel | None = None, - upstreams: tuple[ - dp.Stream, ... - ] = (), # if provided, this will override the upstreams of the output_stream - **kwargs, - ) -> None: - if (output_stream is None or output_stream.source is None) and source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - if source is None: - if output_stream is None or output_stream.source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - source = output_stream.source - upstreams = upstreams or output_stream.upstreams - - super().__init__(source=source, upstreams=upstreams, **kwargs) - self._cached_stream = output_stream - - def clear_cache(self) -> None: - """ - Clears the cached stream. - This is useful for re-processing the stream with the same kernel. - """ - self._cached_stream = None - self._set_modified_time(invalidate=True) - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.keys() - - def types(self) -> tuple[TypeSpec, TypeSpec]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.types() - - @property - def is_current(self) -> bool: - if self._cached_stream is None or not super().is_current: - status = self.refresh() - if not status: # if it failed to update for whatever reason - return False - return True - - def refresh(self, force: bool = False) -> bool: - updated = False - if force or (self._cached_stream is not None and not super().is_current): - self.clear_cache() - - if self._cached_stream is None: - assert self.source is not None, ( - "Stream source must be set to recompute the stream." - ) - self._cached_stream = self.source.forward(*self.upstreams) - self._set_modified_time() - updated = True - - if self._cached_stream is None: - # TODO: use beter error type - raise ValueError( - "Stream could not be updated. Ensure that the source is valid and upstreams are correct." - ) - - return updated - - def invalidate(self) -> None: - """ - Invalidate the stream, marking it as needing recomputation. - This will clear the cached stream and set the last modified time to None. 
- """ - self.clear_cache() - self._set_modified_time(invalidate=True) - - @property - def last_modified(self) -> datetime | None: - if self._cached_stream is None: - return None - return self._cached_stream.last_modified - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_content_hash: bool | str = False, - ) -> pa.Table: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - return self._cached_stream.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_content_hash=include_content_hash, - ) - - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - yield from self._cached_stream.iter_packets() - - def __repr__(self) -> str: - return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" - - class ImmutableTableStream(StreamBase): """ An immutable stream based on a PyArrow Table. @@ -486,23 +360,287 @@ def __repr__(self) -> str: ) +class KernelStream(StreamBase): + """ + Recomputable stream that wraps a stream produced by a kernel to provide + an abstraction over the stream, taking the stream's source and upstreams as the basis of + recomputing the stream. + + This stream is used to represent the output of a kernel invocation. + """ + + def __init__( + self, + output_stream: dp.Stream | None = None, + source: dp.Kernel | None = None, + upstreams: tuple[ + dp.Stream, ... + ] = (), # if provided, this will override the upstreams of the output_stream + **kwargs, + ) -> None: + if (output_stream is None or output_stream.source is None) and source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + if source is None: + if output_stream is None or output_stream.source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + source = output_stream.source + upstreams = upstreams or output_stream.upstreams + + super().__init__(source=source, upstreams=upstreams, **kwargs) + self._cached_stream = output_stream + + def clear_cache(self) -> None: + """ + Clears the cached stream. + This is useful for re-processing the stream with the same kernel. + """ + self._cached_stream = None + self._set_modified_time(invalidate=True) + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." 
+ ) + return self._cached_stream.types() + + @property + def is_current(self) -> bool: + if self._cached_stream is None or not super().is_current: + status = self.refresh() + if not status: # if it failed to update for whatever reason + return False + return True + + def refresh(self, force: bool = False) -> bool: + updated = False + if force or (self._cached_stream is not None and not super().is_current): + self.clear_cache() + + if self._cached_stream is None: + assert self.source is not None, ( + "Stream source must be set to recompute the stream." + ) + self._cached_stream = self.source.forward(*self.upstreams) + self._set_modified_time() + updated = True + + if self._cached_stream is None: + # TODO: use beter error type + raise ValueError( + "Stream could not be updated. Ensure that the source is valid and upstreams are correct." + ) + + return updated + + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) + + @property + def last_modified(self) -> datetime | None: + if self._cached_stream is None: + return None + return self._cached_stream.last_modified + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.iter_packets() + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" + + +class LazyPodResultStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. + This is what Pod.process() returns - it's static/fixed but efficient. 
+    """
+
+    def __init__(self, pod: dp.Pod, prepared_stream: dp.Stream, **kwargs):
+        super().__init__(source=pod, upstreams=(prepared_stream,), **kwargs)
+        self.pod = pod
+        self.prepared_stream = prepared_stream
+        self._set_modified_time()  # set modified time to when we obtain the iterator
+        # capture the immutable iterator from the prepared stream
+        self._prepared_stream_iterator = prepared_stream.iter_packets()
+
+        # Packet-level caching (mirrors the caching done in PodStream)
+        self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet | None]] = {}
+        # Lazily populated caches used by as_table()
+        self._cached_output_table: pa.Table | None = None
+        self._cached_content_hash_column: pa.Array | None = None
+
+    def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]:
+        if self._prepared_stream_iterator is not None:
+            for i, (tag, packet) in enumerate(self._prepared_stream_iterator):
+                if i in self._cached_output_packets:
+                    # Use cached result
+                    tag, packet = self._cached_output_packets[i]
+                    if packet is not None:
+                        yield tag, packet
+                else:
+                    # Process packet
+                    processed = self.pod.call(tag, packet)
+                    if processed is not None:
+                        # Update shared cache for future iterators (optimization)
+                        self._cached_output_packets[i] = processed
+                        tag, packet = processed
+                        if packet is not None:
+                            yield tag, packet
+
+            # Mark completion by releasing the iterator
+            self._prepared_stream_iterator = None
+        else:
+            # Yield from snapshot of complete cache
+            for i in range(len(self._cached_output_packets)):
+                tag, packet = self._cached_output_packets[i]
+                if packet is not None:
+                    yield tag, packet
+
+    def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]:
+        """
+        Returns the keys of the tag and packet columns in the stream.
+        This is useful for accessing the columns in the stream.
+        """
+
+        tag_keys, _ = self.prepared_stream.keys()
+        packet_keys = tuple(self.pod.output_packet_types().keys())
+        return tag_keys, packet_keys
+
+    def types(self) -> tuple[TypeSpec, TypeSpec]:
+        tag_typespec, _ = self.prepared_stream.types()
+        # TODO: check if copying can be avoided
+        packet_typespec = dict(self.pod.output_packet_types())
+        return tag_typespec, packet_typespec
+
+    def as_table(
+        self,
+        include_data_context: bool = False,
+        include_source: bool = False,
+        include_content_hash: bool | str = False,
+    ) -> pa.Table:
+        if self._cached_output_table is None:
+            all_tags = []
+            all_packets = []
+            tag_schema, packet_schema = None, None
+            for tag, packet in self.iter_packets():
+                if tag_schema is None:
+                    tag_schema = tag.arrow_schema()
+                if packet_schema is None:
+                    packet_schema = packet.arrow_schema(
+                        include_context=True,
+                        include_source=True,
+                    )
+                all_tags.append(tag.as_dict())
+                # FIXME: in-a-pinch conversion from Path to str;
+                # replace with an appropriate semantic converter-based approach!
+                dict_packet = packet.as_dict(include_context=True, include_source=True)
+                for k, v in dict_packet.items():
+                    if isinstance(v, Path):
+                        dict_packet[k] = str(v)
+                all_packets.append(dict_packet)
+
+            # FIXME: this skips the semantic version conversion and thus is not
+            # fully correct!
+            all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema)
+            all_packets: pa.Table = pa.Table.from_pylist(
+                all_packets, schema=packet_schema
+            )
+
+            self._cached_output_table = arrow_utils.hstack_tables(all_tags, all_packets)
+        assert self._cached_output_table is not None, (
+            "_cached_output_table should not be None here."
+ ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + + output_table = self._cached_output_table.drop(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + return output_table + + class PodStream(StreamBase): def __init__( self, pod: dp.Pod, - input_stream: dp.Stream, + input_streams: tuple[dp.Stream, ...], error_handling: Literal["raise", "ignore", "warn"] = "raise", **kwargs, ) -> None: - super().__init__(upstreams=(input_stream,), **kwargs) + super().__init__(upstreams=input_streams, **kwargs) self.pod = pod - self.input_stream = input_stream + self.input_streams = input_streams self.error_handling = error_handling self._source = pod # Cache for processed packets # This is a dictionary mapping the index of the packet in the input stream to a tuple of (Tag, Packet) # This allows us to efficiently access the processed packets without re-processing them + self._cached_forward_stream: dp.Stream | None = None self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet]] = {} self._computation_complete: bool = False self._cached_output_table: pa.Table | None = None @@ -516,18 +654,27 @@ def source(self) -> dp.Pod | None: """ return self._source + def forward_stream(self) -> dp.Stream: + if self._cached_forward_stream is None: + self._cached_forward_stream = self.pod.forward(*self.input_streams) + return self._cached_forward_stream + + @property + def is_current(self) -> bool: + return self.forward_stream().is_current + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - tag_keys, _ = self.input_stream.keys() + tag_keys, _ = self.forward_stream().keys() packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys def types(self) -> tuple[TypeSpec, TypeSpec]: - tag_typespec, _ = self.input_stream.types() + tag_typespec, _ = self.forward_stream().types() # TODO: check if copying can be avoided packet_typespec = dict(self.pod.output_packet_types()) return tag_typespec, packet_typespec @@ -537,6 +684,7 @@ def clear_cache(self) -> None: Clears the cached results of the processed stream. This is useful for re-processing the stream with the same processor. 
""" + self._cached_forward_stream = None self._cached_output_packets = {} self._computation_complete = False self._cached_output_table = None @@ -624,7 +772,7 @@ def as_table( def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() if not self._computation_complete or self._cached_output_packets is None: - for i, (tag, packet) in enumerate(self.input_stream.iter_packets()): + for i, (tag, packet) in enumerate(self.forward_stream().iter_packets()): if i not in self._cached_output_packets: try: processed_tag, processed_packet = self.pod.call(tag, packet) @@ -708,3 +856,136 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: def identity_structure(self) -> Any: return self._stream.identity_structure() + + +class InvokedPodStream(StreamBase): + """ + Recomputable stream that wraps a streams produced by a kernel to provide + an abstraction over the stream, taking the stream's source and upstreams as the basis of + recomputing the stream. + + This stream is used to represent the output of a kernel invocation. + """ + + def __init__( + self, + pod_stream: PodStream | None = None, + source: dp.Pod | None = None, + upstreams: tuple[ + dp.Stream, ... + ] = (), # if provided, this will override the upstreams of the output_stream + **kwargs, + ) -> None: + if (pod_stream is None or output_stream.source is None) and source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + if source is None: + if output_stream is None or output_stream.source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + source = output_stream.source + upstreams = upstreams or output_stream.upstreams + + super().__init__(source=source, upstreams=upstreams, **kwargs) + self._cached_stream = output_stream + + def clear_cache(self) -> None: + """ + Clears the cached stream. + This is useful for re-processing the stream with the same kernel. + """ + self._cached_stream = None + self._set_modified_time(invalidate=True) + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + self.refresh() + assert self._cached_stream is not None, ( + "_cached_stream should not be None here." + ) + return self._cached_stream.types() + + @property + def is_current(self) -> bool: + if self._cached_stream is None or not super().is_current: + status = self.refresh() + if not status: # if it failed to update for whatever reason + return False + return True + + def refresh(self, force: bool = False) -> bool: + updated = False + if force or (self._cached_stream is not None and not super().is_current): + self.clear_cache() + + if self._cached_stream is None: + assert self.source is not None, ( + "Stream source must be set to recompute the stream." 
+ ) + self._cached_stream = self.source.forward(*self.upstreams) + self._set_modified_time() + updated = True + + if self._cached_stream is None: + # TODO: use beter error type + raise ValueError( + "Stream could not be updated. Ensure that the source is valid and upstreams are correct." + ) + + return updated + + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) + + @property + def last_modified(self) -> datetime | None: + if self._cached_stream is None: + return None + return self._cached_stream.last_modified + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + yield from self._cached_stream.iter_packets() + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 0a371f5..0ba9bf8 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -100,7 +100,10 @@ def wrap_invocation( node = PodNode( pod=pod, input_streams=new_input_streams, + result_store=self.results_store, + record_path_prefix=self.results_store_path_prefix, pipeline_store=self.pipeline_store, + pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) else: @@ -108,6 +111,7 @@ def wrap_invocation( kernel=invocation.kernel, input_streams=new_input_streams, pipeline_store=self.pipeline_store, + pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) return node diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 968d70e..cd21645 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -320,6 +320,46 @@ def as_table( """ ... + # TODO: add this back + # def as_arrow_compatible_dict( + # self, + # include_all_info: bool = False, + # include_meta_columns: bool | Collection[str] = False, + # include_context: bool = False, + # ) -> dict[str, Any]: + # """ + # Return dictionary with values optimized for Arrow table conversion. + + # This method returns a dictionary where values are in a form that can be + # efficiently converted to Arrow format using pa.Table.from_pylist(). + + # The key insight is that this avoids the expensive as_table() → concat pattern + # by providing values that are "Arrow-ready" while remaining in dict format + # for efficient batching. + + # Implementation note: This may involve format conversions (e.g., Path objects + # to strings, datetime objects to ISO strings, etc.) to ensure compatibility + # with Arrow's expected input formats. + + # Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied + # with arrow_schema(...) with the same argument options to ensure that the schema matches the table. 
+ + # Args: + # include_all_info: Include all available information + # include_meta_columns: Controls meta column inclusion + # include_context: Whether to include context key + + # Returns: + # Dictionary with values optimized for Arrow conversion + + # Example: + # # Efficient batch conversion pattern + # arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams] + # schema = datagrams[0].arrow_schema() + # table = pa.Table.from_pylist(arrow_dicts, schema=schema) + # """ + # ... + # 5. Meta Column Operations def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: """ @@ -789,6 +829,17 @@ def as_table( """ ... + # TODO: add this back + # def as_arrow_compatible_dict( + # self, + # include_all_info: bool = False, + # include_meta_columns: bool | Collection[str] = False, + # include_context: bool = False, + # include_source: bool = False, + # ) -> dict[str, Any]: + # """Extended version with source info support.""" + # ... + def as_datagram( self, include_all_info: bool = False, @@ -1034,6 +1085,15 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: Provides a more explicit method name when the intent is to iterate over packets specifically, improving code readability. + This method must return an immutable iterator -- that is, the returned iterator + should not change and must consistently return identical tag,packet pairs across + multiple iterations of the iterator. + + Note that this is NOT to mean that multiple invocation of `iter_packets` must always + return an identical iterator. The iterator returned by `iter_packets` may change + between invocations, but the iterator itself must not change. Consequently, it should be understood + that the returned iterators may be a burden on memory if the stream is large or infinite. + Yields: tuple[Tag, Packet]: Sequential (tag, packet) pairs """ @@ -1061,6 +1121,19 @@ def as_table( """ ... + def flow(self) -> Collection[tuple[Tag, Packet]]: + """ + Return the entire stream as a collection of (tag, packet) pairs. + + This method materializes the stream content into a list or similar + collection type. It is useful for small streams or when you need + to process all data at once. + + Returns: + Collection[tuple[Tag, Packet]]: All (tag, packet) pairs in the stream + """ + ... + class LiveStream(Stream, Protocol): """ @@ -1293,7 +1366,7 @@ def validate_inputs(self, *streams: Stream) -> None: """ ... - def identity_structure(self, *streams: Stream) -> Any: + def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: """ Generate a unique identity structure for this kernel and/or kernel invocation. When invoked without streams, it should return a structure @@ -1307,7 +1380,9 @@ def identity_structure(self, *streams: Stream) -> Any: - Tracking kernel invocations in computational graphs Args: - *streams: Optional input streams for this invocation + streams: Optional input streams for this invocation. If None, identity_structure is + based solely on the kernel. If streams are provided, they are included in the identity + to differentiate between different invocations of the same kernel. 
Returns: Any: Unique identity structure (e.g., tuple of class name and stream identities) diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 8dc6a1d..213ea3e 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -220,7 +220,7 @@ def _validate_record_path(self, record_path: tuple[str, ...]) -> None: unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] if any(char in component for char in unsafe_chars): raise ValueError( - f"Source path component contains invalid characters: {repr(component)}" + f"Source path {record_path} component {component} contains invalid characters: {repr(component)}" ) def _get_source_key(self, record_path: tuple[str, ...]) -> str: diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py index 889d8a2..047ad2c 100644 --- a/src/orcapod/types/semantic_converter.py +++ b/src/orcapod/types/semantic_converter.py @@ -63,6 +63,23 @@ def from_python_to_arrow( arrow_data[field] = [value] return pa.Table.from_pydict(arrow_data, schema=arrow_schema) + def from_arrow_to_arrow_compat_dict( + self, arrow_data: pa.Table + ) -> list[dict[str, Any]]: + """Convert Arrow data to a dictionary of Python values""" + return arrow_data.to_pylist() + + def from_python_to_arrow_compat_dict( + self, python_data: Mapping[str, Any] + ) -> dict[str, Any]: + arrow_compat_dict = dict(python_data) + for field, converter in self._converter_lut.items(): + if field in python_data: + arrow_compat_dict[field] = converter.from_python_to_arrow( + python_data[field] + ) + return arrow_compat_dict + def from_arrow_to_python(self, arrow_data: pa.Table) -> list[dict[str, Any]]: """Convert a dictionary of Arrow arrays to Python values""" diff --git a/src/orcapod/types/semantic_types.py b/src/orcapod/types/semantic_types.py index 258617a..c0eaef2 100644 --- a/src/orcapod/types/semantic_types.py +++ b/src/orcapod/types/semantic_types.py @@ -59,6 +59,16 @@ def to_canonical(self, value: pa.Array) -> list[T]: """Convert from Arrow representation to canonical form""" pass + # @abstractmethod + # def from_canonical_to_arrow_compatible(self, value: T) -> Any: + # """Convert from canonical to Arrow-compatible representation""" + # pass + + # @abstractmethod + # def from_arrow_compatible_to_canonical(self, value: Any) -> T: + # """Convert from Arrow-compatible representation to canonical form""" + # pass + @abstractmethod def from_canonical(self, value: T | Collection[T]) -> pa.Array: """Convert from canonical to Arrow representation""" @@ -145,6 +155,12 @@ def from_canonical( value = [value] return pa.array([v.path_str for v in value], type=pa.large_string()) + def from_canonical_to_arrow_compatible(self, value: CanonicalPath) -> str: + return value.path_str + + def from_arrow_compatible_to_canonical(self, value: str) -> CanonicalPath: + return CanonicalPath(path_str=value, is_absolute=Path(value).is_absolute()) + def can_handle(self, arrow_type: pa.DataType) -> bool: return arrow_type == pa.large_string() From a3d362444078126e8e9413f2c9b0886939a3ac6e Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 25 Jul 2025 01:51:55 +0000 Subject: [PATCH 130/224] feat: add ability to change source info --- .../data/datagrams/arrow_tag_packet.py | 27 +++++++++++++++++ src/orcapod/data/datagrams/dict_datagram.py | 6 ++++ src/orcapod/data/datagrams/dict_tag_packet.py | 29 +++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index 503b83e..976a392 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -256,6 +256,31 @@ def source_info(self) -> dict[str, str | None]: } return self._cached_source_info.copy() + def with_source_info(self, **source_info: str | None) -> Self: + """ + Create a copy of the packet with updated source information. + + Args: + source_info: New source information mapping + + Returns: + New ArrowPacket instance with updated source info + """ + new_packet = self.copy(include_cache=False) + + existing_source_info_with_prefix = self._source_info_table.to_pylist()[0] + for key, value in source_info.items(): + if not key.startswith(constants.SOURCE_PREFIX): + # Ensure the key is prefixed correctly + key = f"{constants.SOURCE_PREFIX}{key}" + if key in existing_source_info_with_prefix: + existing_source_info_with_prefix[key] = value + + new_packet._source_info_table = pa.Table.from_pylist( + [existing_source_info_with_prefix] + ) + return new_packet + # 8. Utility Operations def copy(self, include_cache: bool = True) -> Self: """Return a copy of the datagram.""" @@ -263,5 +288,7 @@ def copy(self, include_cache: bool = True) -> Self: if include_cache: new_packet._cached_source_info = self._cached_source_info + else: + new_packet._cached_source_info = None return new_packet diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 9f7088f..6cacb0c 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -798,6 +798,12 @@ def copy(self, include_cache: bool = True) -> Self: new_datagram._cached_content_hash = self._cached_content_hash new_datagram._cached_data_arrow_schema = self._cached_data_arrow_schema new_datagram._cached_meta_arrow_schema = self._cached_meta_arrow_schema + else: + new_datagram._cached_data_table = None + new_datagram._cached_meta_table = None + new_datagram._cached_content_hash = None + new_datagram._cached_data_arrow_schema = None + new_datagram._cached_meta_arrow_schema = None return new_datagram diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index ea9b7fa..a45a22c 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -251,10 +251,39 @@ def source_info(self) -> dict[str, str | None]: """ return {key: self._source_info.get(key, None) for key in self.keys()} + def with_source_info(self, **source_info: str | None) -> Self: + """ + Create a new packet with updated source information. 
+ + Args: + **kwargs: Key-value pairs to update source information + + Returns: + New DictPacket instance with updated source info + """ + current_source_info = self._source_info.copy() + + for key, value in source_info.items(): + if not key.startswith(constants.SOURCE_PREFIX): + key = f"{constants.SOURCE_PREFIX}{key}" + if key in current_source_info: + current_source_info[key] = value + + new_packet = self.copy(include_cache=False) + new_packet._source_info = current_source_info + + return new_packet + def copy(self, include_cache: bool = True) -> Self: """Return a shallow copy of the packet.""" instance = super().copy(include_cache=include_cache) instance._source_info = self._source_info.copy() if include_cache: instance._cached_source_info_table = self._cached_source_info_table + instance._cached_source_info_schema = self._cached_source_info_schema + + else: + instance._cached_source_info_table = None + instance._cached_source_info_schema = None + return instance From 2dffb9206d3f14e761e72f228bd8fe3e2e39a5ee Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 25 Jul 2025 01:52:52 +0000 Subject: [PATCH 131/224] feat: add saving table with its own id column --- src/orcapod/protocols/data_protocols.py | 24 +++ src/orcapod/stores/delta_lake_stores.py | 225 ++++++++++++++++++++++-- 2 files changed, 230 insertions(+), 19 deletions(-) diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index cd21645..c262fb6 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -877,6 +877,30 @@ def source_info(self) -> dict[str, str | None]: """ ... + def with_source_info( + self, + **source_info: str | None, + ) -> Self: + """ + Create new packet with updated source information. + + Adds or updates source metadata for the packet. This is useful for + tracking data provenance and lineage through the computational graph. + + Args: + **source_info: Source metadata as keyword arguments. + + Returns: + New packet instance with updated source information. + + Example: + >>> updated_packet = packet.with_source_info( + ... file_path="/new/path/to/file.txt", + ... source_id="source_123" + ... ) + """ + ... 
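The `with_source_info` contract above is easiest to see with a small standalone sketch. Everything below (the ToyPacket class and the literal "_source_" prefix) is illustrative only and not part of orcapod; it mirrors the copy-then-update, prefix-normalizing behavior documented in the protocol. Note one difference worth keeping in mind: the concrete ArrowPacket/DictPacket implementations added in this patch overwrite only keys that already exist in their source info, whereas the protocol docstring speaks of adding or updating.

from dataclasses import dataclass, field, replace

SOURCE_PREFIX = "_source_"  # assumed prefix; the real value comes from orcapod's constants

@dataclass(frozen=True)
class ToyPacket:
    data: dict[str, str]
    source_info: dict[str, str | None] = field(default_factory=dict)

    def with_source_info(self, **source_info: str | None) -> "ToyPacket":
        updated = dict(self.source_info)
        for key, value in source_info.items():
            if not key.startswith(SOURCE_PREFIX):
                key = f"{SOURCE_PREFIX}{key}"  # normalize keys onto the source prefix
            updated[key] = value
        return replace(self, source_info=updated)  # original packet stays unchanged

p1 = ToyPacket({"path": "/data/file.txt"})
p2 = p1.with_source_info(file_path="/new/path/to/file.txt", source_id="source_123")
assert p1.source_info == {}
assert p2.source_info[f"{SOURCE_PREFIX}source_id"] == "source_123"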
+ class PodFunction(Protocol): """ diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 213ea3e..8490713 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -1,13 +1,20 @@ -import pyarrow as pa -import polars as pl from pathlib import Path -from typing import Any +from typing import Any, TYPE_CHECKING import logging from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError from collections import defaultdict from orcapod.data import constants +from orcapod.utils.lazy_module import LazyModule +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc + import polars as pl +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + pc = LazyModule("pyarrow.compute") # Module-level logger logger = logging.getLogger(__name__) @@ -270,8 +277,8 @@ def _get_existing_delta_table( return None def _ensure_record_id_column( - self, arrow_data: pa.Table, record_id: str - ) -> pa.Table: + self, arrow_data: "pa.Table", record_id: str + ) -> "pa.Table": """Ensure the table has an record id column.""" if self.RECORD_ID_COLUMN not in arrow_data.column_names: # Add record_id column at the beginning @@ -279,7 +286,7 @@ def _ensure_record_id_column( arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) return arrow_data - def _remove_record_id_column(self, arrow_data: pa.Table) -> pa.Table: + def _remove_record_id_column(self, arrow_data: "pa.Table") -> "pa.Table": """Remove the record id column if it exists.""" if self.RECORD_ID_COLUMN in arrow_data.column_names: column_names = arrow_data.column_names @@ -292,8 +299,8 @@ def _remove_record_id_column(self, arrow_data: pa.Table) -> pa.Table: return arrow_data def _handle_record_id_column( - self, arrow_data: pa.Table, record_id_column: str | None = None - ) -> pa.Table: + self, arrow_data: "pa.Table", record_id_column: str | None = None + ) -> "pa.Table": """ Handle record_id column based on add_record_id_column parameter. @@ -347,7 +354,7 @@ def _read_table_with_filter( self, delta_table: DeltaTable, filters: list | None = None, - ) -> pa.Table: + ) -> "pa.Table": """ Read table using to_pyarrow_dataset with original schema preservation. @@ -393,11 +400,11 @@ def add_record( self, record_path: tuple[str, ...], record_id: str, - data: pa.Table, + data: "pa.Table", ignore_duplicates: bool | None = None, overwrite_existing: bool = False, force_flush: bool = False, - ) -> pa.Table: + ) -> "pa.Table": self._validate_record_path(record_path) source_key = self._get_source_key(record_path) @@ -470,23 +477,203 @@ def add_record( def add_records( self, record_path: tuple[str, ...], - records: pa.Table, + records: "pa.Table", record_id_column: str | None = None, ignore_duplicates: bool | None = None, overwrite_existing: bool = False, + force_flush: bool = False, ) -> list[str]: - raise NotImplementedError( - "add_records is not implemented in BasicDeltaTableArrowStore yet. " - "Use add_record for single record insertion." + """ + Add multiple records to the Delta table, using one column as record_id. 
+ + Args: + record_path: Path tuple identifying the table location + records: PyArrow table containing the records to add + record_id_column: Column name to use as record_id (defaults to first column) + ignore_duplicates: Whether to ignore duplicate entries + overwrite_existing: Whether to overwrite existing records with same ID + force_flush: Whether to write immediately instead of batching + + Returns: + List of record IDs that were added + """ + self._validate_record_path(record_path) + source_key = self._get_source_key(record_path) + + # Determine record_id column + if record_id_column is None: + record_id_column = records.column_names[0] + + # Validate that the record_id column exists + if record_id_column not in records.column_names: + raise ValueError( + f"Record ID column '{record_id_column}' not found in table. " + f"Available columns: {records.column_names}" + ) + + # Rename the record_id column to the standard name + column_mapping = {record_id_column: self.RECORD_ID_COLUMN} + records_renamed = records.rename_columns( + [column_mapping.get(col, col) for col in records.column_names] ) + # Get unique record IDs from the data + record_ids_array = records_renamed[self.RECORD_ID_COLUMN] + unique_record_ids = pc.unique(record_ids_array).to_pylist() + + # Set default behavior for duplicates + if ignore_duplicates is None: + ignore_duplicates = self.duplicate_entry_behavior != "error" + + added_record_ids = [] + + # Check for duplicates if needed + if not ignore_duplicates: + # Check pending batches + pending_duplicates = [] + for record_id in unique_record_ids: + if record_id in self._pending_batches[source_key]: + pending_duplicates.append(record_id) + + if pending_duplicates: + raise ValueError( + f"Records {pending_duplicates} already exist in pending batch for {source_key}. " + f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." + ) + + # Check existing table + existing_duplicates = [] + try: + for record_id in unique_record_ids: + existing_record = self.get_record_by_id( + record_path, str(record_id), flush=False + ) + if existing_record is not None: + existing_duplicates.append(record_id) + except Exception as e: + logger.debug(f"Error checking existing records: {e}") + + if existing_duplicates: + raise ValueError( + f"Records {existing_duplicates} already exist in {'/'.join(record_path)}. " + f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." 
+ ) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table + write_deltalake(str(table_path), records_renamed, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + added_record_ids = unique_record_ids + else: + # Handle existing table + if self.duplicate_entry_behavior == "overwrite" or overwrite_existing: + # Delete existing records with matching IDs + try: + # Create SQL condition for multiple record IDs + escaped_ids = [ + str(rid).replace("'", "''") for rid in unique_record_ids + ] + id_list = "', '".join(escaped_ids) + delete_condition = f"{self.RECORD_ID_COLUMN} IN ('{id_list}')" + + delta_table.delete(delete_condition) + logger.debug( + f"Deleted existing records {unique_record_ids} from {source_key}" + ) + except Exception as e: + logger.debug(f"No existing records to delete: {e}") + + # Filter out duplicates if not overwriting + if not ( + self.duplicate_entry_behavior == "overwrite" or overwrite_existing + ): + # Get existing record IDs + try: + existing_table = delta_table.to_pyarrow_table() + if len(existing_table) > 0: + existing_ids = pc.unique( + existing_table[self.RECORD_ID_COLUMN] + ) + + # Filter out records that already exist + mask = pc.invert( + pc.is_in( + records_renamed[self.RECORD_ID_COLUMN], existing_ids + ) + ) + records_renamed = pc.filter(records_renamed, mask) + + # Update the list of record IDs that will actually be added + if len(records_renamed) > 0: + added_record_ids = pc.unique( + records_renamed[self.RECORD_ID_COLUMN] + ).to_pylist() + else: + added_record_ids = [] + else: + added_record_ids = unique_record_ids + except Exception as e: + logger.debug(f"Error filtering duplicates: {e}") + added_record_ids = unique_record_ids + else: + added_record_ids = unique_record_ids + + # Append the (possibly filtered) records + if len(records_renamed) > 0: + write_deltalake( + table_path, + records_renamed, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + else: + # Add to batches for later flushing + # Group records by record_id for individual batch entries + for record_id in unique_record_ids: + # Filter records for this specific record_id + mask = pc.equal(records_renamed[self.RECORD_ID_COLUMN], record_id) + single_record = pc.filter(records_renamed, mask) + + # Add to pending batch (will overwrite if duplicate_entry_behavior allows) + if ( + self.duplicate_entry_behavior == "overwrite" + or overwrite_existing + or record_id not in self._pending_batches[source_key] + ): + self._pending_batches[source_key][str(record_id)] = single_record + added_record_ids.append(record_id) + elif ignore_duplicates: + logger.debug(f"Ignoring duplicate record {record_id}") + else: + # This should have been caught earlier, but just in case + logger.warning(f"Skipping duplicate record {record_id}") + + # Check if we need to flush + batch_size = len(self._pending_batches[source_key]) + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added {len(added_record_ids)} records to {source_key}") + return [str(rid) for rid in added_record_ids] + def get_record_by_id( self, record_path: tuple[str, ...], record_id: str, record_id_column: str | None = None, flush: bool = False, - ) -> pa.Table | None: + ) -> "pa.Table | None": """ Get a specific record by 
record_id with schema preservation. @@ -537,7 +724,7 @@ def get_all_records( record_id_column: str | None = None, retrieve_pending: bool = True, flush: bool = False, - ) -> pa.Table | None: + ) -> "pa.Table | None": """ Retrieve all records for a given source path as a single table with schema preservation. @@ -588,10 +775,10 @@ def get_all_records( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: list[str] | pl.Series | pa.Array, + record_ids: "list[str] | pl.Series | pa.Array", record_id_column: str | None = None, flush: bool = False, - ) -> pa.Table | None: + ) -> "pa.Table | None": """ Retrieve records by entry IDs as a single table with schema preservation. From 78e1b085de7eb5fc94ef42de12687c98f5063adc Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 25 Jul 2025 01:54:28 +0000 Subject: [PATCH 132/224] feat: add refined kernel id logic --- src/orcapod/data/pods.py | 20 +++- src/orcapod/data/trackers.py | 4 + src/orcapod/pipeline/nodes.py | 206 +++++++++++++++++++++++++++++----- 3 files changed, 195 insertions(+), 35 deletions(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index a66cfc6..06aea7b 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -238,12 +238,16 @@ def __init__( semantic_type_registry=self.data_context.semantic_type_registry ) ) - self._function_info_extractor = function_info_extractor + # now compute hash for the self and store that info + self._pod_hash = self.data_context.object_hasher.hash_to_hex( + self, prefix_hasher_id=True + ) + @property def kernel_id(self) -> tuple[str, ...]: - return (self.function_name,) + return (self.function_name, self._pod_hash) def input_packet_types(self) -> PythonSchema: """ @@ -300,7 +304,10 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) output_data = {k: v for k, v in zip(self.output_keys, output_values)} - source_info = {k: ":".join(self.kernel_id + (k,)) for k in output_data} + source_info = { + k: ":".join(self.kernel_id + (packet.content_hash(), k)) + for k in output_data + } output_packet = DictPacket( {k: v for k, v in zip(self.output_keys, output_values)}, @@ -396,7 +403,8 @@ def validate_inputs(self, *streams: dp.Stream) -> None: self.pod.validate_inputs(*streams) def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: - return self.pod.call(tag, packet) + output_tag, output_packet = self.pod.call(tag, packet) + return output_tag, output_packet def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -442,7 +450,7 @@ def record_path(self) -> tuple[str, ...]: Return the path to the record in the result store. This is used to store the results of the pod. 
""" - return self.record_path_prefix + self.kernel_id + (self.pod_hash,) + return self.record_path_prefix + self.kernel_id def call( self, @@ -456,7 +464,7 @@ def call( if not skip_record_check: output_packet = self.get_recorded_output_packet(packet) if output_packet is None: - tag, output_packet = self.pod.call(tag, packet) + tag, output_packet = super().call(tag, packet) if output_packet is not None and not skip_recording: self.record_packet( packet, output_packet, overwrite_existing=overwrite_existing diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 799334e..70e27d9 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -129,6 +129,10 @@ def __init__(self, stream: dp.Stream, label: str | None = None) -> None: self.label = label or stream.label self.stream = stream + @property + def kernel_id(self) -> tuple[str, ...]: + return (self.stream.__class__.__name__,) + def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: """ Forward the stream through the stub kernel. diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 7372175..0a4cfd7 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,3 +1,4 @@ +from abc import abstractmethod from ast import Not from collections.abc import Collection, Iterator from datetime import datetime @@ -13,8 +14,12 @@ if TYPE_CHECKING: import pyarrow as pa + import polars as pl + import pandas as pd else: pa = LazyModule("pyarrow") + pl = LazyModule("polars") + pd = LazyModule("pandas") class Node( @@ -42,6 +47,31 @@ def __init__( self.identity_structure(()), prefix_hasher_id=True ) + @property + def contained_kernel(self) -> dp.Kernel: + raise NotImplementedError( + "This property should be implemented by subclasses to return the contained kernel." + ) + + @property + def tag_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + tag_keys, _ = self.keys() + return tag_keys + + @property + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + # TODO: consider caching this + _, packet_keys = self.keys() + return packet_keys + @property def pipeline_path(self) -> tuple[str, ...]: """ @@ -111,6 +141,62 @@ def as_table( def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: return self().flow() + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + """ + Return the identity structure of the node. + This is used to compute the invocation hash. + """ + # construct identity structure from the node's information and the + # contained kernel + if streams is not None and len(streams) > 0: + raise NotImplementedError( + "At this moment, Node does not yet support handling additional input streams." + ) + return self.contained_kernel.identity_structure(self.input_streams) + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all records associated with the node. + If include_system_columns is True, system columns will be included in the result. 
+ """ + raise NotImplementedError("This method should be implemented by subclasses.") + + @property + def lazy(self) -> "pl.LazyFrame | None": + records = self.get_all_records(include_system_columns=False) + if records is not None: + return pl.LazyFrame(records) + return None + + @property + def df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + lazy_df = self.lazy + if lazy_df is not None: + return lazy_df.collect() + return None + + @property + def polars_df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + return self.df + + @property + def pandas_df(self) -> "pd.DataFrame | None": + """ + Return the pandas DataFrame representation of the pod's records. + """ + records = self.get_all_records(include_system_columns=False) + if records is not None: + return records.to_pandas() + return None + class KernelNode(Node, WrappedKernel): """ @@ -134,6 +220,10 @@ def __init__( **kwargs, ) + @property + def contained_kernel(self) -> dp.Kernel: + return self.kernel + def __repr__(self): return f"KernelNode(kernel={self.kernel!r})" @@ -153,6 +243,74 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An ) return self.kernel.identity_structure(self.input_streams) + def forward(self, *streams: dp.Stream) -> dp.Stream: + output_stream = super().forward(*streams) + + self.record_pipeline_output(output_stream) + return output_stream + + def record_pipeline_output(self, output_stream: dp.Stream) -> None: + key_column_name = "_record_hash" + output_table = output_stream.as_table( + include_data_context=True, + include_source=True, + include_content_hash=key_column_name, + ) + self.pipeline_store.add_records( + self.pipeline_path, + output_table, + record_id_column=key_column_name, + ignore_duplicates=True, + ) + + +def add_pipeline_record( + self, tag: dp.Tag, input_packet: dp.Packet, retrieved: bool | None = None +) -> None: + # combine dp.Tag with packet content hash to compute entry hash + tag_with_hash = tag.as_table().append_column( + self.PACKET_HASH_COLUMN, + pa.array([input_packet.content_hash()], type=pa.large_string()), + ) + entry_id = self.data_context.arrow_hasher.hash_table( + tag_with_hash, prefix_hasher_id=True + ) + + existing_record = self.pipeline_store.get_record_by_id( + self.pipeline_path, + entry_id, + ) + + if existing_record is not None: + # if the record already exists, return it + return + + # no record matching, so construct the full record + + input_packet_info = ( + input_packet.as_table( + include_source=True, + ) + .append_column( + f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", + pa.array([input_packet.data_context_key], type=pa.large_string()), + ) + .append_column( + self.DATA_RETRIEVED_FLAG, + pa.array([retrieved], type=pa.bool_()), + ) + .drop(input_packet.keys()) + ) + + combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) + + self.pipeline_store.add_record( + self.pipeline_path, + entry_id, + combined_record, + ignore_duplicates=False, + ) + class PodNode(Node, CachedPod): def __init__( @@ -175,7 +333,10 @@ def __init__( **kwargs, ) self.pipeline_store = pipeline_store - # self.input_streams = tuple(input_streams) + + @property + def contained_kernel(self) -> dp.Kernel: + return self.pod def __repr__(self): return f"PodNode(pod={self.pod!r})" @@ -253,7 +414,9 @@ def add_pipeline_record( ignore_duplicates=False, ) - def _get_all_records(self) -> "pa.Table | None": + def get_all_records( + 
self, include_system_columns: bool = False + ) -> "pa.Table | None": results = self.result_store.get_all_records( self.record_path, record_id_column=self.PACKET_HASH_COLUMN ) @@ -263,42 +426,27 @@ def _get_all_records(self) -> "pa.Table | None": "Pipeline store is not configured, cannot retrieve tag info" ) taginfo = self.pipeline_store.get_all_records( - self.record_path, + self.pipeline_path, ) if results is None or taginfo is None: return None - tag_columns = [ - c - for c in taginfo.column_names - if not c.startswith(constants.META_PREFIX) - and not c.startswith(constants.SOURCE_PREFIX) - ] - - packet_columns = [ - c for c in results.column_names if c != self.PACKET_HASH_COLUMN - ] - # TODO: do not hardcode the join keys joined_info = taginfo.join( results, self.PACKET_HASH_COLUMN, join_type="inner", ) - - joined_info = joined_info.select([*tag_columns, *packet_columns]) + tag_keys, packet_keys = self.keys() + + if not include_system_columns: + system_columns = [ + c + for c in joined_info.column_names + if c.startswith(constants.META_PREFIX) + or c.startswith(constants.CONTEXT_KEY) + or c.startswith(constants.SOURCE_PREFIX) + ] + joined_info = joined_info.drop(system_columns) return joined_info - - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - """ - Return the identity structure of the node. - This is used to compute the invocation hash. - """ - # construct identity structure from the node's information and the - # contained kernel - if streams is not None and len(streams) > 0: - raise NotImplementedError( - "At this moment, Node does not yet support handling additional input streams." - ) - return self.pod.identity_structure(self.input_streams) From 9c9b11f758020811376e84c5bccdfba0dac5990c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 26 Jul 2025 19:16:02 +0000 Subject: [PATCH 133/224] feat: add get_all_records to KernelNode --- src/orcapod/pipeline/nodes.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 0a4cfd7..e6bdc6d 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -204,6 +204,8 @@ class KernelNode(Node, WrappedKernel): This node can be used to execute the kernel and process data streams. 
""" + HASH_COLUMN_NAME = "_record_hash" + def __init__( self, kernel: dp.Kernel, @@ -250,7 +252,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: return output_stream def record_pipeline_output(self, output_stream: dp.Stream) -> None: - key_column_name = "_record_hash" + key_column_name = self.HASH_COLUMN_NAME output_table = output_stream.as_table( include_data_context=True, include_source=True, @@ -263,6 +265,26 @@ def record_pipeline_output(self, output_stream: dp.Stream) -> None: ignore_duplicates=True, ) + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + results = self.pipeline_store.get_all_records(self.pipeline_path) + + if results is None: + return None + + if not include_system_columns: + system_columns = [ + c + for c in results.column_names + if c.startswith(constants.META_PREFIX) + or c.startswith(constants.CONTEXT_KEY) + or c.startswith(constants.SOURCE_PREFIX) + ] + results = results.drop(system_columns) + + return results + def add_pipeline_record( self, tag: dp.Tag, input_packet: dp.Packet, retrieved: bool | None = None @@ -438,7 +460,6 @@ def get_all_records( self.PACKET_HASH_COLUMN, join_type="inner", ) - tag_keys, packet_keys = self.keys() if not include_system_columns: system_columns = [ From 5a9ff9a687e3c49e8437b51d212ef18a19605606 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 05:16:09 +0000 Subject: [PATCH 134/224] feat: add new delta data store capable of handling mutiple entries as once --- src/orcapod/stores/delta_lake_stores.py | 797 +++++++++++++++++++++++- uv.lock | 14 + 2 files changed, 810 insertions(+), 1 deletion(-) diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 8490713..0a316e4 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -1,9 +1,13 @@ +from multiprocessing import Value from pathlib import Path -from typing import Any, TYPE_CHECKING +from typing import Any, Literal, TYPE_CHECKING, cast import logging from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError from collections import defaultdict +from collections.abc import Collection + +from pyarrow import Table from orcapod.data import constants from orcapod.utils.lazy_module import LazyModule @@ -20,6 +24,797 @@ logger = logging.getLogger(__name__) +class BatchedDeltaTableArrowStore: + """ + A batched Delta table store with clear insert vs update semantics. + + - insert(): Never overwrites existing records by default. Can skip duplicates if requested. + Can be batched for performance. Supports composite keys. + + - update(): Always overwrites existing records. Executes immediately. + Requires pending batches to be flushed first (or use force_flush=True). + + Supports both single column and composite (multi-column) record IDs. 
+ """ + + # Class constants for internal column names + ROW_INDEX_COLUMN = "__row_index" + RECORD_ID_COLUMN = "__record_id" + + def __init__( + self, + base_path: str | Path, + create_base_path: bool = True, + batch_size: int = 1000, + max_hierarchy_depth: int = 10, + ): + self.base_path = Path(base_path) + self.batch_size = batch_size + self.max_hierarchy_depth = max_hierarchy_depth + + if create_base_path: + self.base_path.mkdir(parents=True, exist_ok=True) + elif not self.base_path.exists(): + raise ValueError( + f"Base path {self.base_path} does not exist and create_base_path=False" + ) + + # Cache for Delta tables to avoid repeated initialization + self._delta_table_cache: dict[str, DeltaTable] = {} + + # Batch management + self._pending_batches: dict[str, pa.Table] = {} + self._pending_record_ids: dict[str, set[str]] = defaultdict(set) + self._existing_ids_cache: dict[str, set[str]] = defaultdict(set) + # TODO: reconsider this approach as this is NOT serializable + self._cache_dirty: dict[str, bool] = defaultdict(lambda: True) + + def _clear_pending(self): + """Clear all pending state.""" + self._pending_batches = {} + self._pending_record_ids = defaultdict(set) + # Note: next_row_index continues incrementing + + def _get_record_key(self, record_path: tuple[str, ...]) -> str: + """Generate cache key for source storage.""" + return "/".join(record_path) + + def _get_table_path(self, record_path: tuple[str, ...]) -> Path: + """Get the filesystem path for a given source path.""" + path = self.base_path + for subpath in record_path: + path = path / subpath + return path + + def _validate_record_path(self, record_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly + """ + Validate source path components. + + Args: + record_path: Tuple of path components + + Raises: + ValueError: If path is invalid + """ + if not record_path: + raise ValueError("Source path cannot be empty") + + if len(record_path) > self.max_hierarchy_depth: + raise ValueError( + f"Source path depth {len(record_path)} exceeds maximum {self.max_hierarchy_depth}" + ) + + # Validate path components + for i, component in enumerate(record_path): + if not component or not isinstance(component, str): + raise ValueError( + f"Source path component {i} is invalid: {repr(component)}" + ) + + # Check for filesystem-unsafe characters + unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] + if any(char in component for char in unsafe_chars): + raise ValueError( + f"Source path {record_path} component {component} contains invalid characters: {repr(component)}" + ) + + def _get_delta_table(self, record_path: tuple[str, ...]) -> DeltaTable | None: + """ + Get an existing Delta table, either from cache or by loading it. 
+ + Args: + record_path: Tuple of path components + + Returns: + DeltaTable instance or None if table doesn't exist + """ + record_key = self._get_record_key(record_path) + table_path = self._get_table_path(record_path) + + # Check cache first + if dt := self._delta_table_cache.get(record_key): + return dt + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + self._delta_table_cache[record_key] = delta_table + logger.debug(f"Loaded existing Delta table for {record_key}") + return delta_table + except TableNotFoundError: + # Table doesn't exist + return None + except Exception as e: + logger.error(f"Error loading Delta table for {record_key}: {e}") + # Try to clear any corrupted cache + if record_key in self._delta_table_cache: + self._delta_table_cache.pop(record_key) + raise + + def _ensure_record_id_column( + self, arrow_data: "pa.Table", record_id: str + ) -> "pa.Table": + """Ensure the table has an record id column.""" + if self.RECORD_ID_COLUMN not in arrow_data.column_names: + # Add record_id column at the beginning + key_array = pa.array([record_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) + return arrow_data + + def _remove_record_id_column(self, arrow_data: "pa.Table") -> "pa.Table": + """Remove the record id column if it exists.""" + if self.RECORD_ID_COLUMN in arrow_data.column_names: + arrow_data = arrow_data.drop([self.RECORD_ID_COLUMN]) + return arrow_data + + def _handle_record_id_column( + self, arrow_data: "pa.Table", record_id_column: str | None = None + ) -> "pa.Table": + """ + Handle record_id column based on add_record_id_column parameter. + + Args: + arrow_data: Arrow table with record id column + record_id_column: Control entry ID column inclusion: + + """ + if not record_id_column: + # Remove the record id column + return self._remove_record_id_column(arrow_data) + + # Rename record id column + if self.RECORD_ID_COLUMN in arrow_data.column_names: + schema = arrow_data.schema + new_names = [ + record_id_column if name == self.RECORD_ID_COLUMN else name + for name in schema.names + ] + return arrow_data.rename_columns(new_names) + else: + raise ValueError( + f"Record ID column '{self.RECORD_ID_COLUMN}' not found in the table and cannot be renamed." + ) + + def _create_record_id_filter(self, record_id: str) -> list: + """ + Create a proper filter expression for Delta Lake. + + Args: + record_id: The entry ID to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [(self.RECORD_ID_COLUMN, "=", record_id)] + + def _create_record_ids_filter(self, record_ids: list[str]) -> list: + """ + Create a proper filter expression for multiple entry IDs. 
+ + Args: + record_ids: List of entry IDs to filter by + + Returns: + List containing the filter expression for Delta Lake + """ + return [(self.RECORD_ID_COLUMN, "in", record_ids)] + + def _refresh_existing_ids_cache(self, record_path: tuple[str, ...]) -> None: + """Refresh the cache of existing IDs in the Delta table.""" + record_key = self._get_record_key(record_path) + + delta_table = self._get_delta_table(record_path) + + if delta_table is None: + self._existing_ids_cache[record_key] = set() + self._cache_dirty[record_key] = False + return + + try: + # Get all existing IDs from Delta table using standard RECORD_ID_COLUMN + # TODO: replace this with more targetted loading of only the target column and in batches + arrow_table = delta_table.to_pyarrow_table() + if arrow_table.num_rows == 0: + self._existing_ids_cache[record_key] = set() + elif self.RECORD_ID_COLUMN not in arrow_table.column_names: + # TODO: replace this with proper checking of the table schema first! + logger.warning(f"Delta table missing {self.RECORD_ID_COLUMN} column") + self._existing_ids_cache[record_key] = set() + else: + existing_ids = cast( + set[str], set(arrow_table[self.RECORD_ID_COLUMN].to_pylist()) + ) + self._existing_ids_cache[record_key] = existing_ids + + self._cache_dirty[record_key] = False + logger.debug( + f"Refreshed existing IDs cache: {len(self._existing_ids_cache)} IDs" + ) + + except Exception as e: + logger.error(f"Failed to refresh existing IDs cache: {e}") + self._existing_ids_cache[record_key] = set() + self._cache_dirty[record_key] = False + raise + + def _get_existing_ids(self, record_path: tuple[str, ...]) -> set[str]: + """Get the set of existing IDs in the Delta table, using cache when possible.""" + record_key = self._get_record_key(record_path) + if ( + self._cache_dirty.get(record_key) + or self._delta_table_cache.get(record_key) is None + ): + self._refresh_existing_ids_cache(record_path) + return self._existing_ids_cache.get(record_key) or set() + + def _invalidate_cache(self, record_path: tuple[str, ...]) -> None: + """Mark the existing IDs cache as dirty.""" + self._cache_dirty[self._get_record_key(record_path)] = True + + def add_record( + self, + record_path: tuple[str, ...], + record_id: str, + data: "pa.Table", + schema_handling: Literal["merge", "error", "coerce"] = "error", + skip_duplicates: bool = False, + flush: bool = False, + ) -> "pa.Table | None": + data_with_record_id = self._ensure_record_id_column(data, record_id) + return self.add_records( + record_path=record_path, + arrow_table=data_with_record_id, + id_column=self.RECORD_ID_COLUMN, + schema_handling=schema_handling, + skip_duplicates=skip_duplicates, + flush=flush, + ) + + def add_records( + self, + record_path: tuple[str, ...], + arrow_table: pa.Table, + id_column: str, + schema_handling: Literal["merge", "error", "coerce"] = "error", + skip_duplicates: bool = False, + flush: bool = False, + ) -> "pa.Table | None": + """ + Insert new records. By default, never overwrites existing records. + + Args: + arrow_table: Arrow table to insert + id_columns: Single column name or list of column names that form the record ID. + For composite keys, values are concatenated with '|' separator. + schema_handling: How to handle schema differences + skip_duplicates: If True, skip records with IDs that already exist. + If False, raise error on duplicates. 
+ flush: Whether to flush immediately after the insert + + Raises: + ValueError: If any record IDs already exist and skip_duplicates=False + """ + if arrow_table.num_rows == 0: + return + + # Step 1: Validate that id column exist + if id_column not in arrow_table.column_names: + raise ValueError( + f"Specified ID column {id_column} not found in input table {arrow_table.column_names}" + ) + + # rename ID column to a standard name + if id_column != self.RECORD_ID_COLUMN: + rename_map = {id_column: self.RECORD_ID_COLUMN} + total_name_map = {k: rename_map.get(k, k) for k in arrow_table.column_names} + arrow_table = arrow_table.rename_columns(total_name_map) + + # Step 2: Deduplicate within input table (keep last occurrence) + deduplicated_table = self._deduplicate_within_table(arrow_table) + + # Step 3: Handle conflicts based on skip_duplicates setting + if skip_duplicates: + filtered_table = self._filter_existing_records( + record_path, deduplicated_table + ) + if filtered_table.num_rows == 0: + logger.debug("All records were duplicates, nothing to insert") + return None + else: + # Check for conflicts - insert never allows duplicates when skip_duplicates=False + self._check_all_conflicts(record_path, deduplicated_table) + filtered_table = deduplicated_table + + # Step 4: Handle schema compatibility + schema_compatible_table = self._handle_schema_compatibility( + record_path, filtered_table, schema_handling + ) + + # Step 5: Add to pending batch (no overwrite logic needed) + self._add_to_pending_batch(record_path, schema_compatible_table) + + # Step 6: Auto-flush if needed + if flush or self._should_auto_flush(record_path): + self.flush() + + def _deduplicate_within_table(self, table: pa.Table) -> pa.Table: + # TODO: consider erroring out if duplicates are found + """Remove duplicates within the input table, keeping the last occurrence.""" + if table.num_rows <= 1: + return table + + # Create row indices + indices = pa.array(range(table.num_rows)) + + # Add row index column temporarily + table_with_indices = table.add_column(0, self.ROW_INDEX_COLUMN, indices) + + # Group by RECORD_ID_COLUMN and get the maximum row index for each group + # This gives us the last occurrence of each ID + grouped = table_with_indices.group_by([self.RECORD_ID_COLUMN]).aggregate( + [(self.ROW_INDEX_COLUMN, "max")] + ) + + # Get the row indices to keep - the aggregated column name has "_max" suffix + max_indices_column = f"{self.ROW_INDEX_COLUMN}_max" + indices_to_keep = grouped[max_indices_column].to_pylist() + + # Filter original table to keep only these rows + mask = pc.is_in(indices, pa.array(indices_to_keep)) + return table.filter(mask) + + def _filter_existing_records( + self, record_path: tuple[str, ...], table: pa.Table + ) -> pa.Table: + """Filter out records that already exist (for skip_duplicates=True).""" + input_ids = set(table[self.RECORD_ID_COLUMN].to_pylist()) + record_key = self._get_record_key(record_path) + + # Get IDs that already exist in pending batch or Delta table + existing_in_pending = input_ids.intersection( + self._pending_record_ids[record_key] + ) + existing_in_delta = input_ids.intersection(self._get_existing_ids(record_path)) + all_existing = existing_in_pending.union(existing_in_delta) + + if not all_existing: + return table # No duplicates found + + # Filter out existing records + mask = pc.invert( + pc.is_in(table[self.RECORD_ID_COLUMN], pa.array(list(all_existing))) + ) + filtered = table.filter(mask) + + logger.debug(f"Skipped {len(all_existing)} duplicate records") + return 
filtered + + def _check_all_conflicts( + self, record_path: tuple[str, ...], table: pa.Table + ) -> None: + """Check for conflicts with both pending batch and Delta table.""" + input_ids = set(table[self.RECORD_ID_COLUMN].to_pylist()) + record_key = self._get_record_key(record_path) + # Check conflicts with pending batch + pending_conflicts = input_ids.intersection(self._pending_record_ids[record_key]) + if pending_conflicts: + raise ValueError( + f"Cannot insert records with IDs that already exist in pending batch: {pending_conflicts}. " + f"Use skip_duplicates=True to skip existing records or update() method to overwrite." + ) + + # Check conflicts with Delta table + existing_ids = self._get_existing_ids(record_path) + delta_conflicts = input_ids.intersection(existing_ids) + if delta_conflicts: + raise ValueError( + f"Cannot insert records with IDs that already exist in Delta table: {delta_conflicts}. " + f"Use skip_duplicates=True to skip existing records or update() method to overwrite." + ) + + def _handle_schema_compatibility( + self, record_path: tuple[str, ...], table: pa.Table, schema_handling: str + ) -> pa.Table: + """Handle schema differences between input and pending batch.""" + record_key = self._get_record_key(record_path) + pending_batch = self._pending_batches.get(record_key) + if pending_batch is None: + return table + + if pending_batch.schema.equals(table.schema): + # TODO: perform more careful check + return table + + if schema_handling == "error": + raise ValueError( + f"Schema mismatch between input {table.schema} and pending batch {pending_batch.schema}" + ) + elif schema_handling == "merge": + try: + # Unify schemas and cast both input table and pending batch + unified_schema = pa.unify_schemas([pending_batch.schema, table.schema]) + + # Cast the pending batch to unified schema (excluding tracking columns) + self._pending_batches[record_key] = pending_batch.cast(unified_schema) + + # Cast and return the input table + return table.cast(unified_schema) + except Exception as e: + # TODO: perform more careful error check + raise ValueError(f"Cannot merge schemas: {e}") + elif schema_handling == "coerce": + try: + # Coerce input table to match existing pending batch schema + return table.cast(pending_batch.schema) + except Exception as e: + raise ValueError(f"Cannot coerce schema: {e}") + else: + raise ValueError(f"Unknown schema handling: {schema_handling}") + + def _handle_delta_schema_compatibility( + self, record_path: tuple[str, ...], table: pa.Table, schema_handling: str + ) -> pa.Table: + """Handle schema differences between input and Delta table for updates.""" + record_key = self._get_record_key(record_path) + if self._delta_table_cache.get(record_key) is None: + return table + + delta_table = self._delta_table_cache[record_key] + + try: + # Get Delta table schema and convert from arro3 to pyarrow + arro3_schema = delta_table.schema().to_arrow() + delta_schema = pa.schema(arro3_schema) # type: ignore + except Exception as e: + logger.warning(f"Could not get Delta table schema: {e}") + return table + + if delta_schema.equals(table.schema): + return table + + if schema_handling == "error": + raise ValueError("Schema mismatch between input and Delta table") + elif schema_handling == "merge": + try: + # Unify schemas - this might require adding null columns + unified_schema = pa.unify_schemas([delta_schema, table.schema]) + return table.cast(unified_schema) + except Exception as e: + raise ValueError(f"Cannot merge schemas: {e}") + elif schema_handling == "coerce": 
+ try: + # Coerce input table to match Delta table schema + return table.cast(delta_schema) + except Exception as e: + raise ValueError(f"Cannot coerce schema: {e}") + else: + raise ValueError(f"Unknown schema handling: {schema_handling}") + + def _add_to_pending_batch(self, record_path: tuple[str, ...], table: pa.Table): + """Add table to pending batch.""" + # Add row index column for internal tracking + record_key = self._get_record_key(record_path) + pending_batch = self._pending_batches.get(record_key) + if pending_batch is None: + self._pending_batches[record_key] = table + else: + self._pending_batches[record_key] = pa.concat_tables([pending_batch, table]) + + pending_ids = cast(list[str], table[self.RECORD_ID_COLUMN].to_pylist()) + self._pending_record_ids[record_key].update(pending_ids) + + def _should_auto_flush(self, record_path: tuple[str, ...]) -> bool: + """Check if auto-flush should be triggered.""" + record_key = self._get_record_key(record_path) + return ( + self._pending_batches.get(record_key) is not None + and self._pending_batches[record_key].num_rows >= self.batch_size + ) + + def get_all_records( + self, + record_path: tuple[str, ...], + record_id_column: str | None = None, + retrieve_pending: bool = True, + ) -> pa.Table | None: + """ + Get all records from both pending batch and Delta table. + + Returns: + Combined Arrow table with all records, or None if no records exist + """ + record_key = self._get_record_key(record_path) + + tables_to_combine = [] + + # Add Delta table data + if (delta_table := self._get_delta_table(record_path)) is not None: + try: + delta_table_data = delta_table.to_pyarrow_table() + if delta_table_data.num_rows > 0: + tables_to_combine.append(delta_table_data) + except Exception as e: + logger.warning(f"Error reading Delta table: {e}") + + # Add pending batch data + if ( + retrieve_pending + and (pending_batch := self._pending_batches.get(record_key)) is not None + ): + if pending_batch.num_rows > 0: + tables_to_combine.append(pending_batch) + + if not tables_to_combine: + return None + + if len(tables_to_combine) == 1: + table_to_return = tables_to_combine[0] + else: + table_to_return = pa.concat_tables(tables_to_combine) + + # Handle record_id_column if specified + return self._handle_record_id_column(table_to_return, record_id_column) + + def get_record_by_id( + self, + record_path: tuple[str, ...], + record_id: str, + record_id_column: str | None = None, + flush: bool = False, + ) -> "pa.Table | None": + """ + Get a specific record by record_id with schema preservation. 
+ + Args: + record_path: Tuple of path components + record_id: Unique identifier for the record + + Returns: + Arrow table for the record or None if not found + """ + + if flush: + self.flush_batch(record_path) + + # check if record_id is found in pending batches + record_key = self._get_record_key(record_path) + if record_id in self._pending_record_ids[record_key]: + # Return the pending record after removing the entry id column + pending_batch = self._pending_batches[record_key] + assert pending_batch is not None, "Pending batch should not be None" + filtered_table = pending_batch.filter( + pc.field(self.RECORD_ID_COLUMN) == record_id + ) + if filtered_table.num_rows != 1: + raise ValueError( + f"Expected exactly one record in pending batch with record ID {record_id}, but found {filtered_table.num_rows}" + ) + return self._handle_record_id_column(filtered_table, record_id_column) + + # Now check the Delta table + delta_table = self._get_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read + filter_expr = self._create_record_id_filter(record_id) + result = self._read_delta_table(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Handle (remove/rename) the record id column before returning + return self._handle_record_id_column(result, record_id_column) + + except Exception as e: + logger.error( + f"Error getting record {record_id} from {'/'.join(record_path)}: {e}" + ) + raise e + + def get_records_by_ids( + self, + record_path: tuple[str, ...], + record_ids: "list[str] | pl.Series | pa.Array", + record_id_column: str | None = None, + flush: bool = False, + ) -> "pa.Table | None": + """ + Retrieve records by entry IDs as a single table with schema preservation. + + Args: + record_path: Tuple of path components + record_ids: Entry IDs to retrieve + add_record_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Arrow table containing all found records with original schema, or None if no records found + """ + record_key = self._get_record_key(record_path) + if flush: + self.flush_batch(record_path) + + # Convert input to list of strings for consistency + if isinstance(record_ids, list): + if not record_ids: + return None + record_ids_list = record_ids + elif isinstance(record_ids, pl.Series): + if len(record_ids) == 0: + return None + record_ids_list = cast(list[str], record_ids.to_list()) + elif isinstance(record_ids, (pa.Array, pa.ChunkedArray)): + if len(record_ids) == 0: + return None + record_ids_list = cast(list[str], record_ids.to_pylist()) + else: + raise TypeError( + f"record_ids must be list[str], pl.Series, or pa.Array, got {type(record_ids)}" + ) + + # check inside the batch + delta_table = self._get_delta_table(record_path) + if delta_table is None: + return None + try: + # Use schema-preserving read with filters + filter_expr = self._create_record_ids_filter(record_ids_list) + result = self._read_delta_table(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Handle record_id column based on parameter + return self._handle_record_id_column(result, record_id_column) + + except Exception as e: + logger.error( + f"Error getting records by IDs from {'/'.join(record_path)}: {e}" + ) + return None + + def _read_delta_table( + self, + delta_table: DeltaTable, + filters: list | None = None, + ) -> "pa.Table": + """ + Read table using to_pyarrow_dataset with original schema preservation. 
+ + Args: + delta_table: The Delta table to read from + filters: Optional filters to apply + + Returns: + Arrow table with preserved schema + """ + # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading + dataset = delta_table.to_pyarrow_dataset(as_large_types=True) + if filters: + filter_expr = None + for filt in filters: + if len(filt) == 3: + col, op, val = filt + if op == "=": + expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore + elif op == "in": + expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore + else: + logger.warning( + f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." + ) + # Fallback to table-level filtering + return dataset.to_table()(filters=filters) + + if filter_expr is None: + filter_expr = expr + else: + filter_expr = pc.and_(filter_expr, expr) # type: ignore + + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) + + return dataset.to_table() + + def flush(self) -> None: + """Flush all pending batches.""" + # TODO: capture and re-raise exceptions at the end + for record_key in list(self._pending_batches.keys()): + record_path = tuple(record_key.split("/")) + try: + self.flush_batch(record_path) + except Exception as e: + logger.error(f"Error flushing batch for {record_key}: {e}") + + def flush_batch(self, record_path: tuple[str, ...]) -> None: + """ + Flush pending batch for a specific source path. + + Args: + record_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + record_key = self._get_record_key(record_path) + + if ( + record_key not in self._pending_batches + or not self._pending_batches[record_key] + ): + return + + # Get all pending records + pending_batch = self._pending_batches.pop(record_key) + pending_ids = self._pending_record_ids.pop(record_key) + + try: + # Combine all tables in the batch + combined_table = pending_batch.combine_chunks() + + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_delta_table(record_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {record_key} with {len(combined_table)} records" + ) + else: + delta_table.merge( + source=combined_table, + predicate=f"target.{self.RECORD_ID_COLUMN} = source.{self.RECORD_ID_COLUMN}", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {record_key}" + ) + + # Update cache + self._delta_table_cache[record_key] = DeltaTable(str(table_path)) + + # invalide record id cache + self._invalidate_cache(record_path) + + except Exception as e: + logger.error(f"Error flushing batch for {record_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[record_key] = pending_batch + self._pending_record_ids[record_key] = pending_ids + raise + + class BasicDeltaTableArrowStore: """ A basic Delta Table-based Arrow data store with flexible hierarchical path support. 
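A rough usage sketch of the BatchedDeltaTableArrowStore introduced above (illustrative only: the base path, record path, record ID, and column name are invented, and the calls follow the signatures defined in this patch):

    import pyarrow as pa
    from orcapod.stores.delta_lake_stores import BatchedDeltaTableArrowStore

    # hypothetical store rooted at ./pipeline_data; batch_size controls when auto-flush kicks in
    store = BatchedDeltaTableArrowStore("./pipeline_data", batch_size=10)

    record_path = ("my_pod", "v1")      # maps to ./pipeline_data/my_pod/v1
    record = pa.table({"value": [42]})

    # insert one record; it stays in the pending batch until the batch fills or flush() is called
    store.add_record(record_path, "abc123", record)

    # re-inserting the same record ID raises ValueError unless duplicates are explicitly skipped
    store.add_record(record_path, "abc123", record, skip_duplicates=True)

    store.flush()  # write all pending batches out to the Delta table(s)

    # reads can expose the internal record ID under a caller-chosen column name
    everything = store.get_all_records(record_path, record_id_column="record_id")
    just_one = store.get_record_by_id(record_path, "abc123")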
diff --git a/uv.lock b/uv.lock index ba522ac..03d0483 100644 --- a/uv.lock +++ b/uv.lock @@ -1216,6 +1216,7 @@ dev = [ { name = "deltalake" }, { name = "httpie" }, { name = "ipykernel" }, + { name = "pyarrow-stubs" }, { name = "pyiceberg" }, { name = "pytest" }, { name = "pytest-cov" }, @@ -1244,6 +1245,7 @@ dev = [ { name = "deltalake", specifier = ">=1.0.2" }, { name = "httpie", specifier = ">=3.2.4" }, { name = "ipykernel", specifier = ">=6.29.5" }, + { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-cov", specifier = ">=6.1.1" }, @@ -1546,6 +1548,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/37/40/ad395740cd641869a13bcf60851296c89624662575621968dcfafabaa7f6/pyarrow-20.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:82f1ee5133bd8f49d31be1299dc07f585136679666b502540db854968576faf9", size = 25944982, upload-time = "2025-04-27T12:33:04.72Z" }, ] +[[package]] +name = "pyarrow-stubs" +version = "20.0.0.20250716" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyarrow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/81/0506544eaa9719a4640e7949a1a3614732ab24790a3204dfb74ec5483d74/pyarrow_stubs-20.0.0.20250716.tar.gz", hash = "sha256:8fa8a93a7b7ec3c8d6df8c452628f4351419e8bc44ac45a298d7223d05dcdd0a", size = 236506, upload-time = "2025-07-16T02:28:54.907Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/a1/d0c333111d801c77a83a32f793222c4b9aef7de0fdb2ceb73a1980a6c98b/pyarrow_stubs-20.0.0.20250716-py3-none-any.whl", hash = "sha256:8ecfdd215af468d6b993e2290da7f3d51a32991c1d230b90682f7ee4bc5ee7cd", size = 235661, upload-time = "2025-07-16T02:28:53.394Z" }, +] + [[package]] name = "pycparser" version = "2.22" From 98a1f16bea10d2760314b86ccfa3c466392850bc Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 05:16:32 +0000 Subject: [PATCH 135/224] build: add pyarrow stubs --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index aa23332..d0bbe4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dev = [ "deltalake>=1.0.2", "httpie>=3.2.4", "ipykernel>=6.29.5", + "pyarrow-stubs>=20.0.0.20250716", "pyiceberg>=0.9.1", "pytest>=8.3.5", "pytest-cov>=6.1.1", From 527cdc0207635773bc12f20ec4702f2e737869b8 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 29 Jul 2025 05:44:38 +0000 Subject: [PATCH 136/224] refactor: update store protocol --- src/orcapod/pipeline/nodes.py | 8 ++- src/orcapod/protocols/store_protocols.py | 30 +++++++----- src/orcapod/stores/delta_lake_stores.py | 62 ++++++++++++------------ 3 files changed, 51 insertions(+), 49 deletions(-) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index e6bdc6d..0d33c41 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -262,7 +262,7 @@ def record_pipeline_output(self, output_stream: dp.Stream) -> None: self.pipeline_path, output_table, record_id_column=key_column_name, - ignore_duplicates=True, + skip_duplicates=True, ) def get_all_records( @@ -330,7 +330,7 @@ def add_pipeline_record( self.pipeline_path, entry_id, combined_record, - ignore_duplicates=False, + skip_duplicates=False, ) @@ -372,14 +372,12 @@ def call( packet: dp.Packet, skip_record_check: bool = False, skip_recording: bool = False, - overwrite_existing: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: tag, output_packet = super().call( tag, packet, skip_record_check=skip_record_check, skip_recording=skip_recording, - overwrite_existing=overwrite_existing, ) if output_packet is not None: retrieved = ( @@ -433,7 +431,7 @@ def add_pipeline_record( self.pipeline_path, entry_id, combined_record, - ignore_duplicates=False, + skip_duplicates=False, ) def get_all_records( diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index 4940033..1f0fd16 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -1,6 +1,8 @@ -from typing import Protocol +from typing import Protocol, TYPE_CHECKING from collections.abc import Collection -import pyarrow as pa + +if TYPE_CHECKING: + import pyarrow as pa class ArrowDataStore(Protocol): @@ -8,32 +10,33 @@ def add_record( self, record_path: tuple[str, ...], record_id: str, - data: pa.Table, - ignore_duplicates: bool | None = None, - overwrite_existing: bool = False, - ) -> str | None: ... + record: "pa.Table", + skip_duplicates: bool = False, + flush: bool = False, + ) -> None: ... def add_records( self, record_path: tuple[str, ...], - records: pa.Table, + records: "pa.Table", record_id_column: str | None = None, - ignore_duplicates: bool | None = None, - overwrite_existing: bool = False, - ) -> list[str]: ... + skip_duplicates: bool = False, + flush: bool = False, + ) -> None: ... def get_record_by_id( self, record_path: tuple[str, ...], record_id: str, record_id_column: str | None = None, - ) -> pa.Table | None: ... + flush: bool = False, + ) -> "pa.Table | None": ... def get_all_records( self, record_path: tuple[str, ...], record_id_column: str | None = None, - ) -> pa.Table | None: + ) -> "pa.Table | None": """Retrieve all records for a given path as a stream.""" ... @@ -42,7 +45,8 @@ def get_records_by_ids( record_path: tuple[str, ...], record_ids: Collection[str], record_id_column: str | None = None, - ) -> pa.Table: ... + flush: bool = False, + ) -> "pa.Table | None": ... 
def flush(self) -> None: """Flush any buffered writes to the underlying storage.""" diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 0a316e4..12a528c 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -276,16 +276,16 @@ def add_record( self, record_path: tuple[str, ...], record_id: str, - data: "pa.Table", - schema_handling: Literal["merge", "error", "coerce"] = "error", + record: "pa.Table", skip_duplicates: bool = False, flush: bool = False, - ) -> "pa.Table | None": - data_with_record_id = self._ensure_record_id_column(data, record_id) - return self.add_records( + schema_handling: Literal["merge", "error", "coerce"] = "error", + ) -> None: + data_with_record_id = self._ensure_record_id_column(record, record_id) + self.add_records( record_path=record_path, - arrow_table=data_with_record_id, - id_column=self.RECORD_ID_COLUMN, + records=data_with_record_id, + record_id_column=self.RECORD_ID_COLUMN, schema_handling=schema_handling, skip_duplicates=skip_duplicates, flush=flush, @@ -294,12 +294,12 @@ def add_record( def add_records( self, record_path: tuple[str, ...], - arrow_table: pa.Table, - id_column: str, - schema_handling: Literal["merge", "error", "coerce"] = "error", + records: pa.Table, + record_id_column: str | None = None, skip_duplicates: bool = False, flush: bool = False, - ) -> "pa.Table | None": + schema_handling: Literal["merge", "error", "coerce"] = "error", + ) -> None: """ Insert new records. By default, never overwrites existing records. @@ -315,23 +315,26 @@ def add_records( Raises: ValueError: If any record IDs already exist and skip_duplicates=False """ - if arrow_table.num_rows == 0: + if records.num_rows == 0: return - # Step 1: Validate that id column exist - if id_column not in arrow_table.column_names: + if record_id_column is None: + record_id_column = records.column_names[0] + + # Step 1: Validate that record ID column exist + if record_id_column not in records.column_names: raise ValueError( - f"Specified ID column {id_column} not found in input table {arrow_table.column_names}" + f"Specified record ID column {record_id_column} not found in input table {records.column_names}" ) - # rename ID column to a standard name - if id_column != self.RECORD_ID_COLUMN: - rename_map = {id_column: self.RECORD_ID_COLUMN} - total_name_map = {k: rename_map.get(k, k) for k in arrow_table.column_names} - arrow_table = arrow_table.rename_columns(total_name_map) + # rename record ID column to a standard name + if record_id_column != self.RECORD_ID_COLUMN: + rename_map = {record_id_column: self.RECORD_ID_COLUMN} + total_name_map = {k: rename_map.get(k, k) for k in records.column_names} + records = records.rename_columns(total_name_map) # Step 2: Deduplicate within input table (keep last occurrence) - deduplicated_table = self._deduplicate_within_table(arrow_table) + deduplicated_table = self._deduplicate_within_table(records) # Step 3: Handle conflicts based on skip_duplicates setting if skip_duplicates: @@ -637,7 +640,7 @@ def get_record_by_id( def get_records_by_ids( self, record_path: tuple[str, ...], - record_ids: "list[str] | pl.Series | pa.Array", + record_ids: "Collection[str] | pl.Series | pa.Array", record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": @@ -658,22 +661,19 @@ def get_records_by_ids( self.flush_batch(record_path) # Convert input to list of strings for consistency - if isinstance(record_ids, list): - if not record_ids: - return None 
- record_ids_list = record_ids - elif isinstance(record_ids, pl.Series): - if len(record_ids) == 0: - return None + + if isinstance(record_ids, pl.Series): record_ids_list = cast(list[str], record_ids.to_list()) elif isinstance(record_ids, (pa.Array, pa.ChunkedArray)): - if len(record_ids) == 0: - return None record_ids_list = cast(list[str], record_ids.to_pylist()) + elif isinstance(record_ids, Collection): + record_ids_list = list(record_ids) else: raise TypeError( f"record_ids must be list[str], pl.Series, or pa.Array, got {type(record_ids)}" ) + if len(record_ids) == 0: + return None # check inside the batch delta_table = self._get_delta_table(record_path) From ea85bcf224e001231bcd6d6aeeed1c11f8fc66b2 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 06:20:42 +0000 Subject: [PATCH 137/224] fix: cleanup nodes --- src/orcapod/pipeline/nodes.py | 159 ---------------------------------- 1 file changed, 159 deletions(-) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index b0e8f1d..8566d99 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,37 +1,8 @@ -from abc import abstractmethod -from ast import Not -from abc import abstractmethod -from ast import Not from collections.abc import Collection, Iterator from datetime import datetime from orcapod.data.kernels import WrappedKernel, TrackedKernelBase from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import PodStream -from orcapod.types import TypeSpec -from orcapod.utils.lazy_module import LazyModule -from typing import TYPE_CHECKING, Any -from orcapod.data.system_constants import orcapod_constants as constants -from orcapod.utils import arrow_utils - -if TYPE_CHECKING: - import pyarrow as pa - import polars as pl - import pandas as pd -else: - pa = LazyModule("pyarrow") - pl = LazyModule("polars") - pd = LazyModule("pandas") - - -class Node( - TrackedKernelBase, -): -from datetime import datetime -from orcapod.data.kernels import WrappedKernel, TrackedKernelBase -from orcapod.data.pods import ArrowDataStore, CachedPod -from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import PodStream from orcapod.types import TypeSpec from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING, Any @@ -53,7 +24,6 @@ class Node( ): """ Mixin class for pipeline nodes - Mixin class for pipeline nodes """ def __init__( @@ -61,9 +31,6 @@ def __init__( input_streams: Collection[dp.Stream], pipeline_store: ArrowDataStore, pipeline_path_prefix: tuple[str, ...] = (), - input_streams: Collection[dp.Stream], - pipeline_store: ArrowDataStore, - pipeline_path_prefix: tuple[str, ...] 
= (), **kwargs, ): super().__init__(**kwargs) @@ -76,27 +43,12 @@ def __init__( self.invocation_hash = self.data_context.object_hasher.hash_to_hex( self.identity_structure(()), prefix_hasher_id=True ) - ): - super().__init__(**kwargs) - self._cached_stream: dp.LiveStream | None = None - self.input_streams = tuple(input_streams) - self.pipeline_store = pipeline_store - self.pipeline_path_prefix = pipeline_path_prefix - # compute invocation hash - note that empty () is passed into identity_structure to signify - # identity structure of invocation with no input streams - self.invocation_hash = self.data_context.object_hasher.hash_to_hex( - self.identity_structure(()), prefix_hasher_id=True - ) @property def contained_kernel(self) -> dp.Kernel: raise NotImplementedError( "This property should be implemented by subclasses to return the contained kernel." ) - def contained_kernel(self) -> dp.Kernel: - raise NotImplementedError( - "This property should be implemented by subclasses to return the contained kernel." - ) @property def tag_keys(self) -> tuple[str, ...]: @@ -106,13 +58,6 @@ def tag_keys(self) -> tuple[str, ...]: """ tag_keys, _ = self.keys() return tag_keys - def tag_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. - """ - tag_keys, _ = self.keys() - return tag_keys @property def packet_keys(self) -> tuple[str, ...]: @@ -123,38 +68,18 @@ def packet_keys(self) -> tuple[str, ...]: # TODO: consider caching this _, packet_keys = self.keys() return packet_keys - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - # TODO: consider caching this - _, packet_keys = self.keys() - return packet_keys @property - def pipeline_path(self) -> tuple[str, ...]: def pipeline_path(self) -> tuple[str, ...]: """ Return the path to the pipeline run records. This is used to store the run-associated tag info. - Return the path to the pipeline run records. - This is used to store the run-associated tag info. """ return self.pipeline_path_prefix + self.kernel_id + (self.invocation_hash,) def validate_inputs(self, *processed_streams: dp.Stream) -> None: pass - def forward(self, *streams: dp.Stream) -> dp.Stream: - if len(streams) > 0: - raise NotImplementedError( - "At this moment, Node does not yet support handling additional input streams." 
- return self.pipeline_path_prefix + self.kernel_id + (self.invocation_hash,) - - def validate_inputs(self, *processed_streams: dp.Stream) -> None: - pass - def forward(self, *streams: dp.Stream) -> dp.Stream: if len(streams) > 0: raise NotImplementedError( @@ -180,58 +105,7 @@ def upstreams(self) -> tuple[dp.Stream, ...]: def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: return self().keys() - # TODO: re-evaluate the use here - # super().validate_inputs(*self.input_streams) - return super().forward(*self.input_streams) - - def __call__(self, *args, **kwargs) -> dp.LiveStream: - if self._cached_stream is None: - self._cached_stream = super().__call__(*args, **kwargs) - return self._cached_stream - # properties and methods to act as a dp.Stream - @property - def source(self) -> dp.Kernel | None: - return self - - @property - def upstreams(self) -> tuple[dp.Stream, ...]: - return () - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - return self().keys() - - def types(self) -> tuple[TypeSpec, TypeSpec]: - return self().types() - - @property - def last_modified(self) -> datetime | None: - return self().last_modified - - @property - def is_current(self) -> bool: - return self().is_current - - def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - return self().__iter__() - - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - return self().iter_packets() - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_content_hash: bool | str = False, - ) -> "pa.Table": - return self().as_table( - include_data_context=include_data_context, - include_source=include_source, - include_content_hash=include_content_hash, - ) - - def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: - return self().flow() def types(self) -> tuple[TypeSpec, TypeSpec]: return self().types() @@ -277,25 +151,6 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An ) return self.contained_kernel.identity_structure(self.input_streams) - def get_all_records( - self, include_system_columns: bool = False - ) -> "pa.Table | None": - """ - Retrieve all records associated with the node. - If include_system_columns is True, system columns will be included in the result. - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - """ - Return the identity structure of the node. - This is used to compute the invocation hash. - """ - # construct identity structure from the node's information and the - # contained kernel - if streams is not None and len(streams) > 0: - raise NotImplementedError( - "At this moment, Node does not yet support handling additional input streams." - ) - return self.contained_kernel.identity_structure(self.input_streams) - def get_all_records( self, include_system_columns: bool = False ) -> "pa.Table | None": @@ -304,7 +159,6 @@ def get_all_records( If include_system_columns is True, system columns will be included in the result. 
""" raise NotImplementedError("This method should be implemented by subclasses.") - raise NotImplementedError("This method should be implemented by subclasses.") @property def lazy(self) -> "pl.LazyFrame | None": @@ -312,11 +166,6 @@ def lazy(self) -> "pl.LazyFrame | None": if records is not None: return pl.LazyFrame(records) return None - def lazy(self) -> "pl.LazyFrame | None": - records = self.get_all_records(include_system_columns=False) - if records is not None: - return pl.LazyFrame(records) - return None @property def df(self) -> "pl.DataFrame | None": @@ -360,17 +209,9 @@ def __init__( input_streams: Collection[dp.Stream], pipeline_store: ArrowDataStore, pipeline_path_prefix: tuple[str, ...] = (), - kernel: dp.Kernel, - input_streams: Collection[dp.Stream], - pipeline_store: ArrowDataStore, - pipeline_path_prefix: tuple[str, ...] = (), **kwargs, ) -> None: super().__init__( - kernel=kernel, - input_streams=input_streams, - pipeline_store=pipeline_store, - pipeline_path_prefix=pipeline_path_prefix, kernel=kernel, input_streams=input_streams, pipeline_store=pipeline_store, From 773d5cab7bcc1898ff098557fbabbd45f3608898 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 17:56:38 +0000 Subject: [PATCH 138/224] refactor: update protocol over modification time --- src/orcapod/__init__.py | 4 ++-- src/orcapod/data/kernels.py | 28 +++++++++++++++++++++++-- src/orcapod/data/streams.py | 25 ++++++++++++++++++++-- src/orcapod/protocols/data_protocols.py | 8 +++++++ 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index b49b19c..dd5eb05 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,7 +1,7 @@ from .data import DEFAULT_TRACKER_MANAGER from .data.pods import function_pod, FunctionPod, CachedPod from .data import streams -from .stores.delta_lake_stores import BasicDeltaTableArrowStore +from .stores.delta_lake_stores import BatchedDeltaTableArrowStore no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking @@ -13,5 +13,5 @@ "FunctionPod", "CachedPod", "streams", - "BasicDeltaTableArrowStore", + "BatchedDeltaTableArrowStore", ] diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 1cda423..755b5b7 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from collections.abc import Collection +from datetime import datetime, timezone from typing import Any from orcapod.protocols import data_protocols as dp import logging @@ -39,6 +40,8 @@ def __init__( self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER + self._last_modified = None + self._set_modified_time() @property def data_context(self) -> DataContext: @@ -49,6 +52,26 @@ def data_context_key(self) -> str: """Return the data context key.""" return self._data_context.context_key + @property + def last_modified(self) -> datetime | None: + """ + When the kernel was last modified. For most kernels, this is the timestamp + of the kernel creation. + """ + return self._last_modified + + def _set_modified_time( + self, timestamp: datetime | None = None, invalidate: bool = False + ) -> None: + if invalidate: + self._last_modified = None + return + + if timestamp is not None: + self._last_modified = timestamp + else: + self._last_modified = datetime.now(timezone.utc) + @property @abstractmethod def kernel_id(self) -> tuple[str, ...]: ... 
@@ -68,7 +91,7 @@ def validate_inputs(self, *streams: dp.Stream) -> None: ... def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None - ) -> dp.LiveStream: + ) -> KernelStream: """ Prepare the output stream for the kernel invocation. This method is called after the main computation is performed. @@ -86,7 +109,7 @@ def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> Non def __call__( self, *streams: dp.Stream, label: str | None = None, **kwargs - ) -> dp.LiveStream: + ) -> KernelStream: processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) output_stream = self.prepare_output_stream(*processed_streams, label=label) @@ -139,6 +162,7 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An # This can be achieved, for example, by returning a set over the streams instead of a tuple. if streams is not None: streams = self.pre_kernel_processing(*streams) + self.validate_inputs(*streams) return self.kernel_identity_structure(streams) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index f0178d5..f8076db 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -55,6 +55,9 @@ def __init__( self._set_modified_time() # note that this is not necessary for Stream protocol, but is provided # for convenience to resolve semantic types and other context-specific information + if data_context is None and source is not None: + # if source is provided, use its data context + data_context = source.data_context_key self._data_context = DataContext.resolve_data_context(data_context) @property @@ -114,6 +117,14 @@ def is_current(self) -> bool: # If there is no last_modified timestamp, we cannot determine if the stream is current return False + # check if the source kernel has been modified + if self.source is not None and ( + self.source.last_modified is None + or self.source.last_modified > self.last_modified + ): + return False + + # check if all upstreams are current for upstream in self.upstreams: if ( not upstream.is_current @@ -202,18 +213,28 @@ def __init__( ) -> None: super().__init__(source=source, upstreams=upstreams, **kwargs) - table, data_context_table = arrow_utils.split_by_column_groups( + data_table, data_context_table = arrow_utils.split_by_column_groups( table, [constants.CONTEXT_KEY] ) + if data_table is None: + # TODO: provide better error message + raise ValueError( + "Table must contain at least one column to be used as a stream." + ) + if data_context_table is None: data_context_table = pa.table( - {constants.CONTEXT_KEY: pa.nulls(len(table), pa.large_string())} + {constants.CONTEXT_KEY: pa.nulls(len(data_table), pa.large_string())} ) prefix_info = {constants.SOURCE_PREFIX: source_info} # determine tag columns first and then exclude any source info self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) + if delta := set(tag_columns) - set(self._tag_columns): + raise ValueError( + f"Specified tag columns {delta} are not present in the table." + ) table, prefix_tables = arrow_utils.prepare_prefixed_columns( table, prefix_info, exclude_columns=self._tag_columns ) diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index c262fb6..d334e14 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1283,6 +1283,14 @@ def data_context_key(self) -> str: """ ... 
+ @property + def last_modified(self) -> datetime | None: + """ + When the kernel was last modified. For most kernels, this is the timestamp + of the kernel creation. + """ + ... + def __call__( self, *streams: Stream, label: str | None = None, **kwargs ) -> LiveStream: From 598a5cbb67791c8457d52e7faf115a10bca8a1fa Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 17:56:50 +0000 Subject: [PATCH 139/224] wip: implementation of manual table source --- src/orcapod/data/sources.py | 296 ++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 src/orcapod/data/sources.py diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py new file mode 100644 index 0000000..ec60cb1 --- /dev/null +++ b/src/orcapod/data/sources.py @@ -0,0 +1,296 @@ +from abc import abstractmethod + +from pyarrow.lib import Table +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp +from collections.abc import Collection, Iterator +from orcapod.data.streams import ImmutableTableStream +from typing import TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from orcapod.types import TypeSpec +from datetime import datetime +from orcapod.types import schemas +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError +from pathlib import Path + + +if TYPE_CHECKING: + import polars as pl + import pandas as pd + import pyarrow as pa +else: + pl = LazyModule("polars") + pd = LazyModule("pandas") + pa = LazyModule("pyarrow") + + +class SourceBase(TrackedKernelBase): + @property + def tag_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + tag_keys, _ = self.keys() + return tag_keys + + @property + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + # TODO: consider caching this + _, packet_keys = self.keys() + return packet_keys + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> "pa.Table": + return self().as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + + def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + return self().flow() + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all records associated with the node. + If include_system_columns is True, system columns will be included in the result. + """ + raise NotImplementedError("This method should be implemented by subclasses.") + + @property + def lazy(self) -> "pl.LazyFrame | None": + records = self.get_all_records(include_system_columns=False) + if records is not None: + return pl.LazyFrame(records) + return None + + @property + def df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + lazy_df = self.lazy + if lazy_df is not None: + return lazy_df.collect() + return None + + @property + def polars_df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + return self.df + + @property + def pandas_df(self) -> "pd.DataFrame | None": + """ + Return the pandas DataFrame representation of the pod's records. 
+ """ + records = self.get_all_records(include_system_columns=False) + if records is not None: + pandas_df = records.to_pandas() + pandas_df.set_index(list(self.tag_keys), inplace=True) + return pandas_df + return None + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + tag_types, packet_types = self.output_types() + return tuple(tag_types.keys()), tuple(packet_types.keys()) + + def types(self) -> tuple[TypeSpec, TypeSpec]: + return self.output_types() + + def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self().__iter__() + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + return self().iter_packets() + + # properties and methods to act as a dp.Stream + @property + def source(self) -> dp.Kernel | None: + return self + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + return () + + def validate_inputs(self, *processed_streams: dp.Stream) -> None: + pass + + +class ManualDeltaTableSource(SourceBase): + """ + A source that allows manual delta updates to a table. + This is useful for testing and debugging purposes. + """ + + def __init__( + self, + table_path: str | Path, + schema: TypeSpec | None = None, + tag_columns: Collection[str] | None = None, + **kwargs, + ) -> None: + """ + Initialize the ManualDeltaTableSource with a label and optional data context. + """ + super().__init__(**kwargs) + + self.table_path = table_path + self._delta_table: DeltaTable | None = None + self.refresh() + + if self._delta_table is None: + if schema is None: + raise ValueError( + "Delta table not found and no schema provided. " + "Please provide a valid Delta table path or a schema to create a new table." + ) + if tag_columns is None: + raise ValueError( + "At least one tag column must be provided when creating a new Delta table." + ) + python_schema = schemas.PythonSchema(schema) + arrow_schema = python_schema.to_arrow_schema( + self.data_context.semantic_type_registry + ) + fields = [] + for field in arrow_schema: + if field.name in tag_columns: + field = field.with_metadata({b"table": b"True"}) + fields.append(field) + arrow_schema = pa.schema(fields) + + else: + arrow_schema = pa.schema(self._delta_table.schema().to_arrow()) # type: ignore + python_schema = schemas.PythonSchema.from_arrow_schema( + arrow_schema, self.data_context.semantic_type_registry + ) + inferred_tag_columns = [] + for field in arrow_schema: + if ( + field.metadata is not None + and field.metadata.get(b"table", b"False").decode().lower() + == "true" + ): + inferred_tag_columns.append(field.name) + tag_columns = tag_columns or inferred_tag_columns + self.python_schema = python_schema + self.arrow_schema = arrow_schema + self.tag_columns = tag_columns + + self._is_current = True + + @property + def delta_table_version(self) -> int | None: + """ + Return the version of the delta table. + If the table does not exist, return None. 
+ """ + if self._delta_table is not None: + return self._delta_table.version() + return None + + @property + def is_current(self) -> bool: + return self._is_current + + def forward(self, *streams: dp.Stream) -> dp.Stream: + if len(streams) > 0: + raise ValueError("ManualDeltaTableSource takes no input streams") + if self._delta_table is None: + arrow_data = pa.Table.from_pylist([], schema=self.arrow_schema) + else: + arrow_data = self._delta_table.to_pyarrow_dataset( + as_large_types=True + ).to_table() + return ImmutableTableStream(arrow_data, self.tag_columns, source=self) + + @property + def kernel_id(self) -> tuple[str, ...]: + return (self.__class__.__name__, str(self.table_path)) + + def kernel_identity_structure(self, streams: Collection[dp.Stream] | None = None): + """ + Return the identity structure of the kernel. + This is a unique identifier for the kernel based on its class name and table path. + """ + return (self.__class__.__name__, str(self.table_path)) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + tag_types: TypeSpec = {} + packet_types: TypeSpec = {} + for field, field_type in self.python_schema.items(): + if field in self.tag_columns: + tag_types[field] = field_type + else: + packet_types[field] = field_type + return tag_types, packet_types + + def get_all_records(self, include_system_columns: bool = False) -> Table | None: + if self._delta_table is None: + return None + arrow_data = self._delta_table.to_pyarrow_dataset( + as_large_types=True + ).to_table() + if not include_system_columns: + arrow_data = arrow_data.drop( + [col for col in arrow_data.column_names if col.startswith("_")] + ) + return arrow_data + + def insert( + self, + data: "dict | pa.Table | pl.DataFrame | pd.DataFrame", + ) -> None: + """ + Insert data into the delta table. + """ + if isinstance(data, dict): + data = pa.Table.from_pylist([data], schema=self.arrow_schema) + elif isinstance(data, pl.DataFrame): + data = data.to_arrow() + elif isinstance(data, pd.DataFrame): + data = pa.Table.from_pandas(data, schema=self.arrow_schema) + + self._set_modified_time() + write_deltalake( + self.table_path, + data, + mode="append", + ) + + # update the delta table + self._delta_table = DeltaTable(self.table_path) + + def refresh(self) -> None: + """ + Refresh the delta table to ensure it is up-to-date. + """ + current_version = self.delta_table_version + try: + delta_table = DeltaTable(self.table_path) + except TableNotFoundError: + delta_table = None + new_version = self.delta_table_version + if (current_version is None and new_version is not None) or ( + current_version is not None + and new_version is not None + and current_version < new_version + ): + # delta table has been updated + self._set_modified_time() + self._delta_table = delta_table From 76e47714d8b29ac0fdba5e45c0cae9c211283488 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 29 Jul 2025 17:57:02 +0000 Subject: [PATCH 140/224] feat: working join operator --- src/orcapod/data/operators.py | 208 +++++++++++++++++++--------------- 1 file changed, 115 insertions(+), 93 deletions(-) diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index f10bb2e..4c614eb 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -4,9 +4,16 @@ from orcapod.types import TypeSpec from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs from abc import abstractmethod -from typing import Any +from typing import Any, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from collections.abc import Collection from orcapod.errors import InputValidationError +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + class Operator(TrackedKernelBase): """ @@ -25,21 +32,9 @@ class NonZeroInputOperator(Operator): such as joins, unions, etc. """ - def validate_inputs(self, *streams: dp.Stream) -> None: - self.verify_non_zero_input(*streams) - return self.op_validate_inputs(*streams) - - @abstractmethod - def op_validate_inputs(self, *streams: dp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - ... - def verify_non_zero_input( self, - *streams: dp.Stream, + streams: Collection[dp.Stream], ) -> None: """ Check that the inputs to the variable inputs operator are valid. @@ -50,6 +45,10 @@ def verify_non_zero_input( f"Operator {self.__class__.__name__} requires at least one input stream." ) + def validate_inputs(self, *streams: dp.Stream) -> None: + self.verify_non_zero_input(streams) + return self.op_validate_inputs(*streams) + def forward(self, *streams: dp.Stream) -> dp.Stream: """ Forward method for variable inputs operators. @@ -57,18 +56,25 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: """ return self.op_forward(*streams) - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - self.validate_inputs(*streams) + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: return self.op_output_types(*streams) - def identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: """ Return a structure that represents the identity of this operator. This is used to ensure that the operator can be uniquely identified in the computational graph. """ - if len(streams) > 0: - self.verify_non_zero_input(*streams) - return self.op_identity_structure(*streams) + return self.op_identity_structure(streams) + + @abstractmethod + def op_validate_inputs(self, *streams: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... @abstractmethod def op_forward(self, *streams: dp.Stream) -> dp.Stream: @@ -87,7 +93,9 @@ def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... @abstractmethod - def op_identity_structure(self, *streams: dp.Stream) -> Any: + def op_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: """ This method should be implemented by subclasses to return a structure that represents the identity of the operator. 
It takes zero or more streams as input and returns a tuple containing the operator name and a set of streams. @@ -102,52 +110,55 @@ class BinaryOperator(Operator): Base class for all operators. """ - def validate_inputs(self, *streams: dp.Stream) -> None: - self.check_binary_inputs(*streams) - left_stream, right_stream = streams - return self.op_validate_inputs(left_stream, right_stream) - - @abstractmethod - def op_validate_inputs( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - ... - def check_binary_inputs( - self, *streams: dp.Stream, allow_zero: bool = False + self, + streams: Collection[dp.Stream], ) -> None: """ Check that the inputs to the binary operator are valid. This method is called before the forward method to ensure that the inputs are valid. """ - if not (allow_zero and len(streams) == 0) and len(streams) != 2: + if len(streams) != 2: raise ValueError("BinaryOperator requires exactly two input streams.") + def validate_inputs(self, *streams: dp.Stream) -> None: + self.check_binary_inputs(streams) + left_stream, right_stream = streams + return self.op_validate_inputs(left_stream, right_stream) + def forward(self, *streams: dp.Stream) -> dp.Stream: """ Forward method for binary operators. It expects exactly two streams as input. """ - self.check_binary_inputs(*streams) left_stream, right_stream = streams return self.op_forward(left_stream, right_stream) - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - self.check_binary_inputs(*streams) + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: left_stream, right_stream = streams return self.op_output_types(left_stream, right_stream) - def identity_structure(self, *streams: dp.Stream) -> Any: + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: """ Return a structure that represents the identity of this operator. This is used to ensure that the operator can be uniquely identified in the computational graph. """ - self.check_binary_inputs(*streams, allow_zero=True) - return self.op_identity_structure(*streams) + if streams is not None: + left_stream, right_stream = streams + self.op_identity_structure(left_stream, right_stream) + return self.op_identity_structure() + + @abstractmethod + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... 
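# Illustrative sketch (not part of this patch): a concrete BinaryOperator only
# needs to fill in the op_* hooks declared here; the arity check, output typing
# and identity plumbing are inherited from the base classes. The KeepLeft
# operator below is hypothetical and exists purely to make the hook contract
# concrete.
class KeepLeft(BinaryOperator):
    """Pass the left stream through unchanged, ignoring the right stream."""

    @property
    def kernel_id(self) -> tuple[str, ...]:
        return (self.__class__.__name__,)

    def op_validate_inputs(
        self, left_stream: dp.Stream, right_stream: dp.Stream
    ) -> None:
        # Any pair of streams is acceptable for this sketch.
        pass

    def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream:
        return left_stream

    def op_output_types(
        self, left_stream: dp.Stream, right_stream: dp.Stream
    ) -> tuple[TypeSpec, TypeSpec]:
        return left_stream.types()

    def op_identity_structure(self, *streams: dp.Stream) -> Any:
        # Identity depends only on the class name plus the input streams, if given.
        return (self.__class__.__name__,) + ((tuple(streams),) if streams else ())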
@abstractmethod def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: @@ -248,73 +259,84 @@ def kernel_id(self) -> tuple[str, ...]: """ return (f"{self.__class__.__name__}",) - def op_identity_structure(self, *streams: dp.Stream) -> Any: - # Join does not depend on the order of the streams -- convert it onto a set - id_struct = (self.__class__.__name__,) - if len(streams) > 0: - id_struct += (set(streams),) - return id_struct + def op_validate_inputs(self, *streams: dp.Stream) -> None: + try: + self.op_output_types(*streams) + except Exception as e: + raise InputValidationError(f"Input streams are not compatible: {e}") + + def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + if len(streams) == 1: + # If only one stream is provided, return its typespecs + return streams[0].types() + + stream = streams[0] + tag_typespec, packet_typespec = stream.types() + for other_stream in streams[1:]: + other_tag_typespec, other_packet_typespec = other_stream.types() + tag_typespec = union_typespecs(tag_typespec, other_tag_typespec) + packet_typespec = union_typespecs(packet_typespec, other_packet_typespec) + + return tag_typespec, packet_typespec - def op_forward(self, *streams: dp.Stream) -> ImmutableTableStream: + def op_forward(self, *streams: dp.Stream) -> dp.Stream: """ Joins two streams together based on their tags. The resulting stream will contain all the tags from both streams. """ + if len(streams) == 1: + return streams[0] - all_tag_typespecs = [] - all_packet_typespecs = [] + COMMON_JOIN_KEY = "_common" - joined_stream = streams[0] - for stream in streams[1:]: - joined_tag_typespec, joined_packet_typespec = joined_stream.types() - stream_tag_typespec, stream_packet_typespec = stream.types() - joined_table = joined_stream.as_table().join( - stream.as_table(), - keys=intersection_typespecs(joined_tag_typespec, stream_tag_typespec), - join_type="inner", - ) + stream = streams[0] - for stream in streams: - tag_typespec, packet_typespec = stream.types() - all_tag_typespecs.append(tag_typespec) - all_packet_typespecs.append(packet_typespec) + tag_keys, _ = [set(k) for k in stream.keys()] + table = stream.as_table(include_source=True) + # trick to get cartesian product + table = table.add_column(0, COMMON_JOIN_KEY, pa.array([0] * len(table))) - common_tag_keys = tuple(intersection_typespecs(*all_tag_typespecs).keys()) - joined_tag_keys = tuple(union_typespecs(*all_tag_typespecs).keys()) + for next_stream in streams[1:]: + next_tag_keys, _ = next_stream.keys() + next_table = next_stream.as_table(include_source=True) + next_table = next_table.add_column( + 0, COMMON_JOIN_KEY, pa.array([0] * len(next_table)) + ) + common_tag_keys = tag_keys.intersection(next_tag_keys) + common_tag_keys.add(COMMON_JOIN_KEY) - # performing a check to ensure that packets are compatible - union_typespecs(*all_packet_typespecs) + table = table.join( + next_table, keys=list(common_tag_keys), join_type="inner" + ) + tag_keys.update(next_tag_keys) - joined_table = left_stream.as_table().join( - right_stream.as_table(), - keys=common_tag_keys, - join_type="inner", - ) + # reorder columns to bring tag columns to the front + # TODO: come up with a better algorithm + table = table.drop(COMMON_JOIN_KEY) + reordered_columns = [col for col in table.column_names if col in tag_keys] + reordered_columns += [col for col in table.column_names if col not in tag_keys] return ImmutableTableStream( - joined_table, - tag_columns=tuple(joined_tag_keys), + table.select(reordered_columns), + 
tag_columns=tuple(tag_keys), source=self, upstreams=streams, ) - def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - left_stream, right_stream = streams - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() - joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) - joined_packet_typespec = union_typespecs( - left_packet_typespec, right_packet_typespec + def op_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return ( + (self.__class__.__name__,) + (set(streams),) if streams is not None else () ) - return joined_tag_typespec, joined_packet_typespec - - def op_validate_inputs( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> None: - try: - self.op_output_types(left_stream, right_stream) - except Exception as e: - raise InputValidationError(f"Input streams are not compatible: {e}") def __repr__(self) -> str: return "Join()" + + +def op_identity_structure(self, *streams: dp.Stream) -> Any: + # Join does not depend on the order of the streams -- convert it onto a set + id_struct = (self.__class__.__name__,) + if len(streams) > 0: + id_struct += (set(streams),) + return id_struct From cd9cebbb08395788bb397fbb5af5eae582d436db Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 22:01:09 +0000 Subject: [PATCH 141/224] fix: accidental trigger of computation and add tutorial --- .../01_orcapod_quick_exploration.ipynb | 1517 +++++++++++++++++ src/orcapod/__init__.py | 8 +- src/orcapod/data/kernels.py | 54 +- src/orcapod/data/operators.py | 3 +- src/orcapod/data/streams.py | 15 +- src/orcapod/data/trackers.py | 10 + src/orcapod/pipeline/__init__.py | 6 + src/orcapod/pipeline/graph.py | 7 + src/orcapod/pipeline/nodes.py | 16 +- src/orcapod/stores/__init__.py | 2 + 10 files changed, 1594 insertions(+), 44 deletions(-) create mode 100644 notebooks/tutorials/01_orcapod_quick_exploration.ipynb diff --git a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb new file mode 100644 index 0000000..22bce61 --- /dev/null +++ b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb @@ -0,0 +1,1517 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "27cdd37d", + "metadata": {}, + "outputs": [], + "source": [ + "import orcapod as op\n", + "import shutil" + ] + }, + { + "cell_type": "markdown", + "id": "14852fb6", + "metadata": {}, + "source": [ + "We will also make heavy use of PyArrow:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e6a9e8b6", + "metadata": {}, + "outputs": [], + "source": [ + "import pyarrow as pa" + ] + }, + { + "cell_type": "markdown", + "id": "f0157ee4", + "metadata": {}, + "source": [ + "### Preparing the environment" + ] + }, + { + "cell_type": "markdown", + "id": "4773c9d7", + "metadata": {}, + "source": [ + "In this notebook, we will create a local directory called `pipeline_data` and store results in there. To make sure we get reproducibile results, we start by making sure that this directory does not exist locally." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "420477e8", + "metadata": {}, + "outputs": [], + "source": [ + "shutil.rmtree(\"./pipeline_data\", ignore_errors=True)" + ] + }, + { + "cell_type": "markdown", + "id": "64f69204", + "metadata": {}, + "source": [ + "### Creating streams" + ] + }, + { + "cell_type": "markdown", + "id": "c86895f3", + "metadata": {}, + "source": [ + "At the moment, there is only one way to create stream and that is by wrapping a PyArrow table." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dab6bf9c", + "metadata": {}, + "outputs": [], + "source": [ + "table = pa.Table.from_pydict(\n", + " {\n", + " \"a\": [1, 2, 3],\n", + " \"b\": [\"x\", \"y\", \"z\"],\n", + " \"c\": [True, False, True],\n", + " \"d\": [1.1, 2.2, 3.3],\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c2ac8f32", + "metadata": {}, + "source": [ + "Use `op.streams.ImmutableTableStream` to turn table into a stream. You will also have to specify which columns are the tags." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cd0394d8", + "metadata": {}, + "outputs": [], + "source": [ + "stream = op.streams.ImmutableTableStream(table, tag_columns=[\"a\", \"b\"])" + ] + }, + { + "cell_type": "markdown", + "id": "93ac78cc", + "metadata": {}, + "source": [ + "### Working with streams" + ] + }, + { + "cell_type": "markdown", + "id": "08a854e7", + "metadata": {}, + "source": [ + "Once you have a stream, you can iterate through tag, packet pair:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2d4a0812", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'a': 1, 'b': 'x'}, Packet: {'c': True, 'd': 1.1}\n", + "Tag: {'a': 2, 'b': 'y'}, Packet: {'c': False, 'd': 2.2}\n", + "Tag: {'a': 3, 'b': 'z'}, Packet: {'c': True, 'd': 3.3}\n" + ] + } + ], + "source": [ + "for tag, packet in stream:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "id": "41c7876b", + "metadata": {}, + "source": [ + "You can also get all tag packet pairs as a list of tuples by calling `.flow()`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "79e67bfc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(ArrowTag(data={'a': 1, 'b': 'x'}, meta_columns=0, context='std:v0.1.0:default'),\n", + " ArrowPacket(data={'c': True, 'd': 1.1}, meta_columns=0, context='std:v0.1.0:default')),\n", + " (ArrowTag(data={'a': 2, 'b': 'y'}, meta_columns=0, context='std:v0.1.0:default'),\n", + " ArrowPacket(data={'c': False, 'd': 2.2}, meta_columns=0, context='std:v0.1.0:default')),\n", + " (ArrowTag(data={'a': 3, 'b': 'z'}, meta_columns=0, context='std:v0.1.0:default'),\n", + " ArrowPacket(data={'c': True, 'd': 3.3}, meta_columns=0, context='std:v0.1.0:default'))]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.flow()" + ] + }, + { + "cell_type": "markdown", + "id": "20fa500e", + "metadata": {}, + "source": [ + "Every stream can be converted into a table with `as_table()` method" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "52baee9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: string\n", + "c: bool\n", + "d: double\n", + "----\n", + "a: [[1,2,3]]\n", + "b: [[\"x\",\"y\",\"z\"]]\n", + "c: [[true,false,true]]\n", + "d: [[1.1,2.2,3.3]]" + ] + }, + "execution_count": 8, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_table()" + ] + }, + { + "cell_type": "markdown", + "id": "a7b29786", + "metadata": {}, + "source": [ + "Optionally, you can pass in arguments to `as_table` to have system columns included in the table" + ] + }, + { + "cell_type": "markdown", + "id": "49b297f6", + "metadata": {}, + "source": [ + "`include_source` adds `source` column for each data (non-tag) column patterned like `_source_{column}` and will contain information about where that particular value orginated from." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4648fbe9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: string\n", + "c: bool\n", + "d: double\n", + "_source_c: large_string\n", + "_source_d: large_string\n", + "----\n", + "a: [[1,2,3]]\n", + "b: [[\"x\",\"y\",\"z\"]]\n", + "c: [[true,false,true]]\n", + "d: [[1.1,2.2,3.3]]\n", + "_source_c: [[null,null,null]]\n", + "_source_d: [[null,null,null]]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_table(include_source=True)" + ] + }, + { + "cell_type": "markdown", + "id": "83ec7b19", + "metadata": {}, + "source": [ + "`include_content_hash` will compute `content_hash` for each packet and include it as `_content_hash` column" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "001b2a9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: string\n", + "c: bool\n", + "d: double\n", + "_content_hash: large_string\n", + "----\n", + "a: [[1,2,3]]\n", + "b: [[\"x\",\"y\",\"z\"]]\n", + "c: [[true,false,true]]\n", + "d: [[1.1,2.2,3.3]]\n", + "_content_hash: [[\"arrow_v0.1@3de5f8a7b9a2fe5e6cc3c84e0368a21e807abe655b5a4dc58efc9b5487e3d9a8\",\"arrow_v0.1@cc022b33fc80a6639d2051d6d19a0162a832ce309367e426433e7401390b6e20\",\"arrow_v0.1@b0bb7434c813b4d5d7c3a5445a0ac3804739388a20a78d6d910b8c02d9ec5653\"]]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_table(include_content_hash=True)" + ] + }, + { + "cell_type": "markdown", + "id": "58d74238", + "metadata": {}, + "source": [ + "Alternatively, you can pass in a custom column name to use for the content hash column" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d3b9e394", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: string\n", + "c: bool\n", + "d: double\n", + "my_hash_values: large_string\n", + "----\n", + "a: [[1,2,3]]\n", + "b: [[\"x\",\"y\",\"z\"]]\n", + "c: [[true,false,true]]\n", + "d: [[1.1,2.2,3.3]]\n", + "my_hash_values: [[\"arrow_v0.1@3de5f8a7b9a2fe5e6cc3c84e0368a21e807abe655b5a4dc58efc9b5487e3d9a8\",\"arrow_v0.1@cc022b33fc80a6639d2051d6d19a0162a832ce309367e426433e7401390b6e20\",\"arrow_v0.1@b0bb7434c813b4d5d7c3a5445a0ac3804739388a20a78d6d910b8c02d9ec5653\"]]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_table(include_content_hash=\"my_hash_values\")" + ] + }, + { + "cell_type": "markdown", + "id": "b7012c5a", + "metadata": {}, + "source": [ + "Finally, `include_data_context` adds data context column as `_context_key` which captures information about the OrcaPod version, hasher version etc that were used when generting that packet." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "92cbfa50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: string\n", + "c: bool\n", + "d: double\n", + "_context_key: large_string\n", + "----\n", + "a: [[1,2,3]]\n", + "b: [[\"x\",\"y\",\"z\"]]\n", + "c: [[true,false,true]]\n", + "d: [[1.1,2.2,3.3]]\n", + "_context_key: [[null,null,null]]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_table(include_data_context=True)" + ] + }, + { + "cell_type": "markdown", + "id": "7ce05b68", + "metadata": {}, + "source": [ + "### Tags and Packets" + ] + }, + { + "cell_type": "markdown", + "id": "20783626", + "metadata": {}, + "source": [ + "The tags and packets returned by the streams can be thought of as special dictionary." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c78096a7", + "metadata": {}, + "outputs": [], + "source": [ + "all_tags_and_packets = stream.flow()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6f8a2f0b", + "metadata": {}, + "outputs": [], + "source": [ + "tag, packet = all_tags_and_packets[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e1ac13b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ArrowTag(data={'a': 1, 'b': 'x'}, meta_columns=0, context='std:v0.1.0:default')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "263fa1c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ArrowPacket(data={'c': True, 'd': 1.1}, meta_columns=0, context='std:v0.1.0:default')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet" + ] + }, + { + "cell_type": "markdown", + "id": "17be117a", + "metadata": {}, + "source": [ + "The element of tag/packet can be accessed just like dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42158816", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag[\"a\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6a792175", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'x'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag[\"b\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "a28f2051", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet[\"c\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "981e6c44", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.1" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet[\"d\"]" + ] + }, + { + "cell_type": "markdown", + "id": "c992134a", + "metadata": {}, + "source": [ + "They have a few methods that will provide additional insights:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "56423d2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'c': bool, 'd': float}" + ] 
+ }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Returns typespec (dictionary of key to type)\n", + "packet.types()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d5e02f81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('c', 'd')" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# entry names as strings\n", + "packet.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "fd70ee75", + "metadata": {}, + "source": [ + "They can also be converted to an Arrow table by calling `as_table`" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "b1b18ee4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "c: bool\n", + "d: double\n", + "----\n", + "c: [[true]]\n", + "d: [[1.1]]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet.as_table()" + ] + }, + { + "cell_type": "markdown", + "id": "f4e4a38f", + "metadata": {}, + "source": [ + "And schema is conveniently available as:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "3aa4020e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "c: bool\n", + "d: double" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet.arrow_schema()" + ] + }, + { + "cell_type": "markdown", + "id": "37ad91d0", + "metadata": {}, + "source": [ + "You can also get a plain dictionary from tag/packet with `as_dict`" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "bea6c771", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'a': 1, 'b': 'x'}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag.as_dict()" + ] + }, + { + "cell_type": "markdown", + "id": "4fadd572", + "metadata": {}, + "source": [ + "Packet contains some additional data such as `source_info`" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "92f00feb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'c': None, 'd': None}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet.source_info()" + ] + }, + { + "cell_type": "markdown", + "id": "1d755600", + "metadata": {}, + "source": [ + "These additional data can be included when converting to dict or table" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "bba2bc5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'c': True, 'd': 1.1, '_source_c': None, '_source_d': None}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet.as_dict(include_source=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "bd09d9d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "c: bool\n", + "d: double\n", + "_source_c: large_string\n", + "_source_d: large_string\n", + "----\n", + "c: [[true]]\n", + "d: [[1.1]]\n", + "_source_c: [[null]]\n", + "_source_d: [[null]]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "packet.as_table(include_source=True)" + ] + }, + { + "cell_type": "markdown", + "id": "98ab6fc7", + "metadata": {}, + "source": [ + "The 
hash of tag/packet can be computed with `content_hash()` method. The result will be cached so that it won't be computed again unnecessarily." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "03219fd3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'arrow_v0.1@6e1143896d73d370757811b52ceeea8d1d456cd30206416fbf81754e1cea5568'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tag.content_hash()" + ] + }, + { + "cell_type": "markdown", + "id": "ce0ab6c6", + "metadata": {}, + "source": [ + "## Working with operators" + ] + }, + { + "cell_type": "markdown", + "id": "6a9dd928", + "metadata": {}, + "source": [ + "We start getting into orcapod computation when we use operators. At the time of the writing, only `Join` operator is implemented fully but more are to come very shortly." + ] + }, + { + "cell_type": "markdown", + "id": "7ef99b67", + "metadata": {}, + "source": [ + "Let's prepare two streams:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "11ee5130", + "metadata": {}, + "outputs": [], + "source": [ + "table1 = pa.Table.from_pydict(\n", + " {\n", + " \"id\": [0, 1, 4],\n", + " \"a\": [1, 2, 3],\n", + " \"b\": [\"x\", \"y\", \"z\"],\n", + " }\n", + ")\n", + "\n", + "table2 = pa.Table.from_pydict(\n", + " {\n", + " \"id\": [0, 1, 2],\n", + " \"c\": [True, False, True],\n", + " \"d\": [1.1, 2.2, 3.3],\n", + " }\n", + ")\n", + "\n", + "stream1 = op.streams.ImmutableTableStream(table1, tag_columns=[\"id\"])\n", + "stream2 = op.streams.ImmutableTableStream(table2, tag_columns=[\"id\"])" + ] + }, + { + "cell_type": "markdown", + "id": "6f87fcf3", + "metadata": {}, + "source": [ + "We now join the two streams by instantiating the Join operator and then passing in the two streams:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "8299d4b1", + "metadata": {}, + "outputs": [], + "source": [ + "join = op.operators.Join()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "dfc7ee9f", + "metadata": {}, + "outputs": [], + "source": [ + "joined_stream = join(stream1, stream2)" + ] + }, + { + "cell_type": "markdown", + "id": "4f24a492", + "metadata": {}, + "source": [ + "Calling an operator on stream(s) immediately performs checks to make sure that the input streams are comaptible with the operator but otherwise it does NOT trigger any computation. Computation occurs only when you try to **access the output stream's content via iteration, flow, or through conversion to table**." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "092abff5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'id': 0}, Packet: {'a': 1, 'b': 'x', 'c': True, 'd': 1.1}\n", + "Tag: {'id': 1}, Packet: {'a': 2, 'b': 'y', 'c': False, 'd': 2.2}\n" + ] + } + ], + "source": [ + "for tag, packet in joined_stream:\n", + " print(f\"Tag: {tag}, Packet: {packet}\")" + ] + }, + { + "cell_type": "markdown", + "id": "095856e3", + "metadata": {}, + "source": [ + "The output of the computation is automatically cached so that as long as you access the same output stream, you won't be triggering unnecessary recomputation!" 
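For intuition about what the Join above actually computed, here is a minimal standalone PyArrow sketch (independent of orcapod) of the same idea: an inner join on the shared tag column(s). The real operator additionally carries source/system columns and, as in the operators.py patch earlier in this series, temporarily adds a constant `_common` helper column so that streams with no shared tag columns fall back to a cartesian product.

import pyarrow as pa

left = pa.table({"id": [0, 1, 4], "a": [1, 2, 3], "b": ["x", "y", "z"]})
right = pa.table({"id": [0, 1, 2], "c": [True, False, True], "d": [1.1, 2.2, 3.3]})

# Only ids present in both tables survive the inner join on the shared tag column.
joined = left.join(right, keys=["id"], join_type="inner").sort_by("id")
print(joined.to_pydict())
# {'id': [0, 1], 'a': [1, 2], 'b': ['x', 'y'], 'c': [True, False], 'd': [1.1, 2.2]}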
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "48ef0a8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "id: int64\n", + "a: int64\n", + "b: string\n", + "c: bool\n", + "d: double\n", + "----\n", + "id: [[0,1]]\n", + "a: [[1,2]]\n", + "b: [[\"x\",\"y\"]]\n", + "c: [[true,false]]\n", + "d: [[1.1,2.2]]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "joined_stream.as_table()" + ] + }, + { + "cell_type": "markdown", + "id": "5869a1da", + "metadata": {}, + "source": [ + "## Working with Function Pods" + ] + }, + { + "cell_type": "markdown", + "id": "e4b7991a", + "metadata": {}, + "source": [ + "Now we have explored the basics of streams, tags, packets, and operators (i.e. Join), it's time to explore the meat of `orcapod` -- `FunctionPod`s! Let's start by defining a very simple function pod that takes in two numbers and return the sum." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "35423d9a", + "metadata": {}, + "outputs": [], + "source": [ + "@op.function_pod(output_keys=[\"sum\"])\n", + "def add_numbers(a: int, b: int) -> int:\n", + " \"\"\"A simple function pod that adds two numbers.\"\"\"\n", + " return a + b" + ] + }, + { + "cell_type": "markdown", + "id": "f737eeac", + "metadata": {}, + "source": [ + "You'll notice that, aside from the `op.function_pod` decorator, this is nothing but an ordinary Python function with type hints! The type hints are crucial however, as this will be used by `orcapod` system to validate the input streams into your pods and to be able to predict if the output of your pod can be fed into another operator/pod without an issue." + ] + }, + { + "cell_type": "markdown", + "id": "caf23360", + "metadata": {}, + "source": [ + "Once you have function pod defined, you can already use it on streams just like operators. Let's prepare a stream that has entries for `a` and `b` and then feed them into the function pod." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "119d33a3", + "metadata": {}, + "outputs": [], + "source": [ + "input_table = pa.Table.from_pydict(\n", + " {\n", + " \"id\": [0, 1, 2, 3, 4],\n", + " \"a\": [1, 2, 3, 4, 5],\n", + " \"b\": [10, 20, 30, 40, 50],\n", + " }\n", + ")\n", + "\n", + "input_stream = op.streams.ImmutableTableStream(input_table, tag_columns=[\"id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "2b3b42ff", + "metadata": {}, + "outputs": [], + "source": [ + "# run the stream through the function pod!\n", + "output_stream = add_numbers(input_stream)" + ] + }, + { + "cell_type": "markdown", + "id": "5b5beae2", + "metadata": {}, + "source": [ + "And that's it! Believe it or not, that is all it takes to set up the computation. The actual computation will be triggered the first time you access the content of the output stream." 
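Iteration is only one of the access paths; converting the output stream to a table triggers the same (cached) computation. A hedged one-liner, consistent with the stream API shown earlier:

result = output_stream.as_table()
print(result.column("sum").to_pylist())  # [11, 22, 33, 44, 55]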
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "ff05a8fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "KernelStream(kernel=FunctionPod:add_numbers(a: int, b: int)-> , upstreams=(ImmutableTableStream(table=['id', 'a', 'b'], tag_columns=('id',)),))" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_stream" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "6431180f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'id': 0}, Packet: {'sum': 11}\n", + "Tag: {'id': 1}, Packet: {'sum': 22}\n", + "Tag: {'id': 2}, Packet: {'sum': 33}\n", + "Tag: {'id': 3}, Packet: {'sum': 44}\n", + "Tag: {'id': 4}, Packet: {'sum': 55}\n" + ] + } + ], + "source": [ + "for t, p in output_stream:\n", + " print(f\"Tag: {t}, Packet: {p}\")" + ] + }, + { + "cell_type": "markdown", + "id": "6ff00efa", + "metadata": {}, + "source": [ + "Simple, right?" + ] + }, + { + "cell_type": "markdown", + "id": "04b0a24e", + "metadata": {}, + "source": [ + "## Chaining operators and pods into a pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "21fa1e75", + "metadata": {}, + "source": [ + "Now that we have seen how to define and run pods, it's time to put them together into a concrete pipeline. To do so, we will construct a `Pipeline` instance. When doing so, we have to pass in a place to save data to, so we will also prepare a data store." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "cb4bc91a", + "metadata": {}, + "outputs": [], + "source": [ + "data_store = op.stores.BatchedDeltaTableArrowStore(base_path=\"./pipeline_data\")\n", + "\n", + "pipeline = op.Pipeline(name=\"MyPipelin\", pipeline_store=data_store)" + ] + }, + { + "cell_type": "markdown", + "id": "ef281a1e", + "metadata": {}, + "source": [ + "Once we have the pipeline ready, we can define the pipeline by simply running & chaining operators and pods **inside the pipeline context**. Typically, you'd want to define your function pods before hand:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "f371822b", + "metadata": {}, + "outputs": [], + "source": [ + "@op.function_pod(output_keys=[\"sum\"])\n", + "def add_numbers(a: int, b: int) -> int:\n", + " \"\"\"A simple function pod that adds two numbers.\"\"\"\n", + " return a + b\n", + "\n", + "\n", + "@op.function_pod(output_keys=[\"product\"])\n", + "def multiply_numbers(a: int, b: int) -> int:\n", + " \"\"\"A simple function pod that multiplies two numbers.\"\"\"\n", + " return a * b\n", + "\n", + "\n", + "@op.function_pod(output_keys=[\"result\"])\n", + "def combine_results(sum: int, product: int) -> str:\n", + " \"\"\"A simple function pod that combines results.\"\"\"\n", + " return f\"Sum: {sum}, Product: {product}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e132fc93", + "metadata": {}, + "outputs": [], + "source": [ + "# now defien the pipeline\n", + "with pipeline:\n", + " sum_results = add_numbers(input_stream)\n", + " product_results = multiply_numbers(input_stream)\n", + " final_results = combine_results(sum_results, product_results)" + ] + }, + { + "cell_type": "markdown", + "id": "dad175c6", + "metadata": {}, + "source": [ + "You can access individual elements of the pipeline as an attribute. By default, the attribute is named after the operator/pod name." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "cca9e0d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PodNode(pod=FunctionPod:add_numbers)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.add_numbers" + ] + }, + { + "cell_type": "markdown", + "id": "5f33f5a9", + "metadata": {}, + "source": [ + "Notice that elements of the pipeline is wrapped in a `Node`, being either `PodNode` or `KernelNode`." + ] + }, + { + "cell_type": "markdown", + "id": "2b6bc8df", + "metadata": {}, + "source": [ + "You can fetch results of the pipeline through these nodes. For example, you can access the saved results of the pipeline as Polars dataframe by access the `df` attribute." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "21086f72", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.add_numbers.df" + ] + }, + { + "cell_type": "markdown", + "id": "1920d65c", + "metadata": {}, + "source": [ + "You'll notice that `df` comes back empty because the pipeline is yet to run. Let's now trigger the pipeline to fill the nodes with computation results!" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "1e741659", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.run()" + ] + }, + { + "cell_type": "markdown", + "id": "6e4341d5", + "metadata": {}, + "source": [ + "This will cause all nodes in the pipeline to run and store the results." + ] + }, + { + "cell_type": "markdown", + "id": "50891c40", + "metadata": {}, + "source": [ + "Now let's take a look at the computed results:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "c77154ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.add_numbers.df" + ] + }, + { + "cell_type": "markdown", + "id": "43931402", + "metadata": {}, + "source": [ + "You now have the computations saved at each node!" + ] + }, + { + "cell_type": "markdown", + "id": "82312bda", + "metadata": {}, + "source": [ + "### Labeling nodes in the pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "1f0a8f8f", + "metadata": {}, + "source": [ + "When constructing the pipeline, each invocation of the operator/pod results in a new node getting added, with the name of the node defaulting to the name of the operator/pod. If you use the same pod multiple times, then the nodes will be given names of form `{pod_name}_0`, `{pod_name}_1`, and so on.\n", + "\n", + "While this is helpful default behavior, you'd likely want to explicitly name each node so you can more easily understand what you are accessing within the pipeline. To achieve this, you can explicitly label each invocation with `label=` argument in the call." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "37e65e33", + "metadata": {}, + "outputs": [], + "source": [ + "data_store = op.stores.BatchedDeltaTableArrowStore(base_path=\"./pipeline_data\")\n", + "\n", + "pipeline = op.Pipeline(name=\"MyPipelin\", pipeline_store=data_store)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "3bad8332", + "metadata": {}, + "outputs": [], + "source": [ + "# now defien the pipeline\n", + "with pipeline:\n", + " sum_results = add_numbers(input_stream, label=\"my_summation\")\n", + " product_results = multiply_numbers(input_stream, label=\"my_product\")\n", + " final_results = combine_results(\n", + " sum_results, product_results, label=\"my_final_result\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "8f146ae7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.my_summation.df" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "8fd7bf4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────────┐\n", + "│ id ┆ product │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════════╡\n", + "│ 0 ┆ 10 │\n", + "│ 1 ┆ 40 │\n", + "│ 2 ┆ 90 │\n", + "│ 3 ┆ 160 │\n", + "│ 4 ┆ 250 │\n", + "└─────┴─────────┘" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.my_product.df" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "2a918db1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬───────────────────────┐\n", + "│ id ┆ result │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═════╪═══════════════════════╡\n", + "│ 0 ┆ Sum: 11, Product: 10 │\n", + "│ 1 ┆ Sum: 22, Product: 40 │\n", + "│ 2 ┆ Sum: 33, Product: 90 │\n", + "│ 3 ┆ Sum: 44, Product: 160 │\n", + "│ 4 ┆ Sum: 55, Product: 250 │\n", + "└─────┴───────────────────────┘" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.my_final_result.df" + ] + }, + { + "cell_type": "markdown", + "id": "5380dad8", + "metadata": {}, + "source": [ + "Notice that despite just freshly creating the pipeline, each node already had results filled in! This is because the results from the previous pipeline execution was smartly fetched back. Critically, this was done only because Orcapod noticed that you had an identical pipeline with the same inputs and same operators/pods so that you can reuse the result as is. Should the structure of pipeline been different, the wront results would not be loaded." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index dd5eb05..5d7c423 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,7 +1,9 @@ from .data import DEFAULT_TRACKER_MANAGER from .data.pods import function_pod, FunctionPod, CachedPod from .data import streams -from .stores.delta_lake_stores import BatchedDeltaTableArrowStore +from .data import operators +from . import stores +from .pipeline import Pipeline no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking @@ -13,5 +15,7 @@ "FunctionPod", "CachedPod", "streams", - "BatchedDeltaTableArrowStore", + "stores", + "operators", + "Pipeline" ] diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 755b5b7..702760f 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -76,6 +76,22 @@ def _set_modified_time( @abstractmethod def kernel_id(self) -> tuple[str, ...]: ... + @abstractmethod + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Trigger the main computation of the kernel on a collection of streams. + This method is called when the kernel is invoked with a collection of streams. + Subclasses should override this method to provide the kernel with its unique behavior + """ + + @abstractmethod + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + + @abstractmethod + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: ... + def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing @@ -87,7 +103,11 @@ def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: return streams @abstractmethod - def validate_inputs(self, *streams: dp.Stream) -> None: ... + def validate_inputs(self, *streams: dp.Stream) -> None: + """ + Valide the input streams before the main computation but after the pre-kernel processing + """ + ... 
def prepare_output_stream( self, *streams: dp.Stream, label: str | None = None @@ -116,35 +136,11 @@ def __call__( self.track_invocation(*processed_streams, label=label) return output_stream - @abstractmethod - def forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Trigger the main computation of the kernel on a collection of streams. - This method is called when the kernel is invoked with a collection of streams. - Subclasses should override this method to provide the kernel with its unique behavior - """ - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) return self.kernel_output_types(*processed_streams) - @abstractmethod - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... - - def __repr__(self): - return self.__class__.__name__ - - def __str__(self): - if self._label is not None: - return f"{self.__class__.__name__}({self._label})" - return self.__class__.__name__ - - @abstractmethod - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: ... - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: # Default implementation of identity_structure for the kernel only # concerns the kernel class and the streams if present. Subclasses of @@ -165,6 +161,14 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An self.validate_inputs(*streams) return self.kernel_identity_structure(streams) + def __repr__(self): + return self.__class__.__name__ + + def __str__(self): + if self._label is not None: + return f"{self.__class__.__name__}({self._label})" + return self.__class__.__name__ + class WrappedKernel(TrackedKernelBase): """ diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 4c614eb..9fbf3a5 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -263,7 +263,8 @@ def op_validate_inputs(self, *streams: dp.Stream) -> None: try: self.op_output_types(*streams) except Exception as e: - raise InputValidationError(f"Input streams are not compatible: {e}") + # raise InputValidationError(f"Input streams are not compatible: {e}") from e + raise e def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: if len(streams) == 1: diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index f8076db..cc73206 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -412,6 +412,7 @@ def __init__( upstreams = upstreams or output_stream.upstreams super().__init__(source=source, upstreams=upstreams, **kwargs) + self.kernel = source self._cached_stream = output_stream def clear_cache(self) -> None: @@ -427,22 +428,15 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.keys() + tag_types, packet_types = self.kernel.output_types(*self.upstreams) + return tuple(tag_types.keys()), tuple(packet_types.keys()) def types(self) -> tuple[TypeSpec, TypeSpec]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. 
""" - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.types() + return self.kernel.output_types(*self.upstreams) @property def is_current(self) -> bool: @@ -530,6 +524,7 @@ def __init__(self, pod: dp.Pod, prepared_stream: dp.Stream, **kwargs): # Packet-level caching (from your PodStream) self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet | None]] = {} + self._cached_output_table: pa.Table | None = None def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: if self._prepared_stream_iterator is not None: diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 70e27d9..cb89d5b 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -8,6 +8,8 @@ from typing import Any, TYPE_CHECKING from contextlib import contextmanager +from orcapod.types import TypeSpec + if TYPE_CHECKING: import networkx as nx @@ -129,6 +131,14 @@ def __init__(self, stream: dp.Stream, label: str | None = None) -> None: self.label = label or stream.label self.stream = stream + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + assert len(streams) == 0, "StubKernel should not have any input streams." + return self.stream.types() + @property def kernel_id(self) -> tuple[str, ...]: return (self.stream.__class__.__name__,) diff --git a/src/orcapod/pipeline/__init__.py b/src/orcapod/pipeline/__init__.py index 9d3e0f5..616846a 100644 --- a/src/orcapod/pipeline/__init__.py +++ b/src/orcapod/pipeline/__init__.py @@ -3,3 +3,9 @@ # __all__ = [ # "Pipeline", # ] + +from .graph import Pipeline + +__all__ = [ + "Pipeline", +] diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 0ba9bf8..72ab320 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -90,6 +90,13 @@ def compile(self) -> None: invocation_to_stream_lut[invocation] = node() self.nodes[node.label] = node + def run(self) -> None: + # FIXME: perform more efficient traversal through the graph! + for node in self.nodes.values(): + node.flow() + + self.flush() + def wrap_invocation( self, invocation: Invocation, diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 8566d99..bb91557 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -99,15 +99,19 @@ def __call__(self, *args, **kwargs) -> dp.LiveStream: def source(self) -> dp.Kernel | None: return self - @property - def upstreams(self) -> tuple[dp.Stream, ...]: - return () - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - return self().keys() + tag_types, packet_types = self.types() + return tuple(tag_types.keys()), tuple(packet_types.keys()) def types(self) -> tuple[TypeSpec, TypeSpec]: - return self().types() + return self.contained_kernel.output_types(*self.input_streams) + + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + Return the output types of the node. + This is used to determine the types of the output streams. 
+ """ + return self.contained_kernel.output_types(*self.input_streams) @property def last_modified(self) -> datetime | None: diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/stores/__init__.py index 434e2f4..4c5fff7 100644 --- a/src/orcapod/stores/__init__.py +++ b/src/orcapod/stores/__init__.py @@ -12,3 +12,5 @@ # "MockArrowDataStore", # "SimpleParquetDataStore", # ] + +from .delta_lake_stores import BatchedDeltaTableArrowStore From db9b3fec24820e1dc053ea68941e45562e2a577b Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 22:09:31 +0000 Subject: [PATCH 142/224] refactor: remove unused streams --- src/orcapod/data/streams.py | 326 +----------------------------------- 1 file changed, 7 insertions(+), 319 deletions(-) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index cc73206..6517257 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,11 +1,10 @@ import logging from pathlib import Path -import warnings from abc import ABC, abstractmethod from collections.abc import Collection, Iterator from datetime import datetime, timezone from itertools import repeat -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.context import DataContext @@ -598,159 +597,16 @@ def as_table( # FIXME: this skips the semantic version conversion and thus is not # fully correct! - all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema) - all_packets: pa.Table = pa.Table.from_pylist( - all_packets, schema=packet_schema - ) - - self._cached_output_table = arrow_utils.hstack_tables(all_tags, all_packets) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." - ) - - drop_columns = [] - if not include_source: - drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) - if not include_data_context: - drop_columns.append(constants.CONTEXT_KEY) - - output_table = self._cached_output_table.drop(drop_columns) - - # lazily prepare content hash column if requested - if include_content_hash: - if self._cached_content_hash_column is None: - content_hashes = [] - for tag, packet in self.iter_packets(): - content_hashes.append(packet.content_hash()) - self._cached_content_hash_column = pa.array( - content_hashes, type=pa.large_string() - ) - assert self._cached_content_hash_column is not None, ( - "_cached_content_hash_column should not be None here." 
- ) - hash_column_name = ( - "_content_hash" - if include_content_hash is True - else include_content_hash - ) - output_table = output_table.append_column( - hash_column_name, self._cached_content_hash_column + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema ) - return output_table - - -class PodStream(StreamBase): - def __init__( - self, - pod: dp.Pod, - input_streams: tuple[dp.Stream, ...], - error_handling: Literal["raise", "ignore", "warn"] = "raise", - **kwargs, - ) -> None: - super().__init__(upstreams=input_streams, **kwargs) - self.pod = pod - self.input_streams = input_streams - self.error_handling = error_handling - self._source = pod - - # Cache for processed packets - # This is a dictionary mapping the index of the packet in the input stream to a tuple of (Tag, Packet) - # This allows us to efficiently access the processed packets without re-processing them - self._cached_forward_stream: dp.Stream | None = None - self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet]] = {} - self._computation_complete: bool = False - self._cached_output_table: pa.Table | None = None - self._cached_content_hash_column: pa.Array | None = None - - @property - def source(self) -> dp.Pod | None: - """ - The source of the stream, which is the pod that generated the stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._source - - def forward_stream(self) -> dp.Stream: - if self._cached_forward_stream is None: - self._cached_forward_stream = self.pod.forward(*self.input_streams) - return self._cached_forward_stream - - @property - def is_current(self) -> bool: - return self.forward_stream().is_current - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - - tag_keys, _ = self.forward_stream().keys() - packet_keys = tuple(self.pod.output_packet_types().keys()) - return tag_keys, packet_keys - - def types(self) -> tuple[TypeSpec, TypeSpec]: - tag_typespec, _ = self.forward_stream().types() - # TODO: check if copying can be avoided - packet_typespec = dict(self.pod.output_packet_types()) - return tag_typespec, packet_typespec - - def clear_cache(self) -> None: - """ - Clears the cached results of the processed stream. - This is useful for re-processing the stream with the same processor. - """ - self._cached_forward_stream = None - self._cached_output_packets = {} - self._computation_complete = False - self._cached_output_table = None - self._cached_content_hash_column = None - - def refresh(self, force: bool = False) -> bool: - if not self.is_current or force: - self.invalidate() - return True - return False - - def invalidate(self) -> None: - """ - Invalidate the stream, marking it as needing recomputation. - This will clear the cached stream and set the last modified time to None. 
- """ - self.clear_cache() - self._set_modified_time(invalidate=True) - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_content_hash: bool | str = False, - ) -> pa.Table: - # TODO: note that this is likely NOT multi-thread safe - self.refresh() - if self._cached_output_table is None: - all_tags = [] - all_packets = [] - tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets(): - if tag_schema is None: - tag_schema = tag.arrow_schema() - if packet_schema is None: - packet_schema = packet.arrow_schema( - include_context=True, - include_source=True, - ) - all_tags.append(tag.as_dict()) - all_packets.append( - packet.as_dict(include_context=True, include_source=True) - ) - - all_tags: pa.Table = pa.Table.from_pylist(all_tags, schema=tag_schema) - all_packets: pa.Table = pa.Table.from_pylist( + all_packets_as_tables: pa.Table = pa.Table.from_pylist( all_packets, schema=packet_schema ) - self._cached_output_table = arrow_utils.hstack_tables(all_tags, all_packets) + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) assert self._cached_output_table is not None, ( "_cached_output_table should not be None here." ) @@ -785,41 +641,6 @@ def as_table( ) return output_table - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.refresh() - if not self._computation_complete or self._cached_output_packets is None: - for i, (tag, packet) in enumerate(self.forward_stream().iter_packets()): - if i not in self._cached_output_packets: - try: - processed_tag, processed_packet = self.pod.call(tag, packet) - except Exception as e: - logger.error(f"Error processing packet {packet}: {e}") - if self.error_handling == "raise": - raise e - elif self.error_handling == "warn": - warnings.warn(f"Error processing packet {packet}: {e}") - continue - elif self.error_handling == "ignore": - continue - else: - raise ValueError( - f"Unknown error handling mode: {self.error_handling} encountered while handling error:" - ) from e - if processed_packet is None: - # call returning None means the packet should be skipped - logger.debug( - f"Packet {packet} with tag {tag} was processed but returned None, skipping." - ) - continue - self._cached_output_packets[i] = (processed_tag, processed_packet) - yield processed_tag, processed_packet - self._computation_complete = True - self._set_modified_time() - - else: - for i in range(len(self._cached_output_packets)): - yield self._cached_output_packets[i] - class WrappedStream(StreamBase): def __init__( @@ -872,136 +693,3 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: def identity_structure(self) -> Any: return self._stream.identity_structure() - - -class InvokedPodStream(StreamBase): - """ - Recomputable stream that wraps a streams produced by a kernel to provide - an abstraction over the stream, taking the stream's source and upstreams as the basis of - recomputing the stream. - - This stream is used to represent the output of a kernel invocation. - """ - - def __init__( - self, - pod_stream: PodStream | None = None, - source: dp.Pod | None = None, - upstreams: tuple[ - dp.Stream, ... - ] = (), # if provided, this will override the upstreams of the output_stream - **kwargs, - ) -> None: - if (pod_stream is None or output_stream.source is None) and source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." 
- ) - if source is None: - if output_stream is None or output_stream.source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - source = output_stream.source - upstreams = upstreams or output_stream.upstreams - - super().__init__(source=source, upstreams=upstreams, **kwargs) - self._cached_stream = output_stream - - def clear_cache(self) -> None: - """ - Clears the cached stream. - This is useful for re-processing the stream with the same kernel. - """ - self._cached_stream = None - self._set_modified_time(invalidate=True) - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.keys() - - def types(self) -> tuple[TypeSpec, TypeSpec]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - self.refresh() - assert self._cached_stream is not None, ( - "_cached_stream should not be None here." - ) - return self._cached_stream.types() - - @property - def is_current(self) -> bool: - if self._cached_stream is None or not super().is_current: - status = self.refresh() - if not status: # if it failed to update for whatever reason - return False - return True - - def refresh(self, force: bool = False) -> bool: - updated = False - if force or (self._cached_stream is not None and not super().is_current): - self.clear_cache() - - if self._cached_stream is None: - assert self.source is not None, ( - "Stream source must be set to recompute the stream." - ) - self._cached_stream = self.source.forward(*self.upstreams) - self._set_modified_time() - updated = True - - if self._cached_stream is None: - # TODO: use beter error type - raise ValueError( - "Stream could not be updated. Ensure that the source is valid and upstreams are correct." - ) - - return updated - - def invalidate(self) -> None: - """ - Invalidate the stream, marking it as needing recomputation. - This will clear the cached stream and set the last modified time to None. - """ - self.clear_cache() - self._set_modified_time(invalidate=True) - - @property - def last_modified(self) -> datetime | None: - if self._cached_stream is None: - return None - return self._cached_stream.last_modified - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_content_hash: bool | str = False, - ) -> pa.Table: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - return self._cached_stream.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_content_hash=include_content_hash, - ) - - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - yield from self._cached_stream.iter_packets() - - def __repr__(self) -> str: - return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" From 625204f3d3d150f634d4c4c299f9ffc039e5e8e2 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 29 Jul 2025 22:13:08 +0000 Subject: [PATCH 143/224] doc: cleanup tutorial notebook --- .../01_orcapod_quick_exploration.ipynb | 689 +++--------------- 1 file changed, 90 insertions(+), 599 deletions(-) diff --git a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb index 22bce61..99c8810 100644 --- a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb +++ b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "27cdd37d", "metadata": {}, "outputs": [], @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "e6a9e8b6", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "420477e8", "metadata": {}, "outputs": [], @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "dab6bf9c", "metadata": {}, "outputs": [], @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "cd0394d8", "metadata": {}, "outputs": [], @@ -124,20 +124,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "2d4a0812", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tag: {'a': 1, 'b': 'x'}, Packet: {'c': True, 'd': 1.1}\n", - "Tag: {'a': 2, 'b': 'y'}, Packet: {'c': False, 'd': 2.2}\n", - "Tag: {'a': 3, 'b': 'z'}, Packet: {'c': True, 'd': 3.3}\n" - ] - } - ], + "outputs": [], "source": [ "for tag, packet in stream:\n", " print(f\"Tag: {tag}, Packet: {packet}\")" @@ -153,26 +143,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "79e67bfc", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(ArrowTag(data={'a': 1, 'b': 'x'}, meta_columns=0, context='std:v0.1.0:default'),\n", - " ArrowPacket(data={'c': True, 'd': 1.1}, meta_columns=0, context='std:v0.1.0:default')),\n", - " (ArrowTag(data={'a': 2, 'b': 'y'}, meta_columns=0, context='std:v0.1.0:default'),\n", - " ArrowPacket(data={'c': False, 'd': 2.2}, meta_columns=0, context='std:v0.1.0:default')),\n", - " (ArrowTag(data={'a': 3, 'b': 'z'}, meta_columns=0, context='std:v0.1.0:default'),\n", - " ArrowPacket(data={'c': True, 'd': 3.3}, meta_columns=0, context='std:v0.1.0:default'))]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stream.flow()" ] @@ -187,30 +161,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "52baee9c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "a: int64\n", - "b: string\n", - "c: bool\n", - "d: double\n", - "----\n", - "a: [[1,2,3]]\n", - "b: [[\"x\",\"y\",\"z\"]]\n", - "c: [[true,false,true]]\n", - "d: [[1.1,2.2,3.3]]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stream.as_table()" ] @@ -233,34 +187,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "4648fbe9", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "a: int64\n", - "b: string\n", - "c: bool\n", - "d: double\n", - "_source_c: large_string\n", - "_source_d: large_string\n", - "----\n", - "a: [[1,2,3]]\n", - "b: [[\"x\",\"y\",\"z\"]]\n", - "c: 
[[true,false,true]]\n", - "d: [[1.1,2.2,3.3]]\n", - "_source_c: [[null,null,null]]\n", - "_source_d: [[null,null,null]]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stream.as_table(include_source=True)" ] @@ -275,32 +205,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "001b2a9c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "a: int64\n", - "b: string\n", - "c: bool\n", - "d: double\n", - "_content_hash: large_string\n", - "----\n", - "a: [[1,2,3]]\n", - "b: [[\"x\",\"y\",\"z\"]]\n", - "c: [[true,false,true]]\n", - "d: [[1.1,2.2,3.3]]\n", - "_content_hash: [[\"arrow_v0.1@3de5f8a7b9a2fe5e6cc3c84e0368a21e807abe655b5a4dc58efc9b5487e3d9a8\",\"arrow_v0.1@cc022b33fc80a6639d2051d6d19a0162a832ce309367e426433e7401390b6e20\",\"arrow_v0.1@b0bb7434c813b4d5d7c3a5445a0ac3804739388a20a78d6d910b8c02d9ec5653\"]]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stream.as_table(include_content_hash=True)" ] @@ -315,32 +223,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "d3b9e394", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "a: int64\n", - "b: string\n", - "c: bool\n", - "d: double\n", - "my_hash_values: large_string\n", - "----\n", - "a: [[1,2,3]]\n", - "b: [[\"x\",\"y\",\"z\"]]\n", - "c: [[true,false,true]]\n", - "d: [[1.1,2.2,3.3]]\n", - "my_hash_values: [[\"arrow_v0.1@3de5f8a7b9a2fe5e6cc3c84e0368a21e807abe655b5a4dc58efc9b5487e3d9a8\",\"arrow_v0.1@cc022b33fc80a6639d2051d6d19a0162a832ce309367e426433e7401390b6e20\",\"arrow_v0.1@b0bb7434c813b4d5d7c3a5445a0ac3804739388a20a78d6d910b8c02d9ec5653\"]]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stream.as_table(include_content_hash=\"my_hash_values\")" ] @@ -355,32 +241,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "92cbfa50", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "a: int64\n", - "b: string\n", - "c: bool\n", - "d: double\n", - "_context_key: large_string\n", - "----\n", - "a: [[1,2,3]]\n", - "b: [[\"x\",\"y\",\"z\"]]\n", - "c: [[true,false,true]]\n", - "d: [[1.1,2.2,3.3]]\n", - "_context_key: [[null,null,null]]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "stream.as_table(include_data_context=True)" ] @@ -403,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "c78096a7", "metadata": {}, "outputs": [], @@ -413,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "6f8a2f0b", "metadata": {}, "outputs": [], @@ -423,42 +287,20 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "e1ac13b1", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ArrowTag(data={'a': 1, 'b': 'x'}, meta_columns=0, context='std:v0.1.0:default')" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tag" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "263fa1c5", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ArrowPacket(data={'c': True, 'd': 1.1}, 
meta_columns=0, context='std:v0.1.0:default')" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet" ] @@ -473,84 +315,40 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "42158816", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tag[\"a\"]" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "6a792175", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'x'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tag[\"b\"]" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "a28f2051", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet[\"c\"]" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "981e6c44", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.1" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet[\"d\"]" ] @@ -565,21 +363,10 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "56423d2c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'c': bool, 'd': float}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Returns typespec (dictionary of key to type)\n", "packet.types()" @@ -587,21 +374,10 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "d5e02f81", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('c', 'd')" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# entry names as strings\n", "packet.keys()" @@ -617,26 +393,10 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "b1b18ee4", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "c: bool\n", - "d: double\n", - "----\n", - "c: [[true]]\n", - "d: [[1.1]]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet.as_table()" ] @@ -651,22 +411,10 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "3aa4020e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "c: bool\n", - "d: double" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet.arrow_schema()" ] @@ -681,21 +429,10 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "bea6c771", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'a': 1, 'b': 'x'}" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tag.as_dict()" ] @@ -710,21 +447,10 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "92f00feb", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'c': None, 'd': 
None}" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet.source_info()" ] @@ -739,51 +465,20 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "bba2bc5c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'c': True, 'd': 1.1, '_source_c': None, '_source_d': None}" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet.as_dict(include_source=True)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "bd09d9d1", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "c: bool\n", - "d: double\n", - "_source_c: large_string\n", - "_source_d: large_string\n", - "----\n", - "c: [[true]]\n", - "d: [[1.1]]\n", - "_source_c: [[null]]\n", - "_source_d: [[null]]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "packet.as_table(include_source=True)" ] @@ -798,21 +493,10 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "id": "03219fd3", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'arrow_v0.1@6e1143896d73d370757811b52ceeea8d1d456cd30206416fbf81754e1cea5568'" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tag.content_hash()" ] @@ -843,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "11ee5130", "metadata": {}, "outputs": [], @@ -878,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "8299d4b1", "metadata": {}, "outputs": [], @@ -888,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "dfc7ee9f", "metadata": {}, "outputs": [], @@ -906,19 +590,10 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "092abff5", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tag: {'id': 0}, Packet: {'a': 1, 'b': 'x', 'c': True, 'd': 1.1}\n", - "Tag: {'id': 1}, Packet: {'a': 2, 'b': 'y', 'c': False, 'd': 2.2}\n" - ] - } - ], + "outputs": [], "source": [ "for tag, packet in joined_stream:\n", " print(f\"Tag: {tag}, Packet: {packet}\")" @@ -934,32 +609,10 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "48ef0a8f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyarrow.Table\n", - "id: int64\n", - "a: int64\n", - "b: string\n", - "c: bool\n", - "d: double\n", - "----\n", - "id: [[0,1]]\n", - "a: [[1,2]]\n", - "b: [[\"x\",\"y\"]]\n", - "c: [[true,false]]\n", - "d: [[1.1,2.2]]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "joined_stream.as_table()" ] @@ -982,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "35423d9a", "metadata": {}, "outputs": [], @@ -1011,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "id": "119d33a3", "metadata": {}, "outputs": [], @@ -1029,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "2b3b42ff", "metadata": {}, "outputs": [], @@ -1048,43 +701,20 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": 
"ff05a8fc", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "KernelStream(kernel=FunctionPod:add_numbers(a: int, b: int)-> , upstreams=(ImmutableTableStream(table=['id', 'a', 'b'], tag_columns=('id',)),))" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "output_stream" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "id": "6431180f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tag: {'id': 0}, Packet: {'sum': 11}\n", - "Tag: {'id': 1}, Packet: {'sum': 22}\n", - "Tag: {'id': 2}, Packet: {'sum': 33}\n", - "Tag: {'id': 3}, Packet: {'sum': 44}\n", - "Tag: {'id': 4}, Packet: {'sum': 55}\n" - ] - } - ], + "outputs": [], "source": [ "for t, p in output_stream:\n", " print(f\"Tag: {t}, Packet: {p}\")" @@ -1116,7 +746,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "id": "cb4bc91a", "metadata": {}, "outputs": [], @@ -1136,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "f371822b", "metadata": {}, "outputs": [], @@ -1161,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "e132fc93", "metadata": {}, "outputs": [], @@ -1183,21 +813,10 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "id": "cca9e0d0", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "PodNode(pod=FunctionPod:add_numbers)" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.add_numbers" ] @@ -1220,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "id": "21086f72", "metadata": {}, "outputs": [], @@ -1238,7 +857,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "id": "1e741659", "metadata": {}, "outputs": [], @@ -1264,42 +883,10 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "c77154ec", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
idsum
i64i64
011
122
233
344
455
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬─────┐\n", - "│ id ┆ sum │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═════╡\n", - "│ 0 ┆ 11 │\n", - "│ 1 ┆ 22 │\n", - "│ 2 ┆ 33 │\n", - "│ 3 ┆ 44 │\n", - "│ 4 ┆ 55 │\n", - "└─────┴─────┘" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.add_numbers.df" ] @@ -1332,25 +919,25 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "id": "37e65e33", "metadata": {}, "outputs": [], "source": [ "data_store = op.stores.BatchedDeltaTableArrowStore(base_path=\"./pipeline_data\")\n", "\n", - "pipeline = op.Pipeline(name=\"MyPipelin\", pipeline_store=data_store)" + "pipeline2 = op.Pipeline(name=\"MyPipelin\", pipeline_store=data_store)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "id": "3bad8332", "metadata": {}, "outputs": [], "source": [ "# now defien the pipeline\n", - "with pipeline:\n", + "with pipeline2:\n", " sum_results = add_numbers(input_stream, label=\"my_summation\")\n", " product_results = multiply_numbers(input_stream, label=\"my_product\")\n", " final_results = combine_results(\n", @@ -1360,128 +947,32 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "id": "8f146ae7", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
idsum
i64i64
011
122
233
344
455
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬─────┐\n", - "│ id ┆ sum │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═════╡\n", - "│ 0 ┆ 11 │\n", - "│ 1 ┆ 22 │\n", - "│ 2 ┆ 33 │\n", - "│ 3 ┆ 44 │\n", - "│ 4 ┆ 55 │\n", - "└─────┴─────┘" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.my_summation.df" + "outputs": [], + "source": [ + "pipeline2.my_summation.df" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "id": "8fd7bf4e", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
idproduct
i64i64
010
140
290
3160
4250
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬─────────┐\n", - "│ id ┆ product │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═════════╡\n", - "│ 0 ┆ 10 │\n", - "│ 1 ┆ 40 │\n", - "│ 2 ┆ 90 │\n", - "│ 3 ┆ 160 │\n", - "│ 4 ┆ 250 │\n", - "└─────┴─────────┘" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.my_product.df" + "outputs": [], + "source": [ + "pipeline2.my_product.df" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": null, "id": "2a918db1", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
idresult
i64str
0"Sum: 11, Product: 10"
1"Sum: 22, Product: 40"
2"Sum: 33, Product: 90"
3"Sum: 44, Product: 160"
4"Sum: 55, Product: 250"
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬───────────────────────┐\n", - "│ id ┆ result │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═════╪═══════════════════════╡\n", - "│ 0 ┆ Sum: 11, Product: 10 │\n", - "│ 1 ┆ Sum: 22, Product: 40 │\n", - "│ 2 ┆ Sum: 33, Product: 90 │\n", - "│ 3 ┆ Sum: 44, Product: 160 │\n", - "│ 4 ┆ Sum: 55, Product: 250 │\n", - "└─────┴───────────────────────┘" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.my_final_result.df" + "outputs": [], + "source": [ + "pipeline2.my_final_result.df" ] }, { From d87ab533d1539be68c8bc6efae3a20fbceba843d Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 29 Jul 2025 22:15:14 +0000 Subject: [PATCH 144/224] doc: fix typo --- notebooks/tutorials/01_orcapod_quick_exploration.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb index 99c8810..d0eb591 100644 --- a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb +++ b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb @@ -753,7 +753,7 @@ "source": [ "data_store = op.stores.BatchedDeltaTableArrowStore(base_path=\"./pipeline_data\")\n", "\n", - "pipeline = op.Pipeline(name=\"MyPipelin\", pipeline_store=data_store)" + "pipeline = op.Pipeline(name=\"my_pipeline\", pipeline_store=data_store)" ] }, { @@ -926,7 +926,7 @@ "source": [ "data_store = op.stores.BatchedDeltaTableArrowStore(base_path=\"./pipeline_data\")\n", "\n", - "pipeline2 = op.Pipeline(name=\"MyPipelin\", pipeline_store=data_store)" + "pipeline2 = op.Pipeline(name=\"my_pipeline\", pipeline_store=data_store)" ] }, { @@ -980,7 +980,7 @@ "id": "5380dad8", "metadata": {}, "source": [ - "Notice that despite just freshly creating the pipeline, each node already had results filled in! This is because the results from the previous pipeline execution was smartly fetched back. Critically, this was done only because Orcapod noticed that you had an identical pipeline with the same inputs and same operators/pods so that you can reuse the result as is. Should the structure of pipeline been different, the wront results would not be loaded." + "Notice that despite just freshly creating the pipeline, each node already had results filled in! This is because the results from the previous pipeline with the same name (`my_pipeline`) was smartly fetched back. Critically, this was done only because Orcapod noticed that you had an identical pipeline with the same inputs and same operators/pods so that you can reuse the result as is. Should the structure of pipeline been different, the wront results would not be loaded." ] } ], From e13b3f8359e05c77bf335b28f433afd5a5d897d3 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 30 Jul 2025 00:18:25 +0000 Subject: [PATCH 145/224] fix: signature of dict datagram methods --- .../data/datagrams/arrow_tag_packet.py | 1 - src/orcapod/data/datagrams/dict_datagram.py | 35 +++--- src/orcapod/data/kernels.py | 115 ++++++++++-------- src/orcapod/data/pods.py | 15 ++- 4 files changed, 90 insertions(+), 76 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index 976a392..ff11767 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -1,4 +1,3 @@ -from hmac import new import logging from collections.abc import Collection, Mapping from typing import Self diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 6cacb0c..9d6664a 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -1,4 +1,3 @@ -from curses import meta import logging from collections.abc import Collection, Iterator, Mapping from typing import Self, cast @@ -473,7 +472,7 @@ def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: return self._meta_data.get(key, default) - def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": + def with_meta_columns(self, **meta_updates: DataValue) -> Self: """ Create a new DictDatagram with updated meta columns. Maintains immutability by returning a new instance. @@ -499,15 +498,13 @@ def with_meta_columns(self, **meta_updates: DataValue) -> "DictDatagram": full_data = dict(self._data) # User data full_data.update(new_meta_data) # Meta data - return DictDatagram( + return self.__class__( data=full_data, semantic_converter=self._semantic_converter, data_context=self._data_context, ) - def drop_meta_columns( - self, *keys: str, ignore_missing: bool = False - ) -> "DictDatagram": + def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: """ Create a new DictDatagram with specified meta columns dropped. Maintains immutability by returning a new instance. @@ -544,14 +541,14 @@ def drop_meta_columns( full_data = dict(self._data) # User data full_data.update(new_meta_data) # Filtered meta data - return DictDatagram( + return self.__class__( data=full_data, semantic_converter=self._semantic_converter, data_context=self._data_context, ) # 6. Data Column Operations - def select(self, *column_names: str) -> "DictDatagram": + def select(self, *column_names: str) -> Self: """ Create a new DictDatagram with only specified data columns. Maintains immutability by returning a new instance. @@ -574,13 +571,13 @@ def select(self, *column_names: str) -> "DictDatagram": full_data = new_data # Selected user data full_data.update(self._meta_data) # Keep existing meta data - return DictDatagram( + return self.__class__( data=full_data, semantic_converter=self._semantic_converter, data_context=self._data_context, ) - def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagram": + def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: """ Create a new DictDatagram with specified data columns dropped. Maintains immutability by returning a new instance. 
@@ -607,13 +604,13 @@ def drop(self, *column_names: str, ignore_missing: bool = False) -> "DictDatagra full_data = new_data # Filtered user data full_data.update(self._meta_data) # Keep existing meta data - return DictDatagram( + return self.__class__( data=full_data, semantic_converter=self._semantic_converter, data_context=self._data_context, ) - def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": + def rename(self, column_mapping: Mapping[str, str]) -> Self: """ Create a new DictDatagram with data columns renamed. Maintains immutability by returning a new instance. @@ -647,14 +644,14 @@ def rename(self, column_mapping: Mapping[str, str]) -> "DictDatagram": full_data = new_data # Renamed user data full_data.update(self._meta_data) # Keep existing meta data - return DictDatagram( + return self.__class__( data=full_data, typespec=new_typespec, semantic_converter=self._semantic_converter, data_context=self._data_context, ) - def update(self, **updates: DataValue) -> "DictDatagram": + def update(self, **updates: DataValue) -> Self: """ Create a new DictDatagram with existing column values updated. Maintains immutability by returning a new instance. @@ -687,7 +684,7 @@ def update(self, **updates: DataValue) -> "DictDatagram": full_data = new_data # Updated user data full_data.update(self._meta_data) # Keep existing meta data - return DictDatagram( + return self.__class__( data=full_data, semantic_converter=self._semantic_converter, # Keep existing converter data_context=self._data_context, @@ -697,7 +694,7 @@ def with_columns( self, column_types: Mapping[str, type] | None = None, **updates: DataValue, - ) -> "DictDatagram": + ) -> Self: """ Create a new DictDatagram with new data columns added. Maintains immutability by returning a new instance. @@ -744,7 +741,7 @@ def with_columns( full_data = new_data # Updated user data full_data.update(self._meta_data) # Keep existing meta data - return DictDatagram( + return self.__class__( data=full_data, typespec=new_typespec, # semantic converter needs to be rebuilt for new columns @@ -752,7 +749,7 @@ def with_columns( ) # 7. Context Operations - def with_context_key(self, new_context_key: str) -> "DictDatagram": + def with_context_key(self, new_context_key: str) -> Self: """ Create a new DictDatagram with a different data context key. Maintains immutability by returning a new instance. @@ -767,7 +764,7 @@ def with_context_key(self, new_context_key: str) -> "DictDatagram": full_data = dict(self._data) # User data full_data.update(self._meta_data) # Meta data - return DictDatagram( + return self.__class__( data=full_data, data_context=new_context_key, # New context # Note: semantic_converter will be rebuilt for new context diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 702760f..eedb80b 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -63,6 +63,11 @@ def last_modified(self) -> datetime | None: def _set_modified_time( self, timestamp: datetime | None = None, invalidate: bool = False ) -> None: + """ + Sets the last modified time of the kernel. + If `invalidate` is True, it resets the last modified time to None to indicate unstable state that'd signal downstream + to recompute when using the kernel. Othewrise, sets the last modified time to the current time or to the provided timestamp. + """ if invalidate: self._last_modified = None return @@ -74,24 +79,59 @@ def _set_modified_time( @property @abstractmethod - def kernel_id(self) -> tuple[str, ...]: ... 
+ def kernel_id(self) -> tuple[str, ...]: + """ + Return a unique identifier for the kernel. + This identifier is used to track the kernel and its invocations. Kernels with distinct identifiers + are considered distinct, even if they have the same label or content. + """ + ... @abstractmethod - def forward(self, *streams: dp.Stream) -> dp.Stream: + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ - Trigger the main computation of the kernel on a collection of streams. - This method is called when the kernel is invoked with a collection of streams. - Subclasses should override this method to provide the kernel with its unique behavior + Return the output types of the kernel given the input streams. """ + ... - @abstractmethod - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: ... + def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + processed_streams = self.pre_kernel_processing(*streams) + self.validate_inputs(*processed_streams) + return self.kernel_output_types(*processed_streams) @abstractmethod def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None ) -> Any: ... + def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + # Default implementation of identity_structure for the kernel only + # concerns the kernel class and the streams if present. Subclasses of + # Kernels should override this method to provide a more meaningful + # representation of the kernel. Note that kernel must provide the notion + # of identity under possibly two distinct contexts: + # 1) identity of the kernel in itself when invoked without any stream + # 2) identity of the specific invocation of the kernel with a collection of streams + # While the latter technically corresponds to the identity of the invocation and not + # the kernel, only kernel can provide meaningful information as to the uniqueness of + # the invocation as only kernel would know if / how the input stream(s) alter the identity + # of the invocation. For example, if the kernel corresponds to an commutative computation + # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the + # equivalence of the two by returning the same identity structure for both invocations. + # This can be achieved, for example, by returning a set over the streams instead of a tuple. + if streams is not None: + streams = self.pre_kernel_processing(*streams) + self.validate_inputs(*streams) + return self.kernel_identity_structure(streams) + + @abstractmethod + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Trigger the main computation of the kernel on a collection of streams. + This method is called when the kernel is invoked with a collection of streams. + Subclasses should override this method to provide the kernel with its unique behavior + """ + def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: """ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing @@ -105,7 +145,7 @@ def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: @abstractmethod def validate_inputs(self, *streams: dp.Stream) -> None: """ - Valide the input streams before the main computation but after the pre-kernel processing + Validate the input streams before the main computation but after the pre-kernel processing """ ... 
@@ -136,31 +176,6 @@ def __call__( self.track_invocation(*processed_streams, label=label) return output_stream - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - processed_streams = self.pre_kernel_processing(*streams) - self.validate_inputs(*processed_streams) - return self.kernel_output_types(*processed_streams) - - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - # Default implementation of identity_structure for the kernel only - # concerns the kernel class and the streams if present. Subclasses of - # Kernels should override this method to provide a more meaningful - # representation of the kernel. Note that kernel must provide the notion - # of identity under possibly two distinct contexts: - # 1) identity of the kernel in itself when invoked without any stream - # 2) identity of the specific invocation of the kernel with a collection of streams - # While the latter technically corresponds to the identity of the invocation and not - # the kernel, only kernel can provide meaningful information as to the uniqueness of - # the invocation as only kernel would know if / how the input stream(s) alter the identity - # of the invocation. For example, if the kernel corresponds to an commutative computation - # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the - # equivalence of the two by returning the same identity structure for both invocations. - # This can be achieved, for example, by returning a set over the streams instead of a tuple. - if streams is not None: - streams = self.pre_kernel_processing(*streams) - self.validate_inputs(*streams) - return self.kernel_identity_structure(streams) - def __repr__(self): return self.__class__.__name__ @@ -172,9 +187,13 @@ def __str__(self): class WrappedKernel(TrackedKernelBase): """ - A wrapper for a kernel that allows it to be used as a stream source. - This is useful for cases where you want to use a kernel as a source of data - in a pipeline or other data processing context. + A wrapper for a kernels useful when you want to use an existing kernel + but need to provide some extra functionality. + + Default implementation provides a simple passthrough to the wrapped kernel. + If you want to provide a custom behavior, be sure to override the methods + that you want to change. Note that the wrapped kernel must implement the + `Kernel` protocol. Refer to `orcapod.protocols.data_protocols.Kernel` for more details. """ def __init__(self, kernel: dp.Kernel, **kwargs) -> None: @@ -182,10 +201,6 @@ def __init__(self, kernel: dp.Kernel, **kwargs) -> None: super().__init__(**kwargs) self.kernel = kernel - @property - def kernel_id(self) -> tuple[str, ...]: - return self.kernel.kernel_id - def computed_label(self) -> str | None: """ Compute a label for this kernel based on its content. 
@@ -194,22 +209,26 @@ def computed_label(self) -> str | None: """ return self.kernel.label - def forward(self, *streams: dp.Stream) -> dp.Stream: - return self.kernel.forward(*streams) + @property + def kernel_id(self) -> tuple[str, ...]: + return self.kernel.kernel_id def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: return self.kernel.output_types(*streams) + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return self.kernel.identity_structure(streams) + def validate_inputs(self, *streams: dp.Stream) -> None: - pass + return self.kernel.validate_inputs(*streams) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + return self.kernel.forward(*streams) def __repr__(self): return f"WrappedKernel({self.kernel!r})" def __str__(self): return f"WrappedKernel:{self.kernel!s}" - - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - return self.kernel.identity_structure(streams) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index d6fe933..087c2f7 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -1,28 +1,27 @@ -from datetime import datetime, timezone import logging import sys from abc import abstractmethod from collections.abc import Callable, Collection, Iterable, Sequence -from typing import Any, Literal, cast, TYPE_CHECKING +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any, Literal +from orcapod.data.context import DataContext from orcapod.data.datagrams import ( - DictPacket, ArrowPacket, + DictPacket, ) -from orcapod.data.context import DataContext from orcapod.data.kernels import KernelStream, TrackedKernelBase from orcapod.data.operators import Join -from orcapod.data.streams import LazyPodResultStream, PodStream +from orcapod.data.streams import LazyPodResultStream +from orcapod.data.system_constants import orcapod_constants as constants from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore from orcapod.types import TypeSpec +from orcapod.types import typespec_utils as tsutils from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter -from orcapod.types import typespec_utils as tsutils -from orcapod.utils import arrow_utils -from orcapod.data.system_constants import orcapod_constants as constants from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: From fc6c828c0dddb056a745eccd3cd4c5848523831b Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 30 Jul 2025 01:56:37 +0000 Subject: [PATCH 146/224] feat: add more operators and improved stream integration --- src/orcapod/data/operators.py | 424 ++++++++++++++++++++---- src/orcapod/data/streams.py | 52 ++- src/orcapod/protocols/data_protocols.py | 48 +++ 3 files changed, 457 insertions(+), 67 deletions(-) diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 9fbf3a5..42b3532 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -6,8 +6,9 @@ from abc import abstractmethod from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection +from collections.abc import Collection, Mapping from orcapod.errors import InputValidationError +from orcapod.data.system_constants import orcapod_constants as constants if TYPE_CHECKING: import pyarrow as pa @@ -24,6 +25,20 @@ class Operator(TrackedKernelBase): and returns a new stream as output (note that output stream is always singular). """ + def __init__(self, **kwargs: Any): + super().__init__(**kwargs) + self._operator_hash = self.data_context.object_hasher.hash_to_hex( + self.identity_structure(), prefix_hasher_id=True + ) + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Returns a unique identifier for the kernel. + This is used to identify the kernel in the computational graph. + """ + return (f"{self.__class__.__name__}", self._operator_hash) + class NonZeroInputOperator(Operator): """ @@ -179,7 +194,11 @@ def op_output_types( ... @abstractmethod - def op_identity_structure(self, *streams: dp.Stream) -> Any: + def op_identity_structure( + self, + left_stream: dp.Stream | None = None, + right_stream: dp.Stream | None = None, + ) -> Any: """ This method should be implemented by subclasses to return a structure that represents the identity of the operator. It takes two streams as input and returns a tuple containing the operator name and a set of streams. @@ -187,67 +206,81 @@ def op_identity_structure(self, *streams: dp.Stream) -> Any: ... -class BinaryJoin(BinaryOperator): - def op_identity_structure(self, *streams: dp.Stream) -> Any: - # Join does not depend on the order of the streams -- convert it onto a set - id_struct = (self.__class__.__name__,) - if len(streams) == 2: - id_struct += (set(streams),) - return id_struct +class UnaryOperator(Operator): + """ + Base class for all operators. + """ - def op_forward( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> ImmutableTableStream: + def check_unary_input( + self, + streams: Collection[dp.Stream], + ) -> None: """ - Joins two streams together based on their tags. - The resulting stream will contain all the tags from both streams. + Check that the inputs to the unary operator are valid. """ + if len(streams) != 1: + raise ValueError("UnaryOperator requires exactly one input stream.") - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() + def validate_inputs(self, *streams: dp.Stream) -> None: + self.check_unary_input(streams) + stream = streams[0] + return self.op_validate_inputs(stream) - common_tag_keys = tuple( - intersection_typespecs(left_tag_typespec, right_tag_typespec).keys() - ) - joined_tag_keys = tuple( - union_typespecs(left_tag_typespec, right_tag_typespec).keys() - ) + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for unary operators. + It expects exactly one stream as input. 
+ """ + stream = streams[0] + return self.op_forward(stream) - # performing a check to ensure that packets are compatible - union_typespecs(left_packet_typespec, right_packet_typespec) + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + stream = streams[0] + return self.op_output_types(stream) - joined_table = left_stream.as_table().join( - right_stream.as_table(), - keys=common_tag_keys, - join_type="inner", - ) + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. + """ + if streams is not None: + stream = list(streams)[0] + self.op_identity_structure(stream) + return self.op_identity_structure() - return ImmutableTableStream( - joined_table, - tag_columns=tuple(joined_tag_keys), - source=self, - upstreams=(left_stream, right_stream), - ) + @abstractmethod + def op_validate_inputs(self, stream: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... - def op_output_types(self, left_stream, right_stream) -> tuple[TypeSpec, TypeSpec]: - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() - joined_tag_typespec = union_typespecs(left_tag_typespec, right_tag_typespec) - joined_packet_typespec = union_typespecs( - left_packet_typespec, right_packet_typespec - ) - return joined_tag_typespec, joined_packet_typespec + @abstractmethod + def op_forward(self, stream: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the binary operator. + It takes two streams as input and returns a new stream as output. + """ + ... - def op_validate_inputs( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> None: - try: - self.op_output_types(left_stream, right_stream) - except Exception as e: - raise InputValidationError(f"Input streams are not compatible: {e}") + @abstractmethod + def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes two streams as input and returns a tuple of typespecs. + """ + ... - def __repr__(self) -> str: - return "Join()" + @abstractmethod + def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes two streams as input and returns a tuple containing the operator name and a set of streams. + """ + ... class Join(NonZeroInputOperator): @@ -276,7 +309,13 @@ def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: for other_stream in streams[1:]: other_tag_typespec, other_packet_typespec = other_stream.types() tag_typespec = union_typespecs(tag_typespec, other_tag_typespec) - packet_typespec = union_typespecs(packet_typespec, other_packet_typespec) + packet_typespec = intersection_typespecs( + packet_typespec, other_packet_typespec + ) + if packet_typespec: + raise InputValidationError( + f"Packets should not have overlapping keys, but {packet_typespec.keys()} found in {stream} and {other_stream}." 
+ ) return tag_typespec, packet_typespec @@ -335,9 +374,276 @@ def __repr__(self) -> str: return "Join()" -def op_identity_structure(self, *streams: dp.Stream) -> Any: - # Join does not depend on the order of the streams -- convert it onto a set - id_struct = (self.__class__.__name__,) - if len(streams) > 0: - id_struct += (set(streams),) - return id_struct +class SemiJoin(BinaryOperator): + """ + Binary operator that performs a semi-join between two streams. + + A semi-join returns only the entries from the left stream that have + matching entries in the right stream, based on equality of values + in overlapping columns (columns with the same name and compatible types). + + If there are no overlapping columns between the streams, the entire + left stream is returned unchanged. + + The output stream preserves the schema of the left stream exactly. + """ + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Returns a unique identifier for the kernel. + This is used to identify the kernel in the computational graph. + """ + return (f"{self.__class__.__name__}",) + + def op_identity_structure( + self, + left_stream: dp.Stream | None = None, + right_stream: dp.Stream | None = None, + ) -> Any: + """ + Return a structure that represents the identity of this operator. + Unlike Join, SemiJoin depends on the order of streams (left vs right). + """ + id_struct = (self.__class__.__name__,) + if left_stream is not None and right_stream is not None: + # Order matters for semi-join: (left_stream, right_stream) + id_struct += (left_stream, right_stream) + return id_struct + + def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: + """ + Performs a semi-join between left and right streams. + Returns entries from left stream that have matching entries in right stream. + """ + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + + # Find overlapping columns across all columns (tags + packets) + left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) + right_all_typespec = union_typespecs(right_tag_typespec, right_packet_typespec) + + common_keys = tuple( + intersection_typespecs(left_all_typespec, right_all_typespec).keys() + ) + + # If no overlapping columns, return the left stream unmodified + if not common_keys: + return left_stream + + # include source info for left stream + left_table = left_stream.as_table(include_source=True) + + # Get the right table for matching + right_table = right_stream.as_table() + + # Perform left semi join using PyArrow's built-in functionality + semi_joined_table = left_table.join( + right_table, + keys=list(common_keys), + join_type="left semi", + ) + + return ImmutableTableStream( + semi_joined_table, + tag_columns=tuple(left_tag_typespec.keys()), + source=self, + upstreams=(left_stream, right_stream), + ) + + def op_output_types( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the output types for the semi-join operation. + The output preserves the exact schema of the left stream. + """ + # Semi-join preserves the left stream's schema exactly + return left_stream.types() + + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + """ + Validates that the input streams are compatible for semi-join. + Checks that overlapping columns have compatible types. 
+ """ + try: + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + + # Check that overlapping columns have compatible types across all columns + left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) + right_all_typespec = union_typespecs( + right_tag_typespec, right_packet_typespec + ) + + # intersection_typespecs will raise an error if types are incompatible + intersection_typespecs(left_all_typespec, right_all_typespec) + + except Exception as e: + raise InputValidationError( + f"Input streams are not compatible for semi-join: {e}" + ) from e + + def __repr__(self) -> str: + return "SemiJoin()" + + +class MapPackets(UnaryOperator): + """ + Operator that maps packets in a stream using a user-defined function. + The function is applied to each packet in the stream, and the resulting packets + are returned as a new stream. + """ + + def __init__( + self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs + ): + self.name_map = dict(name_map) + self.drop_unmapped = drop_unmapped + super().__init__(**kwargs) + + def op_forward(self, stream: dp.Stream) -> dp.Stream: + tag_columns, packet_columns = stream.keys() + + if not any(n in packet_columns for n in self.name_map): + # nothing to rename in the packet, return stream as is + return stream + + table = stream.as_table(include_source=True) + + name_map = {tc: tc for tc in tag_columns} # no renaming on tag columns + for c in packet_columns: + if c in self.name_map: + name_map[c] = self.name_map[c] + name_map[f"{constants.SOURCE_PREFIX}{c}"] = ( + f"{constants.SOURCE_PREFIX}{self.name_map[c]}" + ) + else: + name_map[c] = c + + renamed_table = table.rename_columns(name_map) + return ImmutableTableStream( + renamed_table, tag_columns=tag_columns, source=self, upstreams=(stream,) + ) + + def op_validate_inputs(self, stream: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # verify that renamed value does NOT collide with other columns + tag_columns, packet_columns = stream.keys() + relevant_source = [] + relevant_target = [] + for source, target in self.name_map.items(): + if source in packet_columns: + relevant_source.append(source) + relevant_target.append(target) + remaining_packet_columns = set(packet_columns) - set(relevant_source) + overlapping_packet_columns = remaining_packet_columns.intersection( + relevant_target + ) + overlapping_tag_columns = set(tag_columns).intersection(relevant_target) + + if overlapping_packet_columns or overlapping_tag_columns: + message = f"Renaming {self.name_map} would cause collisions with existing columns: " + if overlapping_packet_columns: + message += f"overlapping packet columns: {overlapping_packet_columns}, " + if overlapping_tag_columns: + message += f"overlapping tag columns: {overlapping_tag_columns}." 
+ raise InputValidationError(message) + + def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, packet_typespec = stream.types() + + # Create new packet typespec with renamed keys + new_packet_typespec = { + self.name_map.get(k, k): v for k, v in packet_typespec.items() + } + + return tag_typespec, new_packet_typespec + + def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.name_map, + self.drop_unmapped, + ) + ((stream,) if stream is not None else ()) + + +class MapTags(UnaryOperator): + """ + Operator that maps tags in a stream using a user-defined function. + The function is applied to each tag in the stream, and the resulting tags + are returned as a new stream. + """ + + def __init__( + self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs + ): + self.name_map = dict(name_map) + self.drop_unmapped = drop_unmapped + super().__init__(**kwargs) + + def op_forward(self, stream: dp.Stream) -> dp.Stream: + tag_columns, packet_columns = stream.keys() + + if not any(n in tag_columns for n in self.name_map): + # nothing to rename in the tags, return stream as is + return stream + + table = stream.as_table(include_source=True) + + name_map = { + tc: self.name_map.get(tc, tc) for tc in tag_columns + } # rename the tag as necessary + new_tag_columns = [name_map[tc] for tc in tag_columns] + for c in packet_columns: + name_map[c] = c # no renaming on packet columns + + renamed_table = table.rename_columns(name_map) + return ImmutableTableStream( + renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) + ) + + def op_validate_inputs(self, stream: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # verify that renamed value does NOT collide with other columns + tag_columns, packet_columns = stream.keys() + relevant_source = [] + relevant_target = [] + for source, target in self.name_map.items(): + if source in tag_columns: + relevant_source.append(source) + relevant_target.append(target) + remaining_tag_columns = set(tag_columns) - set(relevant_source) + overlapping_tag_columns = remaining_tag_columns.intersection(relevant_target) + overlapping_packet_columns = set(packet_columns).intersection(relevant_target) + + if overlapping_tag_columns or overlapping_packet_columns: + message = f"Renaming {self.name_map} would cause collisions with existing columns: " + if overlapping_tag_columns: + message += f"overlapping tag columns: {overlapping_tag_columns}." + if overlapping_packet_columns: + message += f"overlapping packet columns: {overlapping_packet_columns}." 
+ raise InputValidationError(message) + + def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, packet_typespec = stream.types() + + # Create new packet typespec with renamed keys + new_tag_typespec = {self.name_map.get(k, k): v for k, v in tag_typespec.items()} + + return new_tag_typespec, packet_typespec + + def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.name_map, + self.drop_unmapped, + ) + ((stream,) if stream is not None else ()) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 6517257..499b6a0 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,9 +1,9 @@ import logging -from pathlib import Path from abc import ABC, abstractmethod -from collections.abc import Collection, Iterator +from collections.abc import Collection, Iterator, Mapping from datetime import datetime, timezone from itertools import repeat +from pathlib import Path from typing import TYPE_CHECKING, Any from orcapod.data.base import LabeledContentIdentifiableBase @@ -170,12 +170,6 @@ def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ return [e for e in self] - # --------------------- Recursive methods --------------------------- - # These methods form a step in the multi-class recursive invocation that follows the pattern of - # Stream -> Invocation -> Kernel -> Stream ... -> Invocation -> Kernel - # Most of the method logic would be found in Kernel's implementation of the method with - # Stream and Invocation simply serving as recursive steps - def identity_structure(self) -> Any: """ Identity structure of a stream is deferred to the identity structure @@ -188,6 +182,48 @@ def identity_structure(self) -> Any: return self.source.identity_structure(self.upstreams) return super().identity_structure() + # ---------------Built in operators---------------- + + def join(self, other_stream: dp.Stream) -> dp.Stream: + """ + Joins this stream with another stream, returning a new stream that contains + the combined data from both streams. + """ + from orcapod.data.operators import Join + + return Join()(self, other_stream) + + def semi_join(self, other_stream: dp.Stream) -> dp.Stream: + """ + Performs a semi-join with another stream, returning a new stream that contains + only the packets from this stream that have matching tags in the other stream. + """ + from orcapod.data.operators import SemiJoin + + return SemiJoin()(self, other_stream) + + def map_tags( + self, name_map: Mapping[str, str], drop_unmapped: bool = True + ) -> dp.Stream: + """ + Maps the tags in this stream according to the provided name_map. + If drop_unmapped is True, any tags that are not in the name_map will be dropped. + """ + from orcapod.data.operators import MapTags + + return MapTags(name_map, drop_unmapped)(self) + + def map_packets( + self, packet_map: Mapping[str, str], drop_unmapped: bool = True + ) -> dp.Stream: + """ + Maps the packets in this stream according to the provided packet_map. + If drop_unmapped is True, any packets that are not in the packet_map will be dropped. 
+ """ + from orcapod.data.operators import MapPackets + + return MapPackets(packet_map, drop_unmapped)(self) + class ImmutableTableStream(StreamBase): """ diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index d334e14..ac436b3 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1158,6 +1158,54 @@ def flow(self) -> Collection[tuple[Tag, Packet]]: """ ... + def join(self, other_stream: "Stream") -> "Stream": + """ + Join this stream with another stream. + + Combines two streams into a single stream by merging their content. + The resulting stream contains all (tag, packet) pairs from both + streams, preserving their order. + + Args: + other_stream: The other stream to join with this one. + + Returns: + Self: New stream containing combined content from both streams. + """ + ... + + def semi_join(self, other_stream: "Stream") -> "Stream": + """ + Perform a semi-join with another stream. + + This operation filters this stream to only include packets that have + corresponding tags in the other stream. The resulting stream contains + all (tag, packet) pairs from this stream that match tags in the other. + + Args: + other_stream: The other stream to semi-join with this one. + + Returns: + Self: New stream containing filtered content based on the semi-join. + """ + ... + + def map_tags( + self, name_map: Mapping[str, str], drop_unmapped: bool = True + ) -> "Stream": + """ + Map tag names in this stream to new names based on the provided mapping. + """ + ... + + def map_packets( + self, name_map: Mapping[str, str], drop_unmapped: bool = True + ) -> "Stream": + """ + Map packet names in this stream to new names based on the provided mapping. + """ + ... + class LiveStream(Stream, Protocol): """ From b10669d61a75a34a5e08dadac2002aa3e34a7adf Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 30 Jul 2025 01:59:39 +0000 Subject: [PATCH 147/224] fix: mismatched protocol signature --- src/orcapod/data/streams.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 499b6a0..832e2e9 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -214,7 +214,7 @@ def map_tags( return MapTags(name_map, drop_unmapped)(self) def map_packets( - self, packet_map: Mapping[str, str], drop_unmapped: bool = True + self, name_map: Mapping[str, str], drop_unmapped: bool = True ) -> dp.Stream: """ Maps the packets in this stream according to the provided packet_map. @@ -222,7 +222,7 @@ def map_packets( """ from orcapod.data.operators import MapPackets - return MapPackets(packet_map, drop_unmapped)(self) + return MapPackets(name_map, drop_unmapped)(self) class ImmutableTableStream(StreamBase): From fce4420979d035676cf306621cb5274a638a0b71 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Wed, 30 Jul 2025 05:09:19 +0000 Subject: [PATCH 148/224] refactor: move kernel id logic to kernel base and add clean source base implementation --- src/orcapod/data/kernels.py | 25 ++-- src/orcapod/data/operators.py | 14 -- src/orcapod/data/sources.py | 271 +++++++++++++++++++++++----------- 3 files changed, 198 insertions(+), 112 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index eedb80b..ad4c900 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -41,8 +41,23 @@ def __init__( self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self._last_modified = None + self._kernel_hash = None self._set_modified_time() + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Returns a unique identifier for the kernel. + This is used to identify the kernel in the computational graph. + """ + if self._kernel_hash is None: + # If the kernel hash is not set, compute it based on the class name and label. + # This is a simple way to ensure that each kernel has a unique identifier. + self._kernel_hash = self.data_context.object_hasher.hash_to_hex( + self.identity_structure(), prefix_hasher_id=True + ) + return (f"{self.__class__.__name__}", self._kernel_hash) + @property def data_context(self) -> DataContext: return self._data_context @@ -77,16 +92,6 @@ def _set_modified_time( else: self._last_modified = datetime.now(timezone.utc) - @property - @abstractmethod - def kernel_id(self) -> tuple[str, ...]: - """ - Return a unique identifier for the kernel. - This identifier is used to track the kernel and its invocations. Kernels with distinct identifiers - are considered distinct, even if they have the same label or content. - """ - ... - @abstractmethod def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py index 42b3532..d18de9a 100644 --- a/src/orcapod/data/operators.py +++ b/src/orcapod/data/operators.py @@ -25,20 +25,6 @@ class Operator(TrackedKernelBase): and returns a new stream as output (note that output stream is always singular). """ - def __init__(self, **kwargs: Any): - super().__init__(**kwargs) - self._operator_hash = self.data_context.object_hasher.hash_to_hex( - self.identity_structure(), prefix_hasher_id=True - ) - - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Returns a unique identifier for the kernel. - This is used to identify the kernel in the computational graph. 
- """ - return (f"{self.__class__.__name__}", self._operator_hash) - class NonZeroInputOperator(Operator): """ diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index ec60cb1..d1b627b 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -1,23 +1,22 @@ from abc import abstractmethod - -from pyarrow.lib import Table -from orcapod.data.kernels import TrackedKernelBase -from orcapod.protocols import data_protocols as dp from collections.abc import Collection, Iterator -from orcapod.data.streams import ImmutableTableStream -from typing import TYPE_CHECKING -from orcapod.utils.lazy_module import LazyModule -from orcapod.types import TypeSpec from datetime import datetime -from orcapod.types import schemas +from pathlib import Path +from typing import TYPE_CHECKING, Any + from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError -from pathlib import Path +from pyarrow.lib import Table +from orcapod.data.kernels import TrackedKernelBase +from orcapod.data.streams import ImmutableTableStream, KernelStream +from orcapod.protocols import data_protocols as dp +from orcapod.types import TypeSpec, schemas +from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: - import polars as pl import pandas as pd + import polars as pl import pyarrow as pa else: pl = LazyModule("polars") @@ -26,24 +25,96 @@ class SourceBase(TrackedKernelBase): - @property - def tag_keys(self) -> tuple[str, ...]: + """ + Base class for sources that act as both Kernels and LiveStreams. + + Design Philosophy: + 1. Source is fundamentally a Kernel (data loader) + 2. forward() returns static snapshots as a stream (pure computation) + 3. __call__() returns a cached KernelStream (live, tracked) + 4. All stream methods delegate to the cached KernelStream + + This ensures that direct source iteration and source() iteration + are identical and both benefit from KernelStream's lifecycle management. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # Cache the KernelStream for reuse across all stream method calls + self._cached_kernel_stream: KernelStream | None = None + + # =========================== Kernel Methods =========================== + + @abstractmethod + def forward(self, *streams: dp.Stream) -> dp.Stream: """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. + Pure computation: return a static snapshot of the data. + + This is the core method that subclasses must implement. + Each call should return a fresh stream representing the current state of the data. + This is what KernelStream calls when it needs to refresh its data. """ - tag_keys, _ = self.keys() - return tag_keys + ... + + @abstractmethod + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """Return the tag and packet types this source produces.""" + ... + + @abstractmethod + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> dp.Any: ... 
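As a rough usage sketch of the contract described in the SourceBase docstring above (hypothetical class and column names, not part of this patch), a concrete source would typically only supply `forward`, `kernel_output_types`, and `kernel_identity_structure`; iteration, table access, and refresh behavior then come through the cached KernelStream:

from collections.abc import Collection
from typing import Any

import pyarrow as pa

from orcapod.data.sources import SourceBase
from orcapod.data.streams import ImmutableTableStream
from orcapod.protocols import data_protocols as dp
from orcapod.types import TypeSpec


class InMemorySource(SourceBase):
    """Hypothetical source backed by a list of row dicts (illustration only)."""

    def __init__(self, rows: list[dict], tag_columns: list[str], **kwargs):
        super().__init__(**kwargs)
        self.rows = list(rows)
        self.tag_columns = list(tag_columns)

    def kernel_identity_structure(
        self, streams: Collection[dp.Stream] | None = None
    ) -> Any:
        # Identity depends only on the source's own configuration.
        return (self.__class__.__name__, tuple(self.tag_columns))

    def forward(self, *streams: dp.Stream) -> dp.Stream:
        # Pure computation: a fresh static snapshot of the current rows.
        table = pa.Table.from_pylist(self.rows)
        return ImmutableTableStream(
            table, tag_columns=self.tag_columns, source=self, upstreams=()
        )

    def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]:
        # Derive tag/packet types from a snapshot.
        return self.forward().types()


# Direct iteration over the source and explicit invocation both go through
# the same cached KernelStream, so they yield the same (tag, packet) pairs:
# src = InMemorySource([{"subject": "a", "value": 1}], tag_columns=["subject"])
# for tag, packet in src: ...
# for tag, packet in src(): ...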
+ + def validate_inputs(self, *streams: dp.Stream) -> None: + """Sources take no input streams.""" + if len(streams) > 0: + raise ValueError( + f"{self.__class__.__name__} is a source and takes no input streams" + ) + + def prepare_output_stream( + self, *streams: dp.Stream, label: str | None = None + ) -> KernelStream: + if self._cached_kernel_stream is None: + self._cached_kernel_stream = super().prepare_output_stream( + *streams, label=label + ) + return self._cached_kernel_stream + + # ==================== Stream Protocol (Delegation) ==================== @property - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - # TODO: consider caching this - _, packet_keys = self.keys() - return packet_keys + def source(self) -> dp.Kernel | None: + """Sources are their own source.""" + return self + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + """Sources have no upstream dependencies.""" + return () + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """Delegate to the cached KernelStream.""" + return self().keys() + + def types(self) -> tuple[TypeSpec, TypeSpec]: + """Delegate to the cached KernelStream.""" + return self().types() + + @property + def last_modified(self): + """Delegate to the cached KernelStream.""" + return self().last_modified + + @property + def is_current(self) -> bool: + """Delegate to the cached KernelStream.""" + return self().is_current + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """Delegate to the cached KernelStream.""" + return self().iter_packets() def as_table( self, @@ -51,6 +122,7 @@ def as_table( include_source: bool = False, include_content_hash: bool | str = False, ) -> "pa.Table": + """Delegate to the cached KernelStream.""" return self().as_table( include_data_context=include_data_context, include_source=include_source, @@ -58,77 +130,74 @@ def as_table( ) def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + """Delegate to the cached KernelStream.""" return self().flow() - def get_all_records( - self, include_system_columns: bool = False - ) -> "pa.Table | None": - """ - Retrieve all records associated with the node. - If include_system_columns is True, system columns will be included in the result. - """ - raise NotImplementedError("This method should be implemented by subclasses.") + # ==================== LiveStream Protocol (Delegation) ==================== - @property - def lazy(self) -> "pl.LazyFrame | None": - records = self.get_all_records(include_system_columns=False) - if records is not None: - return pl.LazyFrame(records) - return None + def refresh(self, force: bool = False) -> bool: + """Delegate to the cached KernelStream.""" + return self().refresh(force=force) - @property - def df(self) -> "pl.DataFrame | None": - """ - Return the DataFrame representation of the pod's records. - """ - lazy_df = self.lazy - if lazy_df is not None: - return lazy_df.collect() - return None + def invalidate(self) -> None: + """Delegate to the cached KernelStream.""" + return self().invalidate() - @property - def polars_df(self) -> "pl.DataFrame | None": - """ - Return the DataFrame representation of the pod's records. - """ - return self.df + # ==================== Source-Specific Utilities ==================== - @property - def pandas_df(self) -> "pd.DataFrame | None": + def reset_cache(self) -> None: """ - Return the pandas DataFrame representation of the pod's records. 
+ Clear the cached KernelStream, forcing a fresh one on next access. + + Useful when the underlying data source has fundamentally changed + (e.g., file path changed, database connection reset). """ - records = self.get_all_records(include_system_columns=False) - if records is not None: - pandas_df = records.to_pandas() - pandas_df.set_index(list(self.tag_keys), inplace=True) - return pandas_df - return None + if self._cached_kernel_stream is not None: + self._cached_kernel_stream.invalidate() + self._cached_kernel_stream = None - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - tag_types, packet_types = self.output_types() - return tuple(tag_types.keys()), tuple(packet_types.keys()) - def types(self) -> tuple[TypeSpec, TypeSpec]: - return self.output_types() +# ==================== Example Implementation ==================== - def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - return self().__iter__() - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - return self().iter_packets() +class CSVSource(SourceBase): + """Loads data from a CSV file.""" - # properties and methods to act as a dp.Stream - @property - def source(self) -> dp.Kernel | None: - return self + def __init__(self, file_path: str, tag_columns: list[str] | None = None, **kwargs): + super().__init__(**kwargs) + self.file_path = file_path + self.tag_columns = tag_columns or [] - @property - def upstreams(self) -> tuple[dp.Stream, ...]: - return () + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return (self.__class__.__name__, self.file_path, tuple(self.tag_columns)) - def validate_inputs(self, *processed_streams: dp.Stream) -> None: - pass + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Load data from file and return a static stream. + + This is called by forward() and creates a fresh snapshot each time. + """ + import pyarrow.csv as csv + + from orcapod.data.streams import ImmutableTableStream + + # Load current state of the file + table = csv.read_csv(self.file_path) + + return ImmutableTableStream( + table=table, + tag_columns=self.tag_columns, + source=self, + upstreams=(), + ) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """Infer types from the file (could be cached).""" + # For demonstration - in practice you might cache this + sample_stream = self.forward() + return sample_stream.types() class ManualDeltaTableSource(SourceBase): @@ -151,7 +220,7 @@ def __init__( self.table_path = table_path self._delta_table: DeltaTable | None = None - self.refresh() + self.load_delta_table() if self._delta_table is None: if schema is None: @@ -219,10 +288,6 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: ).to_table() return ImmutableTableStream(arrow_data, self.tag_columns, source=self) - @property - def kernel_id(self) -> tuple[str, ...]: - return (self.__class__.__name__, str(self.table_path)) - def kernel_identity_structure(self, streams: Collection[dp.Stream] | None = None): """ Return the identity structure of the kernel. @@ -276,9 +341,9 @@ def insert( # update the delta table self._delta_table = DeltaTable(self.table_path) - def refresh(self) -> None: + def load_delta_table(self) -> None: """ - Refresh the delta table to ensure it is up-to-date. + Try loading the delta table from the file system. 
""" current_version = self.delta_table_version try: @@ -294,3 +359,33 @@ def refresh(self) -> None: # delta table has been updated self._set_modified_time() self._delta_table = delta_table + + +class GlobSource(SourceBase): + """ + A source that reads files from the file system using a glob pattern. + It generates its own stream from the file system. + """ + + def __init__( + self, + name: str, + file_path: str | Path, + glob_pattern: str, + **kwargs, + ): + super().__init__(name=name, **kwargs) + self.file_path = Path(file_path) + self.glob_pattern = glob_pattern + + @staticmethod + def default_tag_function(file_path: Path) -> dict: + return {"file_path": str(file_path)} + + def kernel_identity_structure(self, streams: Collection[dp.Stream] | None = None): + hash_function_kwargs = { + "include_declaration": True, + "include_source": True, + "include_content_hash": True, + "include_data_context": True, + } From f956fea25b0ed5510b23c5eb01fca1caf2d24a10 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 30 Jul 2025 06:05:00 +0000 Subject: [PATCH 149/224] feat: complete implementation of manual source and adjustment of source protocol --- src/orcapod/data/sources.py | 303 +++++++++++++++++++++--- src/orcapod/pipeline/nodes.py | 6 +- src/orcapod/protocols/data_protocols.py | 44 +++- 3 files changed, 312 insertions(+), 41 deletions(-) diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index d1b627b..c9b39c9 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -2,7 +2,7 @@ from collections.abc import Collection, Iterator from datetime import datetime from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError @@ -112,6 +112,14 @@ def is_current(self) -> bool: """Delegate to the cached KernelStream.""" return self().is_current + def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """ + Iterate over the cached KernelStream. + + This allows direct iteration over the source as if it were a stream. + """ + return self().iter_packets() + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: """Delegate to the cached KernelStream.""" return self().iter_packets() @@ -143,7 +151,53 @@ def invalidate(self) -> None: """Delegate to the cached KernelStream.""" return self().invalidate() - # ==================== Source-Specific Utilities ==================== + # ==================== Source Protocol ==================== + + @abstractmethod + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all records from the source. + + This method should be implemented by subclasses to return the full dataset. + If the source has no records, return None. + """ + ... + + @property + def lazy(self) -> "pl.LazyFrame | None": + records = self.get_all_records(include_system_columns=False) + if records is not None: + return pl.LazyFrame(records) + return None + + @property + def df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + lazy_df = self.lazy + if lazy_df is not None: + return lazy_df.collect() + return None + + @property + def polars_df(self) -> "pl.DataFrame | None": + """ + Return the DataFrame representation of the pod's records. + """ + return self.df + + @property + def pandas_df(self) -> "pd.DataFrame | None": + """ + Return the pandas DataFrame representation of the pod's records. 
+ """ + records = self.get_all_records(include_system_columns=False) + if records is not None: + return records.to_pandas() + return None def reset_cache(self) -> None: """ @@ -200,10 +254,20 @@ def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: return sample_stream.types() +class DuplicateTagError(ValueError): + """Raised when duplicate tag values are found and skip_duplicates=False""" + + pass + + class ManualDeltaTableSource(SourceBase): """ A source that allows manual delta updates to a table. This is useful for testing and debugging purposes. + + Supports duplicate tag handling: + - skip_duplicates=True: Use merge operation to only insert new tag combinations + - skip_duplicates=False: Raise error if duplicate tags would be created """ def __init__( @@ -218,7 +282,7 @@ def __init__( """ super().__init__(**kwargs) - self.table_path = table_path + self.table_path = Path(table_path) self._delta_table: DeltaTable | None = None self.load_delta_table() @@ -239,12 +303,12 @@ def __init__( fields = [] for field in arrow_schema: if field.name in tag_columns: - field = field.with_metadata({b"table": b"True"}) + field = field.with_metadata({b"tag": b"True"}) fields.append(field) arrow_schema = pa.schema(fields) else: - arrow_schema = pa.schema(self._delta_table.schema().to_arrow()) # type: ignore + arrow_schema = pa.schema(self._delta_table.schema().to_arrow()) python_schema = schemas.PythonSchema.from_arrow_schema( arrow_schema, self.data_context.semantic_type_registry ) @@ -252,16 +316,18 @@ def __init__( for field in arrow_schema: if ( field.metadata is not None - and field.metadata.get(b"table", b"False").decode().lower() - == "true" + and field.metadata.get(b"tag", b"False").decode().lower() == "true" ): inferred_tag_columns.append(field.name) tag_columns = tag_columns or inferred_tag_columns + self.python_schema = python_schema self.arrow_schema = arrow_schema - self.tag_columns = tag_columns + self.tag_columns = list(tag_columns) if tag_columns else [] - self._is_current = True + @property + def kernel_id(self) -> tuple[str, ...]: + return (self.__class__.__name__, str(self.table_path)) @property def delta_table_version(self) -> int | None: @@ -273,22 +339,25 @@ def delta_table_version(self) -> int | None: return self._delta_table.version() return None - @property - def is_current(self) -> bool: - return self._is_current - def forward(self, *streams: dp.Stream) -> dp.Stream: + """Load current delta table data as a stream.""" if len(streams) > 0: raise ValueError("ManualDeltaTableSource takes no input streams") + if self._delta_table is None: arrow_data = pa.Table.from_pylist([], schema=self.arrow_schema) else: arrow_data = self._delta_table.to_pyarrow_dataset( as_large_types=True ).to_table() - return ImmutableTableStream(arrow_data, self.tag_columns, source=self) - def kernel_identity_structure(self, streams: Collection[dp.Stream] | None = None): + return ImmutableTableStream( + arrow_data, tag_columns=self.tag_columns, source=self, upstreams=() + ) + + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: """ Return the identity structure of the kernel. This is a unique identifier for the kernel based on its class name and table path. 
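A short, hypothetical usage sketch of the duplicate handling described in the class docstring above; the construction arguments (schema as a column-to-Python-type mapping) and the table path are assumptions, while `insert` and `DuplicateTagError` are the ones defined later in this diff:

from orcapod.data.sources import DuplicateTagError, ManualDeltaTableSource

# Assumed construction: a schema with a single tag column "subject".
source = ManualDeltaTableSource(
    "data/manual_table",
    schema={"subject": str, "score": int},
    tag_columns=["subject"],
)

source.insert({"subject": "a", "score": 1})

# Same tag combination again: with the default skip_duplicates=False this raises.
try:
    source.insert({"subject": "a", "score": 2})
except DuplicateTagError:
    pass

# With skip_duplicates=True the merge path silently ignores the duplicate row.
source.insert({"subject": "a", "score": 2}, skip_duplicates=True)

print(source.df)  # one row: subject="a", score=1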
@@ -296,6 +365,7 @@ def kernel_identity_structure(self, streams: Collection[dp.Stream] | None = None return (self.__class__.__name__, str(self.table_path)) def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """Return tag and packet types based on schema and tag columns.""" tag_types: TypeSpec = {} packet_types: TypeSpec = {} for field, field_type in self.python_schema.items(): @@ -305,42 +375,200 @@ def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: packet_types[field] = field_type return tag_types, packet_types - def get_all_records(self, include_system_columns: bool = False) -> Table | None: + def get_all_records(self, include_system_columns: bool = False) -> pa.Table | None: + """Get all records from the delta table.""" if self._delta_table is None: return None + arrow_data = self._delta_table.to_pyarrow_dataset( as_large_types=True ).to_table() + if not include_system_columns: arrow_data = arrow_data.drop( [col for col in arrow_data.column_names if col.startswith("_")] ) return arrow_data + def _normalize_data_to_table( + self, data: "dict | pa.Table | pl.DataFrame | pd.DataFrame" + ) -> pa.Table: + """Convert input data to PyArrow Table with correct schema.""" + if isinstance(data, dict): + return pa.Table.from_pylist([data], schema=self.arrow_schema) + elif isinstance(data, pa.Table): + return data + else: + # Handle polars/pandas DataFrames + if hasattr(data, "to_arrow"): # Polars DataFrame + return data.to_arrow() # type: ignore + elif hasattr(data, "to_pandas"): # Polars to pandas fallback + return pa.Table.from_pandas(data.to_pandas(), schema=self.arrow_schema) # type: ignore + else: # Assume pandas DataFrame + return pa.Table.from_pandas( + cast(pd.DataFrame, data), schema=self.arrow_schema + ) + + def _check_for_duplicates(self, new_data: pa.Table) -> None: + """ + Check if new data contains tag combinations that already exist. + Raises DuplicateTagError if duplicates found. + """ + if self._delta_table is None or not self.tag_columns: + return # No existing data or no tag columns to check + + # Get existing tag combinations + existing_data = self._delta_table.to_pyarrow_dataset( + as_large_types=True + ).to_table() + if len(existing_data) == 0: + return # No existing data + + # Extract tag combinations from existing data + existing_tags = existing_data.select(self.tag_columns) + new_tags = new_data.select(self.tag_columns) + + # Convert to sets of tuples for comparison + existing_tag_tuples = set() + for i in range(len(existing_tags)): + tag_tuple = tuple( + existing_tags.column(col)[i].as_py() for col in self.tag_columns + ) + existing_tag_tuples.add(tag_tuple) + + # Check for duplicates in new data + duplicate_tags = [] + for i in range(len(new_tags)): + tag_tuple = tuple( + new_tags.column(col)[i].as_py() for col in self.tag_columns + ) + if tag_tuple in existing_tag_tuples: + duplicate_tags.append(tag_tuple) + + if duplicate_tags: + tag_names = ", ".join(self.tag_columns) + duplicate_strs = [str(tags) for tags in duplicate_tags] + raise DuplicateTagError( + f"Duplicate tag combinations found for columns [{tag_names}]: " + f"{duplicate_strs}. Use skip_duplicates=True to merge instead." + ) + + def _merge_data(self, new_data: pa.Table) -> None: + """ + Merge new data using Delta Lake merge operation. + Only inserts rows where tag combinations don't already exist. 
+ """ + if self._delta_table is None: + # No existing table, just write the data + write_deltalake( + self.table_path, + new_data, + mode="overwrite", + ) + else: + # Use merge operation - only insert if tag combination doesn't exist + # Build merge condition based on tag columns + # Format: "target.col1 = source.col1 AND target.col2 = source.col2" + merge_conditions = " AND ".join( + f"target.{col} = source.{col}" for col in self.tag_columns + ) + + try: + # Use Delta Lake's merge functionality + ( + self._delta_table.merge( + source=new_data, + predicate=merge_conditions, + source_alias="source", + target_alias="target", + ) + .when_not_matched_insert_all() # Insert when no match found + .execute() + ) + except Exception: + # Fallback: manual duplicate filtering if merge fails + self._manual_merge_fallback(new_data) + + def _manual_merge_fallback(self, new_data: pa.Table) -> None: + """ + Fallback merge implementation that manually filters duplicates. + """ + if self._delta_table is None or not self.tag_columns: + write_deltalake(self.table_path, new_data, mode="append") + return + + # Get existing tag combinations + existing_data = self._delta_table.to_pyarrow_dataset( + as_large_types=True + ).to_table() + existing_tags = existing_data.select(self.tag_columns) + + # Create set of existing tag tuples + existing_tag_tuples = set() + for i in range(len(existing_tags)): + tag_tuple = tuple( + existing_tags.column(col)[i].as_py() for col in self.tag_columns + ) + existing_tag_tuples.add(tag_tuple) + + # Filter new data to only include non-duplicate rows + filtered_rows = [] + new_tags = new_data.select(self.tag_columns) + + for i in range(len(new_data)): + tag_tuple = tuple( + new_tags.column(col)[i].as_py() for col in self.tag_columns + ) + if tag_tuple not in existing_tag_tuples: + # Extract this row + row_dict = {} + for col_name in new_data.column_names: + row_dict[col_name] = new_data.column(col_name)[i].as_py() + filtered_rows.append(row_dict) + + # Only append if there are new rows to add + if filtered_rows: + filtered_table = pa.Table.from_pylist( + filtered_rows, schema=self.arrow_schema + ) + write_deltalake(self.table_path, filtered_table, mode="append") + def insert( self, data: "dict | pa.Table | pl.DataFrame | pd.DataFrame", + skip_duplicates: bool = False, ) -> None: """ Insert data into the delta table. + + Args: + data: Data to insert (dict, PyArrow Table, Polars DataFrame, or Pandas DataFrame) + skip_duplicates: If True, use merge operation to skip duplicate tag combinations. + If False, raise error if duplicate tag combinations are found. + + Raises: + DuplicateTagError: If skip_duplicates=False and duplicate tag combinations are found. 
""" - if isinstance(data, dict): - data = pa.Table.from_pylist([data], schema=self.arrow_schema) - elif isinstance(data, pl.DataFrame): - data = data.to_arrow() - elif isinstance(data, pd.DataFrame): - data = pa.Table.from_pandas(data, schema=self.arrow_schema) + # Normalize data to PyArrow Table + new_data_table = self._normalize_data_to_table(data) - self._set_modified_time() - write_deltalake( - self.table_path, - data, - mode="append", - ) + if skip_duplicates: + # Use merge operation to only insert new tag combinations + self._merge_data(new_data_table) + else: + # Check for duplicates first, raise error if found + self._check_for_duplicates(new_data_table) - # update the delta table + # No duplicates found, safe to append + write_deltalake(self.table_path, new_data_table, mode="append") + + # Update our delta table reference and mark as modified + self._set_modified_time() self._delta_table = DeltaTable(self.table_path) + # Invalidate any cached streams + self.invalidate() + def load_delta_table(self) -> None: """ Try loading the delta table from the file system. @@ -350,14 +578,15 @@ def load_delta_table(self) -> None: delta_table = DeltaTable(self.table_path) except TableNotFoundError: delta_table = None - new_version = self.delta_table_version - if (current_version is None and new_version is not None) or ( - current_version is not None - and new_version is not None - and current_version < new_version - ): - # delta table has been updated - self._set_modified_time() + + if delta_table is not None: + new_version = delta_table.version() + if (current_version is None) or ( + current_version is not None and new_version > current_version + ): + # Delta table has been updated + self._set_modified_time() + self._delta_table = delta_table diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index bb91557..ffb2bcb 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,6 +1,6 @@ from collections.abc import Collection, Iterator from datetime import datetime -from orcapod.data.kernels import WrappedKernel, TrackedKernelBase +from orcapod.data.kernels import KernelStream, WrappedKernel, TrackedKernelBase from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec @@ -34,7 +34,7 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - self._cached_stream: dp.LiveStream | None = None + self._cached_stream: KernelStream | None = None self.input_streams = tuple(input_streams) self.pipeline_store = pipeline_store self.pipeline_path_prefix = pipeline_path_prefix @@ -89,7 +89,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: # super().validate_inputs(*self.input_streams) return super().forward(*self.input_streams) - def __call__(self, *args, **kwargs) -> dp.LiveStream: + def __call__(self, *args, **kwargs) -> KernelStream: if self._cached_stream is None: self._cached_stream = super().__call__(*args, **kwargs) return self._cached_stream diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index ac436b3..2a60ad2 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -6,6 +6,8 @@ if TYPE_CHECKING: import pyarrow as pa + import polars as pl + import pandas as pd class Datagram(Protocol): @@ -1589,7 +1591,47 @@ class Source(Kernel, Stream, Protocol): - May have their own refresh/update mechanisms """ - pass + @property + def tag_keys(self) -> tuple[str, ...]: + """ + Return the 
keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + ... + + @property + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + ... + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all records from the source. + + Args: + include_system_columns: Whether to include system columns in the output + + Returns: + pa.Table | None: A table containing all records, or None if no records are available + """ + ... + + @property + def lazy(self) -> "pl.LazyFrame | None": ... + + @property + def df(self) -> "pl.DataFrame | None": ... + + @property + def polars_df(self) -> "pl.DataFrame | None": ... + + @property + def pandas_df(self) -> "pd.DataFrame | None": ... class Tracker(Protocol): From de93e310ea3c9e14618fd802081767f7a108bc91 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 30 Jul 2025 06:31:52 +0000 Subject: [PATCH 150/224] refactor: introduce source as first class kernel in tracker --- src/orcapod/data/sources.py | 66 ++++++++++---- src/orcapod/data/streams.py | 86 +++++++++--------- src/orcapod/data/trackers.py | 65 ++++++-------- src/orcapod/pipeline/graph.py | 9 ++ src/orcapod/pipeline/nodes.py | 113 ++---------------------- src/orcapod/protocols/data_protocols.py | 34 +++++++ 6 files changed, 165 insertions(+), 208 deletions(-) diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index c9b39c9..32ae6b7 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -9,7 +9,11 @@ from pyarrow.lib import Table from orcapod.data.kernels import TrackedKernelBase -from orcapod.data.streams import ImmutableTableStream, KernelStream +from orcapod.data.streams import ( + ImmutableTableStream, + KernelStream, + OperatorStreamBaseMixin, +) from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec, schemas from orcapod.utils.lazy_module import LazyModule @@ -24,7 +28,7 @@ pa = LazyModule("pyarrow") -class SourceBase(TrackedKernelBase): +class SourceBase(TrackedKernelBase, OperatorStreamBaseMixin): """ Base class for sources that act as both Kernels and LiveStreams. @@ -45,26 +49,27 @@ def __init__(self, **kwargs): # =========================== Kernel Methods =========================== - @abstractmethod - def forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Pure computation: return a static snapshot of the data. + # The following are inherited from TrackedKernelBase as abstract methods. + # @abstractmethod + # def forward(self, *streams: dp.Stream) -> dp.Stream: + # """ + # Pure computation: return a static snapshot of the data. - This is the core method that subclasses must implement. - Each call should return a fresh stream representing the current state of the data. - This is what KernelStream calls when it needs to refresh its data. - """ - ... + # This is the core method that subclasses must implement. + # Each call should return a fresh stream representing the current state of the data. + # This is what KernelStream calls when it needs to refresh its data. + # """ + # ... - @abstractmethod - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - """Return the tag and packet types this source produces.""" - ... 
+ # @abstractmethod + # def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + # """Return the tag and packet types this source produces.""" + # ... - @abstractmethod - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> dp.Any: ... + # @abstractmethod + # def kernel_identity_structure( + # self, streams: Collection[dp.Stream] | None = None + # ) -> dp.Any: ... def validate_inputs(self, *streams: dp.Stream) -> None: """Sources take no input streams.""" @@ -82,6 +87,10 @@ def prepare_output_stream( ) return self._cached_kernel_stream + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_source_invocation(self, label=label) + # ==================== Stream Protocol (Delegation) ==================== @property @@ -153,6 +162,25 @@ def invalidate(self) -> None: # ==================== Source Protocol ==================== + @property + def tag_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + tag_keys, _ = self.keys() + return tag_keys + + @property + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + # TODO: consider caching this + _, packet_keys = self.keys() + return packet_keys + @abstractmethod def get_all_records( self, include_system_columns: bool = False diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 832e2e9..6665a33 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -31,7 +31,49 @@ logger = logging.getLogger(__name__) -class StreamBase(ABC, LabeledContentIdentifiableBase): +class OperatorStreamBaseMixin: + def join(self, other_stream: dp.Stream) -> dp.Stream: + """ + Joins this stream with another stream, returning a new stream that contains + the combined data from both streams. + """ + from orcapod.data.operators import Join + + return Join()(self, other_stream) # type: ignore[return-value] + + def semi_join(self, other_stream: dp.Stream) -> dp.Stream: + """ + Performs a semi-join with another stream, returning a new stream that contains + only the packets from this stream that have matching tags in the other stream. + """ + from orcapod.data.operators import SemiJoin + + return SemiJoin()(self, other_stream) # type: ignore[return-value] + + def map_tags( + self, name_map: Mapping[str, str], drop_unmapped: bool = True + ) -> dp.Stream: + """ + Maps the tags in this stream according to the provided name_map. + If drop_unmapped is True, any tags that are not in the name_map will be dropped. + """ + from orcapod.data.operators import MapTags + + return MapTags(name_map, drop_unmapped)(self) # type: ignore[return-value] + + def map_packets( + self, name_map: Mapping[str, str], drop_unmapped: bool = True + ) -> dp.Stream: + """ + Maps the packets in this stream according to the provided packet_map. + If drop_unmapped is True, any packets that are not in the packet_map will be dropped. + """ + from orcapod.data.operators import MapPackets + + return MapPackets(name_map, drop_unmapped)(self) # type: ignore[return-value] + + +class StreamBase(ABC, OperatorStreamBaseMixin, LabeledContentIdentifiableBase): """ A stream is a collection of tagged-packets that are generated by an operation. 
The stream is iterable and can be used to access the packets in the stream. @@ -182,48 +224,6 @@ def identity_structure(self) -> Any: return self.source.identity_structure(self.upstreams) return super().identity_structure() - # ---------------Built in operators---------------- - - def join(self, other_stream: dp.Stream) -> dp.Stream: - """ - Joins this stream with another stream, returning a new stream that contains - the combined data from both streams. - """ - from orcapod.data.operators import Join - - return Join()(self, other_stream) - - def semi_join(self, other_stream: dp.Stream) -> dp.Stream: - """ - Performs a semi-join with another stream, returning a new stream that contains - only the packets from this stream that have matching tags in the other stream. - """ - from orcapod.data.operators import SemiJoin - - return SemiJoin()(self, other_stream) - - def map_tags( - self, name_map: Mapping[str, str], drop_unmapped: bool = True - ) -> dp.Stream: - """ - Maps the tags in this stream according to the provided name_map. - If drop_unmapped is True, any tags that are not in the name_map will be dropped. - """ - from orcapod.data.operators import MapTags - - return MapTags(name_map, drop_unmapped)(self) - - def map_packets( - self, name_map: Mapping[str, str], drop_unmapped: bool = True - ) -> dp.Stream: - """ - Maps the packets in this stream according to the provided packet_map. - If drop_unmapped is True, any packets that are not in the packet_map will be dropped. - """ - from orcapod.data.operators import MapPackets - - return MapPackets(name_map, drop_unmapped)(self) - class ImmutableTableStream(StreamBase): """ diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index cb89d5b..d7d1597 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -66,6 +66,16 @@ def record_kernel_invocation( for tracker in self.get_active_trackers(): tracker.record_kernel_invocation(kernel, upstreams, label=label) + def record_source_invocation( + self, source: dp.Source, label: str | None = None + ) -> None: + """ + Record the output stream of a source invocation in the tracker. + This is used to track the computational graph and the invocations of sources. + """ + for tracker in self.get_active_trackers(): + tracker.record_source_invocation(source, label=label) + def record_pod_invocation( self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None ) -> None: @@ -109,6 +119,11 @@ def record_kernel_invocation( label: str | None = None, ) -> None: ... + @abstractmethod + def record_source_invocation( + self, source: dp.Source, label: str | None = None + ) -> None: ... 
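For illustration, a minimal tracker that only counts source invocations might look like the sketch below. It is hypothetical and implements just the three record_* hooks visible in this hunk; the tracker interface may require additional members not shown here.

import logging

from orcapod.protocols import data_protocols as dp

logger = logging.getLogger(__name__)


class SourceCountingTracker:
    """Sketch of a tracker that records how often each source is invoked."""

    def __init__(self) -> None:
        self.source_counts: dict[tuple[str, ...], int] = {}

    def record_kernel_invocation(
        self,
        kernel: dp.Kernel,
        upstreams: tuple[dp.Stream, ...],
        label: str | None = None,
    ) -> None:
        pass  # ordinary kernels are ignored in this sketch

    def record_source_invocation(
        self, source: dp.Source, label: str | None = None
    ) -> None:
        key = source.kernel_id
        self.source_counts[key] = self.source_counts.get(key, 0) + 1
        logger.info("source %s invoked (label=%r)", key, label)

    def record_pod_invocation(
        self,
        pod: dp.Pod,
        upstreams: tuple[dp.Stream, ...],
        label: str | None = None,
    ) -> None:
        pass  # pods are ignored in this sketch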
+ @abstractmethod def record_pod_invocation( self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None @@ -234,9 +249,7 @@ def __init__( # This is used to track the computational graph and the invocations of kernels self.kernel_invocations: set[Invocation] = set() self.invocation_to_pod_lut: dict[Invocation, dp.Pod] = {} - self.id_to_invocation_lut: dict[str, Invocation] = {} - self.id_to_label_lut: dict[str, list[str]] = defaultdict(list) - self.id_to_pod_lut: dict[str, dp.Pod] = {} + self.invocation_to_source_lut: dict[Invocation, dp.Source] = {} def _record_kernel_and_get_invocation( self, @@ -260,6 +273,15 @@ def record_kernel_invocation( """ self._record_kernel_and_get_invocation(kernel, upstreams, label) + def record_source_invocation( + self, source: dp.Source, label: str | None = None + ) -> None: + """ + Record the output stream of a source invocation in the tracker. + """ + invocation = self._record_kernel_and_get_invocation(source, (), label) + self.invocation_to_source_lut[invocation] = source + def record_pod_invocation( self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None ) -> None: @@ -289,42 +311,5 @@ def generate_graph(self) -> "nx.DiGraph": G.add_edge(upstream_invocation, invocation) return G - # def generate_namemap(self) -> dict[Invocation, str]: - # namemap = {} - # for kernel, invocations in self.invocation_lut.items(): - # # if only one entry present, use the kernel name alone - # if kernel.label is not None: - # node_label = kernel.label - # else: - # node_label = str(kernel) - # if len(invocations) == 1: - # namemap[invocations[0]] = node_label - # continue - # # if multiple entries, use the kernel name and index - # for idx, invocation in enumerate(invocations): - # namemap[invocation] = f"{node_label}_{idx}" - # return namemap - - # def draw_graph(self): - # import networkx as nx - # import matplotlib.pyplot as plt - - # G = self.generate_graph() - # labels = self.generate_namemap() - - # pos = nx.drawing.nx_agraph.graphviz_layout(G, prog="dot") - # nx.draw( - # G, - # pos, - # labels=labels, - # node_size=2000, - # node_color="lightblue", - # with_labels=True, - # font_size=10, - # font_weight="bold", - # arrowsize=20, - # ) - # plt.tight_layout() - DEFAULT_TRACKER_MANAGER = BasicTrackerManager() diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 72ab320..29fd1bb 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -113,6 +113,15 @@ def wrap_invocation( pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) + elif invocation in self.invocation_to_source_lut: + source = self.invocation_to_source_lut[invocation] + node = KernelNode( + kernel=source, + input_streams=new_input_streams, + pipeline_store=self.pipeline_store, + pipeline_path_prefix=self.pipeline_store_path_prefix, + label=invocation.label, + ) else: node = KernelNode( kernel=invocation.kernel, diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index ffb2bcb..ffbc4d5 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,6 +1,7 @@ from collections.abc import Collection, Iterator from datetime import datetime from orcapod.data.kernels import KernelStream, WrappedKernel, TrackedKernelBase +from orcapod.data.sources import SourceBase from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec @@ -20,7 +21,7 @@ class Node( - TrackedKernelBase, + 
SourceBase, ): """ Mixin class for pipeline nodes @@ -50,25 +51,6 @@ def contained_kernel(self) -> dp.Kernel: "This property should be implemented by subclasses to return the contained kernel." ) - @property - def tag_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. - """ - tag_keys, _ = self.keys() - return tag_keys - - @property - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - # TODO: consider caching this - _, packet_keys = self.keys() - return packet_keys - @property def pipeline_path(self) -> tuple[str, ...]: """ @@ -77,9 +59,6 @@ def pipeline_path(self) -> tuple[str, ...]: """ return self.pipeline_path_prefix + self.kernel_id + (self.invocation_hash,) - def validate_inputs(self, *processed_streams: dp.Stream) -> None: - pass - def forward(self, *streams: dp.Stream) -> dp.Stream: if len(streams) > 0: raise NotImplementedError( @@ -87,62 +66,18 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: ) # TODO: re-evaluate the use here # super().validate_inputs(*self.input_streams) - return super().forward(*self.input_streams) - - def __call__(self, *args, **kwargs) -> KernelStream: - if self._cached_stream is None: - self._cached_stream = super().__call__(*args, **kwargs) - return self._cached_stream + return super().forward(*self.input_streams) # type: ignore[return-value] - # properties and methods to act as a dp.Stream - @property - def source(self) -> dp.Kernel | None: - return self - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - tag_types, packet_types = self.types() - return tuple(tag_types.keys()), tuple(packet_types.keys()) - - def types(self) -> tuple[TypeSpec, TypeSpec]: - return self.contained_kernel.output_types(*self.input_streams) - - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ Return the output types of the node. This is used to determine the types of the output streams. """ return self.contained_kernel.output_types(*self.input_streams) - @property - def last_modified(self) -> datetime | None: - return self().last_modified - - @property - def is_current(self) -> bool: - return self().is_current - - def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - return self().__iter__() - - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - return self().iter_packets() - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_content_hash: bool | str = False, - ) -> "pa.Table": - return self().as_table( - include_data_context=include_data_context, - include_source=include_source, - include_content_hash=include_content_hash, - ) - - def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: - return self().flow() - - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: """ Return the identity structure of the node. This is used to compute the invocation hash. 
@@ -164,40 +99,6 @@ def get_all_records( """ raise NotImplementedError("This method should be implemented by subclasses.") - @property - def lazy(self) -> "pl.LazyFrame | None": - records = self.get_all_records(include_system_columns=False) - if records is not None: - return pl.LazyFrame(records) - return None - - @property - def df(self) -> "pl.DataFrame | None": - """ - Return the DataFrame representation of the pod's records. - """ - lazy_df = self.lazy - if lazy_df is not None: - return lazy_df.collect() - return None - - @property - def polars_df(self) -> "pl.DataFrame | None": - """ - Return the DataFrame representation of the pod's records. - """ - return self.df - - @property - def pandas_df(self) -> "pd.DataFrame | None": - """ - Return the pandas DataFrame representation of the pod's records. - """ - records = self.get_all_records(include_system_columns=False) - if records is not None: - return records.to_pandas() - return None - class KernelNode(Node, WrappedKernel): """ diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 2a60ad2..405b6a0 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1695,6 +1695,23 @@ def record_kernel_invocation( """ ... + def record_source_invocation( + self, source: Source, label: str | None = None + ) -> None: + """ + Record a source invocation in the computational graph. + + This method is called whenever a source is invoked. The tracker + should record: + - The source and its properties + - Timing and performance information + - Any relevant metadata + + Args: + source: The source that was invoked + """ + ... + def record_pod_invocation( self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: @@ -1787,6 +1804,23 @@ def record_kernel_invocation( """ ... + def record_source_invocation( + self, source: Source, label: str | None = None + ) -> None: + """ + Record a source invocation in the computational graph. + + This method is called whenever a source is invoked. The tracker + should record: + - The source and its properties + - Timing and performance information + - Any relevant metadata + + Args: + source: The source that was invoked + """ + ... + def record_pod_invocation( self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None ) -> None: From b1d06313cb43dd63af8a9aee7b4c81fd3f737bd9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 31 Jul 2025 03:37:55 +0000 Subject: [PATCH 151/224] feat: add refined function info extrators --- src/orcapod/hashing/hash_utils.py | 130 ++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 790b49f..904d55e 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -362,3 +362,133 @@ def get_function_signature( if "returns" in parts: fn_string = fn_string + f"-> {str(parts['returns'])}" return fn_string + + +def _is_in_string(line, pos): + """Helper to check if a position in a line is inside a string literal.""" + # This is a simplified check - would need proper parsing for robust handling + in_single = False + in_double = False + for i in range(pos): + if line[i] == "'" and not in_double and (i == 0 or line[i - 1] != "\\"): + in_single = not in_single + elif line[i] == '"' and not in_single and (i == 0 or line[i - 1] != "\\"): + in_double = not in_double + return in_single or in_double + + +def get_function_components( + func: Callable, + name_override: str | None = None, + include_name: bool = True, + include_module: bool = True, + include_declaration: bool = True, + include_docstring: bool = True, + include_comments: bool = True, + preserve_whitespace: bool = True, + include_annotations: bool = True, + include_code_properties: bool = True, +) -> list: + """ + Extract the components of a function that determine its identity for hashing. + + Args: + func: The function to process + include_name: Whether to include the function name + include_module: Whether to include the module name + include_declaration: Whether to include the function declaration line + include_docstring: Whether to include the function's docstring + include_comments: Whether to include comments in the function body + preserve_whitespace: Whether to preserve original whitespace/indentation + include_annotations: Whether to include function type annotations + include_code_properties: Whether to include code object properties + + Returns: + A list of string components + """ + components = [] + + # Add function name + if include_name: + components.append(f"name:{name_override or func.__name__}") + + # Add module + if include_module and hasattr(func, "__module__"): + components.append(f"module:{func.__module__}") + + # Get the function's source code + try: + source = inspect.getsource(func) + + # Handle whitespace preservation + if not preserve_whitespace: + source = inspect.cleandoc(source) + + # Process source code components + if not include_declaration: + # Remove function declaration line + lines = source.split("\n") + for i, line in enumerate(lines): + if line.strip().startswith("def "): + lines.pop(i) + break + source = "\n".join(lines) + + # Extract and handle docstring separately if needed + if not include_docstring and func.__doc__: + # This approach assumes the docstring is properly indented + # For multi-line docstrings, we need more sophisticated parsing + doc_str = inspect.getdoc(func) + if doc_str: + doc_lines = doc_str.split("\n") + else: + doc_lines = [] + doc_pattern = '"""' + "\\n".join(doc_lines) + '"""' + # Try different quote styles + if doc_pattern not in source: + doc_pattern = "'''" + "\\n".join(doc_lines) + "'''" + source = source.replace(doc_pattern, "") + + # Handle comments (this is more complex and may need a proper parser) + if not include_comments: + # This is a simplified approach - would need a proper parser for robust handling + lines = 
source.split("\n") + for i, line in enumerate(lines): + comment_pos = line.find("#") + if comment_pos >= 0 and not _is_in_string(line, comment_pos): + lines[i] = line[:comment_pos].rstrip() + source = "\n".join(lines) + + components.append(f"source:{source}") + + except (IOError, TypeError): + # If source can't be retrieved, fall back to signature + components.append(f"name:{name_override or func.__name__}") + try: + sig = inspect.signature(func) + components.append(f"signature:{str(sig)}") + except ValueError: + components.append("builtin:True") + + # Add function annotations if requested + if ( + include_annotations + and hasattr(func, "__annotations__") + and func.__annotations__ + ): + sorted_annotations = sorted(func.__annotations__.items()) + annotations_str = ";".join(f"{k}:{v}" for k, v in sorted_annotations) + components.append(f"annotations:{annotations_str}") + + # Add code object properties if requested + if include_code_properties: + code = func.__code__ + stable_code_props = { + "co_argcount": code.co_argcount, + "co_kwonlyargcount": getattr(code, "co_kwonlyargcount", 0), + "co_nlocals": code.co_nlocals, + "co_varnames": code.co_varnames[: code.co_argcount], + } + components.append(f"code_properties:{stable_code_props}") + + return components From 8031bf7867c33e5cf40a7bb62fb5150d44ba8cc2 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 31 Jul 2025 03:50:19 +0000 Subject: [PATCH 152/224] refactor: add new caching and pod streaming implementation --- src/orcapod/data/context.py | 1 - src/orcapod/data/pods.py | 267 ++++++++++++++++++++- src/orcapod/data/sources.py | 105 +++++--- src/orcapod/data/streams.py | 203 ++++++++++++++++ src/orcapod/data/system_constants.py | 29 ++- src/orcapod/errors.py | 6 + src/orcapod/pipeline/nodes.py | 34 +-- src/orcapod/protocols/data_protocols.py | 76 ++++++ src/orcapod/protocols/hashing_protocols.py | 2 + src/orcapod/protocols/store_protocols.py | 12 +- src/orcapod/stores/delta_lake_stores.py | 67 +++++- src/orcapod/types/arrow_utils.py | 123 ---------- 12 files changed, 730 insertions(+), 195 deletions(-) delete mode 100644 src/orcapod/types/arrow_utils.py diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py index 20bc43a..9e402ab 100644 --- a/src/orcapod/data/context.py +++ b/src/orcapod/data/context.py @@ -47,7 +47,6 @@ def resolve_context(self, context_info: str | DataContext | None) -> DataContext raise ValueError(f"DataContext with key {context_info} not found.") - default_data_context = DataContext( "std:v0.1.0:default", default_registry, diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 087c2f7..e777d93 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -12,7 +12,7 @@ ) from orcapod.data.kernels import KernelStream, TrackedKernelBase from orcapod.data.operators import Join -from orcapod.data.streams import LazyPodResultStream +from orcapod.data.streams import LazyPodResultStream, EfficientPodResultStream from orcapod.data.system_constants import orcapod_constants as constants from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp @@ -23,11 +23,14 @@ from orcapod.types.schemas import PythonSchema from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils.lazy_module import LazyModule +from orcapod.hashing.hash_utils import get_function_signature, get_function_components if TYPE_CHECKING: import pyarrow as pa + import pyarrow.compute as pc else: pa = LazyModule("pyarrow") + pc = 
LazyModule("pyarrow.compute") logger = logging.getLogger(__name__) @@ -54,6 +57,21 @@ def output_packet_types(self) -> TypeSpec: """ ... + @abstractmethod + def get_record_id(self, packet: dp.Packet) -> str: + """ + Return the record ID for the input packet. This is used to identify the pod in the system. + """ + ... + + @property + @abstractmethod + def tiered_pod_id(self) -> dict[str, str]: + """ + Return the tiered pod ID for the pod. This is used to identify the pod in a tiered architecture. + """ + ... + def __init__( self, error_handling: error_handling_options = "raise", @@ -147,6 +165,7 @@ def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> Non def function_pod( output_keys: str | Collection[str] | None = None, function_name: str | None = None, + version: str = "v0.0", label: str | None = None, **kwargs, ) -> Callable[..., "FunctionPod"]: @@ -186,6 +205,7 @@ def decorator(func) -> FunctionPod: function=func, output_keys=output_keys, function_name=function_name or base_function_name, + version=version, label=label, **kwargs, ) @@ -200,6 +220,7 @@ def __init__( function: dp.PodFunction, output_keys: str | Collection[str] | None = None, function_name=None, + version: str = "v0.0", input_typespec: TypeSpec | None = None, output_typespec: TypeSpec | Sequence[type] | None = None, label: str | None = None, @@ -221,6 +242,7 @@ def __init__( "function_name must be provided if function has no __name__ attribute" ) self.function_name = function_name + self.version = version super().__init__(label=label or self.function_name, **kwargs) # extract input and output types from the function signature @@ -238,15 +260,30 @@ def __init__( ) ) self._function_info_extractor = function_info_extractor - - # now compute hash for the self and store that info - self._pod_hash = self.data_context.object_hasher.hash_to_hex( - self, prefix_hasher_id=True + object_hasher = self.data_context.object_hasher + self._function_signature_hash = object_hasher.hash_to_hex( + get_function_signature(self.function), prefix_hasher_id=True + ) + self._function_content_hash = object_hasher.hash_to_hex( + get_function_components(self.function), prefix_hasher_id=True ) + @property + def tiered_pod_id(self) -> dict[str, str]: + return { + "signature": self._function_signature_hash, + "content": self._function_content_hash, + } + @property def kernel_id(self) -> tuple[str, ...]: - return (self.function_name, self._pod_hash) + return (self.function_name, self.version) + + def get_record_id(self, packet: dp.Packet) -> str: + content = (packet.content_hash(), self.tiered_pod_id) + return self.data_context.object_hasher.hash_to_hex( + content, prefix_hasher_id=True + ) def input_packet_types(self) -> PythonSchema: """ @@ -303,9 +340,9 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) output_data = {k: v for k, v in zip(self.output_keys, output_values)} + record_id = self.get_record_id(packet) source_info = { - k: ":".join(self.kernel_id + (packet.content_hash(), k)) - for k in output_data + k: ":".join(self.kernel_id + (record_id, k)) for k in output_data } output_packet = DictPacket( @@ -381,6 +418,16 @@ def kernel_id(self) -> tuple[str, ...]: """ return self.pod.kernel_id + def get_record_id(self, packet: dp.Packet) -> str: + return self.pod.get_record_id(packet) + + @property + def tiered_pod_id(self) -> dict[str, str]: + """ + Return the tiered pod ID for the wrapped pod. This is used to identify the pod in a tiered architecture. 
+ """ + return self.pod.tiered_pod_id + def computed_label(self) -> str | None: return self.pod.label @@ -402,8 +449,7 @@ def validate_inputs(self, *streams: dp.Stream) -> None: self.pod.validate_inputs(*streams) def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: - output_tag, output_packet = self.pod.call(tag, packet) - return output_tag, output_packet + return self.pod.call(tag, packet) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -417,7 +463,7 @@ def __str__(self) -> str: return f"WrappedPod:{self.pod!s}" -class CachedPod(WrappedPod): +class CachedPod2(WrappedPod): """ A pod that caches the results of the wrapped pod. This is useful for pods that are expensive to compute and can benefit from caching. @@ -439,10 +485,18 @@ def __init__( self.result_store = result_store # unset data_context native to the object - self.pod_hash = self.data_context.object_hasher.hash_to_hex( + self._pod_hash = self.data_context.object_hasher.hash_to_hex( self.pod, prefix_hasher_id=True ) + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Return the pod ID, which is the function name of the wrapped pod. + This is used to identify the pod in the system. + """ + return self.pod.kernel_id + (self._pod_hash,) + @property def record_path(self) -> tuple[str, ...]: """ @@ -510,3 +564,192 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non result_table, meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, ) + + +class CachedPod(WrappedPod): + """ + A pod that caches the results of the wrapped pod. + This is useful for pods that are expensive to compute and can benefit from caching. + """ + + # name of the column in the tag store that contains the packet hash + PACKET_HASH_COLUMN = f"{constants.META_PREFIX}packet_hash" + DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" + + def __init__( + self, + pod: dp.Pod, + result_store: ArrowDataStore, + record_path_prefix: tuple[str, ...] = (), + match_tier: str | None = None, + retrieval_mode: Literal["latest", "most_specific"] = "latest", + **kwargs, + ): + super().__init__(pod, **kwargs) + self.record_path_prefix = record_path_prefix + self.result_store = result_store + self.match_tier = match_tier + self.retrieval_mode = retrieval_mode + + @property + def record_path(self) -> tuple[str, ...]: + """ + Return the path to the record in the result store. + This is used to store the results of the pod. + """ + return self.record_path_prefix + self.kernel_id + + def call( + self, + tag: dp.Tag, + packet: dp.Packet, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[dp.Tag, dp.Packet | None]: + # TODO: consider logic for overwriting existing records + output_packet = None + if not skip_cache_lookup: + output_packet = self.get_recorded_output_packet(packet) + if output_packet is None: + tag, output_packet = super().call(tag, packet) + if output_packet is not None and not skip_cache_insert: + self.record_packet(packet, output_packet) + + return tag, output_packet + + def forward(self, *streams: dp.Stream) -> dp.Stream: + assert len(streams) == 1, "PodBase.forward expects exactly one input stream" + return EfficientPodResultStream(pod=self, input_stream=streams[0]) + + def record_packet( + self, + input_packet: dp.Packet, + output_packet: dp.Packet, + skip_duplicates: bool = False, + ) -> dp.Packet: + """ + Record the output packet against the input packet in the result store. 
+ """ + data_table = output_packet.as_table(include_context=True, include_source=True) + + for i, (k, v) in enumerate(self.tiered_pod_id.items()): + # add the tiered pod ID to the data table + data_table = data_table.add_column( + i, + f"{constants.POD_ID_PREFIX}{k}", + pa.array([v], type=pa.large_string()), + ) + + # add the input packet hash as a column + data_table = data_table.add_column( + 0, + constants.INPUT_PACKET_HASH, + pa.array([input_packet.content_hash()], type=pa.large_string()), + ) + + result_flag = self.result_store.add_record( + self.record_path, + self.pod.get_record_id(input_packet), + data_table, + skip_duplicates=skip_duplicates, + ) + # if result_flag is None: + # # TODO: do more specific error handling + # raise ValueError( + # f"Failed to record packet {input_packet} in result store {self.result_store}" + # ) + # # TODO: make store return retrieved table + return output_packet + + def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | None: + """ + Retrieve the output packet from the result store based on the input packet. + If more than one output packet is found, conflict resolution strategy + will be applied. + If the output packet is not found, return None. + """ + # result_table = self.result_store.get_record_by_id( + # self.record_path, + # self.get_entry_hash(input_packet), + # ) + + # get all records with matching the input packet hash + # TODO: add match based on match_tier if specified + constraints = {constants.INPUT_PACKET_HASH: input_packet.content_hash()} + if self.match_tier is not None: + constraints[f"{constants.POD_ID_PREFIX}{self.match_tier}"] = ( + self.pod.tiered_pod_id[self.match_tier] + ) + + result_table = self.result_store.get_records_with_column_value( + self.record_path, + constraints, + ) + if result_table is None or result_table.num_rows == 0: + return None + + if result_table.num_rows > 1: + logger.info( + f"Performing conflict resolution for multiple records for {input_packet.content_hash()}" + ) + if self.retrieval_mode == "latest": + result_table = result_table.sort_by( + self.DATA_RETRIEVED_FLAG, ascending=False + ).take([0]) + elif self.retrieval_mode == "most_specific": + # match by the most specific pod ID + # trying next level if not found + for k, v in reversed(self.tiered_pod_id.items()): + search_result = result_table.filter( + pc.field(f"{constants.POD_ID_PREFIX}{k}") == v + ) + if search_result.num_rows > 0: + result_table = search_result.take([0]) + break + if result_table.num_rows > 1: + logger.warning( + f"No matching record found for {input_packet.content_hash()} with tiered pod ID {self.tiered_pod_id}" + ) + result_table = result_table.sort_by( + self.DATA_RETRIEVED_FLAG, ascending=False + ).take([0]) + + else: + raise ValueError( + f"Unknown retrieval mode: {self.retrieval_mode}. Supported modes are 'latest' and 'most_specific'." + ) + + pod_id_columns = [ + f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() + ] + result_table = result_table.drop_columns(pod_id_columns) + result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + + # note that data context will be loaded from the result store + return ArrowPacket( + result_table, + meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, + ) + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Get all records from the result store for this pod. + If include_system_columns is True, include system columns in the result. 
+ """ + result_table = self.result_store.get_all_records( + self.record_path, record_id_column=constants.INPUT_PACKET_HASH + ) + if result_table is None or result_table.num_rows == 0: + return None + + if not include_system_columns: + # remove input packet hash and tiered pod ID columns + pod_id_columns = [ + f"{constants.POD_ID_PREFIX}{k}" for k in self.tiered_pod_id.keys() + ] + result_table = result_table.drop_columns(pod_id_columns) + result_table = result_table.drop_columns(constants.INPUT_PACKET_HASH) + + return result_table diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index 32ae6b7..ec7fa0c 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -1,6 +1,5 @@ from abc import abstractmethod from collections.abc import Collection, Iterator -from datetime import datetime from pathlib import Path from typing import TYPE_CHECKING, Any, cast @@ -9,14 +8,18 @@ from pyarrow.lib import Table from orcapod.data.kernels import TrackedKernelBase +from orcapod.data.pods import PythonSchema from orcapod.data.streams import ( ImmutableTableStream, KernelStream, OperatorStreamBaseMixin, ) +from orcapod.errors import DuplicateTagError from orcapod.protocols import data_protocols as dp -from orcapod.types import TypeSpec, schemas +from orcapod.types import DataValue, TypeSpec, schemas, typespec_utils +from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule +from orcapod.data.system_constants import orcapod_constants as constants if TYPE_CHECKING: import pandas as pd @@ -282,12 +285,6 @@ def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: return sample_stream.types() -class DuplicateTagError(ValueError): - """Raised when duplicate tag values are found and skip_duplicates=False""" - - pass - - class ManualDeltaTableSource(SourceBase): """ A source that allows manual delta updates to a table. @@ -618,31 +615,79 @@ def load_delta_table(self) -> None: self._delta_table = delta_table -class GlobSource(SourceBase): - """ - A source that reads files from the file system using a glob pattern. - It generates its own stream from the file system. 
- """ +class DictSource(SourceBase): + """Construct source from a collection of dictionaries""" def __init__( self, - name: str, - file_path: str | Path, - glob_pattern: str, + tags: Collection[dict[str, DataValue]], + packets: Collection[dict[str, DataValue]], + tag_typespec: TypeSpec | None = None, + packet_typespec: TypeSpec | None = None, **kwargs, ): - super().__init__(name=name, **kwargs) - self.file_path = Path(file_path) - self.glob_pattern = glob_pattern - - @staticmethod - def default_tag_function(file_path: Path) -> dict: - return {"file_path": str(file_path)} - - def kernel_identity_structure(self, streams: Collection[dp.Stream] | None = None): - hash_function_kwargs = { - "include_declaration": True, - "include_source": True, - "include_content_hash": True, - "include_data_context": True, + super().__init__(**kwargs) + self.tags = list(tags) + self.packets = list(packets) + if len(self.tags) != len(self.packets) or len(self.tags) == 0: + raise ValueError( + "Tags and packets must be non-empty collections of equal length" + ) + self.tag_typespec = tag_typespec or typespec_utils.get_typespec_from_dict( + self.tags[0] + ) + self.packet_typespec = packet_typespec or typespec_utils.get_typespec_from_dict( + self.packets[0] + ) + source_info = ":".join(self.kernel_id) + self.source_info = { + f"{constants.SOURCE_PREFIX}{k}": f"{source_info}:{k}" + for k in self.tag_typespec } + + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return ( + self.__class__.__name__, + tuple(self.tag_typespec.items()), + tuple(self.packet_typespec.items()), + ) + + def get_all_records(self, include_system_columns: bool = False) -> Table | None: + return self().as_table(include_source=include_system_columns) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Load data from file and return a static stream. + + This is called by forward() and creates a fresh snapshot each time. + """ + tag_arrow_schema = schemas.PythonSchema(self.tag_typespec).to_arrow_schema( + self.data_context.semantic_type_registry + ) + packet_arrow_schema = schemas.PythonSchema( + self.packet_typespec + ).to_arrow_schema(self.data_context.semantic_type_registry) + + joined_data = [ + {**tag, **packet} for tag, packet in zip(self.tags, self.packets) + ] + + table = pa.Table.from_pylist( + joined_data, + schema=arrow_utils.join_arrow_schemas( + tag_arrow_schema, packet_arrow_schema + ), + ) + + return ImmutableTableStream( + table=table, + tag_columns=self.tag_keys, + source=self, + upstreams=(), + ) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """Return tag and packet types based on provided typespecs.""" + return self.tag_typespec, self.packet_typespec diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 6665a33..40e9878 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -22,8 +22,10 @@ if TYPE_CHECKING: import pyarrow as pa + import pyarrow.compute as pc else: pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") # TODO: consider using this instead of making copy of dicts # from types import MappingProxyType @@ -678,6 +680,207 @@ def as_table( return output_table +class EfficientPodResultStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. + This is what Pod.process() returns - it's static/fixed but efficient. 
+ """ + + # TODO: define interface for storage or pod storage + def __init__(self, pod: dp.CachedPod, input_stream: dp.Stream, **kwargs): + super().__init__(source=pod, upstreams=(input_stream,), **kwargs) + self.pod = pod + self.input_stream = input_stream + self._set_modified_time() # set modified time to when we obtain the iterator + # capture the immutable iterator from the input stream + + self._prepared_stream_iterator = input_stream.iter_packets() + + # Packet-level caching (from your PodStream) + self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet | None]] = {} + self._cached_output_table: pa.Table | None = None + + def process_inputs(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """ + Processes the input stream and prepares the output stream. + This is typically called before iterating over the packets. + """ + # identify all entries in the input stream for which we still don't have computed packets + target_entries = self.input_stream.as_table( + include_content_hash=constants.INPUT_PACKET_HASH + ) + existing_entries = self.pod.get_all_records(include_system_columns=True) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + # missing = target_entries.join( + # existing_entries, + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left anti", + # ) + # Single join that gives you both missing and existing + # More efficient - only bring the key column from existing_entries + # .select([constants.INPUT_PACKET_HASH]).append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + all_results = target_entries.join( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ), + keys=[constants.INPUT_PACKET_HASH], + join_type="left outer", + right_suffix="_right", + ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = all_results.filter(pc.is_valid(pc.field("_exists"))).drop( + target_entries.column_names + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = ImmutableTableStream(existing, tag_columns=tag_keys) + yield from existing_stream.iter_packets() + + if missing is not None and missing.num_rows > 0: + for tag, packet in ImmutableTableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + tag, packet = self.pod.call(tag, packet, skip_cache_lookup=True) + if packet is not None: + yield tag, packet + + self._set_modified_time() + + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + if self._prepared_stream_iterator is not None: + for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + if i in self._cached_output_packets: + # Use cached result + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + else: + # Process packet + processed = self.pod.call(tag, packet) + if processed is not None: + # Update shared cache for future iterators (optimization) + self._cached_output_packets[i] = processed + tag, packet = processed + if packet is not None: + yield tag, 
packet + + # Mark completion by releasing the iterator + self._prepared_stream_iterator = None + else: + # Yield from snapshot of complete cache + for i in range(len(self._cached_output_packets)): + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + + tag_keys, _ = self.input_stream.keys() + packet_keys = tuple(self.pod.output_packet_types().keys()) + return tag_keys, packet_keys + + def types(self) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, _ = self.input_stream.types() + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_packet_types()) + return tag_typespec, packet_typespec + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> pa.Table: + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets(): + if tag_schema is None: + tag_schema = tag.arrow_schema() + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_context=True, + include_source=True, + ) + all_tags.append(tag.as_dict()) + # FIXME: using in the pinch conversion to str from path + # replace with an appropriate semantic converter-based approach! + dict_patcket = packet.as_dict(include_context=True, include_source=True) + for k, v in dict_patcket.items(): + if isinstance(v, Path): + dict_patcket[k] = str(v) + all_packets.append(dict_patcket) + + # FIXME: this skips the semantic version conversion and thus is not + # fully correct! + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema + ) + all_packets_as_tables: pa.Table = pa.Table.from_pylist( + all_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." + ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + + output_table = self._cached_output_table.drop(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." 
+ ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + return output_table + + class WrappedStream(StreamBase): def __init__( self, diff --git a/src/orcapod/data/system_constants.py b/src/orcapod/data/system_constants.py index de1bebc..325bf83 100644 --- a/src/orcapod/data/system_constants.py +++ b/src/orcapod/data/system_constants.py @@ -1,8 +1,11 @@ # Constants used for source info keys SYSTEM_COLUMN_PREFIX = "__" -SOURCE_INFO_PREFIX = "_source_" - -DATA_CONTEXT_KEY = "_context_key" +DATAGRAM_PREFIX = "_" +SOURCE_INFO_PREFIX = "source_" +POD_ID_PREFIX = "pod_id_" +DATA_CONTEXT_KEY = "context_key" +INPUT_PACKET_HASH = "input_packet_hash" +PACKET_RECORD_ID = "packet_id" class SystemConstant: @@ -13,13 +16,29 @@ def __init__(self, global_prefix: str = ""): def META_PREFIX(self) -> str: return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}" + @property + def DATAGRAM_PREFIX(self) -> str: + return f"{self._global_prefix}{DATAGRAM_PREFIX}" + @property def SOURCE_PREFIX(self) -> str: - return f"{self._global_prefix}{SOURCE_INFO_PREFIX}" + return f"{self._global_prefix}{DATAGRAM_PREFIX}{SOURCE_INFO_PREFIX}" @property def CONTEXT_KEY(self) -> str: - return f"{self._global_prefix}{DATA_CONTEXT_KEY}" + return f"{self._global_prefix}{DATAGRAM_PREFIX}{DATA_CONTEXT_KEY}" + + @property + def POD_ID_PREFIX(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{POD_ID_PREFIX}" + + @property + def INPUT_PACKET_HASH(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{INPUT_PACKET_HASH}" + + @property + def PACKET_RECORD_ID(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{PACKET_RECORD_ID}" orcapod_constants = SystemConstant() diff --git a/src/orcapod/errors.py b/src/orcapod/errors.py index b1566cd..3775ee9 100644 --- a/src/orcapod/errors.py +++ b/src/orcapod/errors.py @@ -3,3 +3,9 @@ class InputValidationError(Exception): Exception raised when the inputs are not valid. This is used to indicate that the inputs do not meet the requirements of the operator. 
""" + + +class DuplicateTagError(ValueError): + """Raised when duplicate tag values are found and skip_duplicates=False""" + + pass diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index ffbc4d5..e7a8727 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,5 +1,3 @@ -from collections.abc import Collection, Iterator -from datetime import datetime from orcapod.data.kernels import KernelStream, WrappedKernel, TrackedKernelBase from orcapod.data.sources import SourceBase from orcapod.data.pods import ArrowDataStore, CachedPod @@ -272,15 +270,16 @@ def call( self, tag: dp.Tag, packet: dp.Packet, - skip_record_check: bool = False, - skip_recording: bool = False, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: tag, output_packet = super().call( tag, packet, - skip_record_check=skip_record_check, - skip_recording=skip_recording, + skip_cache_lookup=skip_cache_lookup, + skip_cache_insert=skip_cache_insert, ) + if output_packet is not None: retrieved = ( output_packet.get_meta_value(self.DATA_RETRIEVED_FLAG) is not None @@ -294,9 +293,10 @@ def add_pipeline_record( ) -> None: # combine dp.Tag with packet content hash to compute entry hash tag_with_hash = tag.as_table().append_column( - self.PACKET_HASH_COLUMN, + constants.INPUT_PACKET_HASH, pa.array([input_packet.content_hash()], type=pa.large_string()), ) + entry_id = self.data_context.arrow_hasher.hash_table( tag_with_hash, prefix_hasher_id=True ) @@ -307,15 +307,19 @@ def add_pipeline_record( ) if existing_record is not None: - # if the record already exists, return it + # if the record already exists, then skip return - # no record matching, so construct the full record - input_packet_info = ( input_packet.as_table( include_source=True, ) + .append_column( + constants.PACKET_RECORD_ID, + pa.array( + [self.pod.get_record_id(input_packet)], type=pa.large_string() + ), + ) .append_column( f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", pa.array([input_packet.data_context_key], type=pa.large_string()), @@ -327,7 +331,7 @@ def add_pipeline_record( .drop(input_packet.keys()) ) - combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) + combined_record = arrow_utils.hstack_tables(tag.as_table(), input_packet_info) self.pipeline_store.add_record( self.pipeline_path, @@ -340,7 +344,7 @@ def get_all_records( self, include_system_columns: bool = False ) -> "pa.Table | None": results = self.result_store.get_all_records( - self.record_path, record_id_column=self.PACKET_HASH_COLUMN + self.record_path, record_id_column=constants.PACKET_RECORD_ID ) if self.pipeline_store is None: @@ -354,10 +358,9 @@ def get_all_records( if results is None or taginfo is None: return None - # TODO: do not hardcode the join keys joined_info = taginfo.join( results, - self.PACKET_HASH_COLUMN, + constants.PACKET_RECORD_ID, join_type="inner", ) @@ -366,8 +369,7 @@ def get_all_records( c for c in joined_info.column_names if c.startswith(constants.META_PREFIX) - or c.startswith(constants.CONTEXT_KEY) - or c.startswith(constants.SOURCE_PREFIX) + or c.startswith(constants.DATAGRAM_PREFIX) ] joined_info = joined_info.drop(system_columns) return joined_info diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 405b6a0..8a6f330 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1494,6 +1494,22 @@ class Pod(Kernel, Protocol): and 
fine-grained caching. """ + def get_record_id(self, packet: Packet) -> str: ... + + @property + def tiered_pod_id(self) -> dict[str, str]: + """ + Return a dictionary representation of the tiered pod's unique identifier. + The key is supposed to be ordered from least to most specific, allowing + for hierarchical identification of the pod. + + This is primarily used for tiered memoization/caching strategies. + + Returns: + dict[str, str]: Dictionary representation of the pod's ID + """ + ... + def input_packet_types(self) -> TypeSpec: """ TypeSpec for input packets that this Pod can process. @@ -1564,6 +1580,66 @@ def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: ... +class CachedPod(Pod, Protocol): + def call( + self, + tag: Tag, + packet: Packet, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[Tag, Packet | None]: + """ + Process a single packet with its associated tag. + + This is the core method that defines the Pod's computational behavior. + It processes one (tag, packet) pair at a time, enabling: + - Fine-grained caching at the packet level + - Parallelization opportunities + - Just-in-time evaluation + - Filtering operations (by returning None) + + The method signature supports: + - Tag transformation (modify metadata) + - Packet transformation (modify content) + - Filtering (return None to exclude packet) + - Pass-through (return inputs unchanged) + + Args: + tag: Metadata associated with the packet + packet: The data payload to process + + Returns: + tuple[Tag, Packet | None]: + - Tag: Output tag (may be modified from input) + - Packet: Processed packet, or None to filter it out + + Raises: + TypeError: If packet doesn't match input_packet_types + ValueError: If packet data is invalid for processing + """ + ... + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all records processed by this Pod. + + This method returns a table containing all packets processed by the Pod, + including metadata and system columns if requested. It is useful for: + - Debugging and analysis + - Auditing and data lineage tracking + - Performance monitoring + + Args: + include_system_columns: Whether to include system columns in the output + + Returns: + pa.Table | None: A table containing all processed records, or None if no records are available + """ + ... + + class Source(Kernel, Stream, Protocol): """ Entry point for data into the computational graph. diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 16c96cd..677fa5c 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -123,6 +123,8 @@ def extract_function_info( function_name: str | None = None, input_typespec: TypeSpec | None = None, output_typespec: TypeSpec | None = None, + exclude_function_signature: bool = False, + exclude_function_body: bool = False, ) -> dict[str, Any]: ... diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index 1f0fd16..2f11b53 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -1,5 +1,5 @@ -from typing import Protocol, TYPE_CHECKING -from collections.abc import Collection +from typing import Any, Protocol, TYPE_CHECKING +from collections.abc import Collection, Mapping if TYPE_CHECKING: import pyarrow as pa @@ -48,6 +48,14 @@ def get_records_by_ids( flush: bool = False, ) -> "pa.Table | None": ... 
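The protocol method added just below is the store-side hook for the new cache lookup. A hedged usage sketch follows, mirroring how CachedPod.get_recorded_output_packet calls it earlier in this patch; the record path, column names, and the store object are illustrative stand-ins (real callers build the column names from orcapod's system constants).

from typing import Any

import pyarrow as pa


def lookup_cached_result(
    result_store: Any,
    record_path: tuple[str, ...],
    input_packet_hash: str,
) -> "pa.Table | None":
    # Constrain on the input-packet hash column; more (column, value) pairs can be
    # added to narrow the match, e.g. a pod-id tier column.
    constraints = {"__input_packet_hash": input_packet_hash}
    records = result_store.get_records_with_column_value(record_path, constraints)
    if records is None or records.num_rows == 0:
        return None
    return records  # caller applies a conflict-resolution policy (e.g. keep the latest record)
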
+ def get_records_with_column_value( + self, + record_path: tuple[str, ...], + column_name_value: Collection[tuple[str, Any]] | Mapping[str, Any], + record_id_column: str | None = None, + flush: bool = False, + ) -> "pa.Table | None": ... + def flush(self) -> None: """Flush any buffered writes to the underlying storage.""" ... diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 12a528c..e897367 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -5,7 +5,7 @@ from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError from collections import defaultdict -from collections.abc import Collection +from collections.abc import Collection, Mapping from pyarrow import Table from orcapod.data import constants @@ -554,7 +554,9 @@ def get_all_records( # Add Delta table data if (delta_table := self._get_delta_table(record_path)) is not None: try: - delta_table_data = delta_table.to_pyarrow_table() + delta_table_data = delta_table.to_pyarrow_dataset( + as_large_types=True + ).to_table() if delta_table_data.num_rows > 0: tables_to_combine.append(delta_table_data) except Exception as e: @@ -579,6 +581,56 @@ def get_all_records( # Handle record_id_column if specified return self._handle_record_id_column(table_to_return, record_id_column) + def get_records_with_column_value( + self, + record_path: tuple[str, ...], + column_values: Collection[tuple[str, Any]] | Mapping[str, Any], + record_id_column: str | None = None, + flush: bool = False, + ): + if flush: + self.flush_batch(record_path) + # check if record_id is found in pending batches + record_key = self._get_record_key(record_path) + pending_batch = self._pending_batches.get(record_key) + + if isinstance(column_values, Mapping): + # Convert Mapping to list of tuples + pair_list = list(column_values.items()) + elif isinstance(column_values, Collection): + # Ensure it's a list of tuples + pair_list = cast(list[tuple[str, Any]], list(column_values)) + + expressions = [pc.field(c) == v for c, v in pair_list] + combined_expression = expressions[0] + for next_expression in expressions[1:]: + combined_expression = combined_expression & next_expression + + if pending_batch is not None: + filtered_table = pending_batch.filter(combined_expression) + return self._handle_record_id_column(filtered_table, record_id_column) + + # Now check the Delta table + delta_table = self._get_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read + result = self._read_delta_table(delta_table, expression=combined_expression) + + if len(result) == 0: + return None + + # Handle (remove/rename) the record id column before returning + return self._handle_record_id_column(result, record_id_column) + + except Exception as e: + logger.error( + f"Error getting record with {column_values} from {'/'.join(record_path)}: {e}" + ) + raise e + def get_record_by_id( self, record_path: tuple[str, ...], @@ -700,6 +752,7 @@ def _read_delta_table( self, delta_table: DeltaTable, filters: list | None = None, + expression: "pc.Expression | None" = None, ) -> "pa.Table": """ Read table using to_pyarrow_dataset with original schema preservation. 
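A self-contained sketch of the filter construction used above: each (column, value) constraint becomes a pyarrow.compute expression, the expressions are AND-ed together, and the combined expression is either applied to the pending in-memory batch or pushed down to the dataset scan (as _read_delta_table now does via its new expression parameter). The table contents here are made up for illustration.

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

table = pa.table({"__input_packet_hash": ["a", "b", "a"], "value": [1, 2, 3]})
constraints = {"__input_packet_hash": "a", "value": 3}

expressions = [pc.field(col) == val for col, val in constraints.items()]
combined = expressions[0]
for expr in expressions[1:]:
    combined = combined & expr

# In-memory path (pending batch): Table.filter accepts a compute expression.
filtered = table.filter(combined)

# Dataset path: the same expression can be pushed down to a scan.
scanned = ds.dataset(table).to_table(filter=combined)
assert filtered.num_rows == scanned.num_rows == 1
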
@@ -711,10 +764,10 @@ def _read_delta_table( Returns: Arrow table with preserved schema """ + filter_expr = None # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading dataset = delta_table.to_pyarrow_dataset(as_large_types=True) - if filters: - filter_expr = None + if filters and expression is None: for filt in filters: if len(filt) == 3: col, op, val = filt @@ -733,9 +786,11 @@ def _read_delta_table( filter_expr = expr else: filter_expr = pc.and_(filter_expr, expr) # type: ignore + elif expression is not None: + filter_expr = expression - if filter_expr is not None: - return dataset.to_table(filter=filter_expr) + if filter_expr is not None: + return dataset.to_table(filter=filter_expr) return dataset.to_table() diff --git a/src/orcapod/types/arrow_utils.py b/src/orcapod/types/arrow_utils.py deleted file mode 100644 index 34a06a3..0000000 --- a/src/orcapod/types/arrow_utils.py +++ /dev/null @@ -1,123 +0,0 @@ -# from collections.abc import Mapping, Collection -# import pyarrow as pa -# from typing import Any - - -# def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: -# """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, -# no field names should collide.""" -# merged_fields = [] -# for schema in schemas: -# merged_fields.extend(schema) -# return pa.schema(merged_fields) - - -# def split_by_column_groups( -# self, *column_groups: Collection[str] -# ) -> tuple[pa.Table | None]: -# """ -# Split the table into multiple tables based on the provided column groups. -# Each group is a collection of column names that should be included in the same table. -# The remaining columns that are not part of any group will be returned as the first table/None. -# """ -# if not column_groups: -# return (self,) - -# tables = [] -# remaining_columns = set(self.column_names) - -# for group in column_groups: -# group_columns = [col for col in group if col in remaining_columns] -# if group_columns: -# tables.append(self.select(group_columns)) -# remaining_columns.difference_update(group_columns) -# else: -# tables.append(None) - -# remaining_table = None -# if remaining_columns: -# orderd_remaining_columns = self.column_names -# remaining_columns = [ -# col for col in orderd_remaining_columns if col in remaining_columns -# ] -# remaining_table = self.select(orderd_remaining_columns) -# return (remaining_table, *tables) - - -# def prepare_prefixed_columns( -# table: pa.Table, -# prefix_group: Collection[str] | Mapping[str, Any | None], -# ) -> tuple[pa.Table, pa.Table]: -# """ """ -# if isinstance(prefix_group, Mapping): -# prefix_group = {k: v if v is not None else {} for k, v in prefix_group.items()} -# elif isinstance(prefix_group, Collection): -# prefix_group = {name: {} for name in prefix_group} -# else: -# raise TypeError( -# "prefix_group must be a Collection of strings or a Mapping of string to string or None." 
-# ) - -# # Visit each prefix group and split them into separate tables -# member_columns = {} - -# for col_name in table.column_names: -# for prefix in prefix_group: -# if col_name.startswith(prefix): -# # Remove the prefix from the column name -# base_name = col_name.removeprefix(prefix) -# if base_name not in member_columns: -# member_columns[base_name] = [] -# member_columns[base_name].append(table.column(col_name)) - -# data_columns = [] -# data_column_names = [] -# existing_source_info = {} - -# for i, name in enumerate(table.column_names): -# if name.startswith(SOURCE_INFO_PREFIX): -# # Extract the base column name -# base_name = name.removeprefix(SOURCE_INFO_PREFIX) -# existing_source_info[base_name] = table.column(i) -# else: -# data_columns.append(table.column(i)) -# data_column_names.append(name) - -# # Step 2: Create source_info columns for each regular column -# source_info_columns = [] -# source_info_column_names = [] - -# # Create source_info columns for each regular column -# num_rows = table.num_rows - -# for col_name in data_column_names: -# source_info_col_name = f"{SOURCE_INFO_PREFIX}{col_name}" - -# # if col_name is in source_info, use that value -# if col_name in source_info: -# # Use value from source_info dictionary -# source_value = source_info[col_name] -# source_values = pa.array([source_value] * num_rows, type=pa.large_string()) -# # if col_name is in existing_source_info, use that column -# elif col_name in existing_source_info: -# # Use existing source_info column, but convert to large_string -# existing_col = existing_source_info[col_name] -# if existing_col.type == pa.large_string(): -# source_values = existing_col -# else: -# # Convert to large_string -# source_values = pa.compute.cast(existing_col, pa.large_string()) # type: ignore - -# else: -# # Use null values -# source_values = pa.array([None] * num_rows, type=pa.large_string()) - -# source_info_columns.append(source_values) -# source_info_column_names.append(source_info_col_name) - -# # Step 3: Create the final table -# data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) -# source_info_table: pa.Table = pa.Table.from_arrays( -# source_info_columns, names=source_info_column_names -# ) -# return data_table, source_info_table From 3248a94373d8336ae04aadfc591084ec4d1aadb8 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 31 Jul 2025 04:50:22 +0000 Subject: [PATCH 153/224] fix: failure to capture source info column in table --- src/orcapod/data/datagrams/arrow_tag_packet.py | 12 +++++++++++- src/orcapod/data/streams.py | 10 +++++++--- src/orcapod/pipeline/nodes.py | 1 + src/orcapod/utils/arrow_utils.py | 4 ++-- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index ff11767..ec1ec0f 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -80,7 +80,7 @@ class ArrowPacket(ArrowDatagram): def __init__( self, - table: pa.Table, + table: pa.Table | pa.RecordBatch, meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, semantic_converter: SemanticConverter | None = None, @@ -93,6 +93,16 @@ def __init__( ) if source_info is None: source_info = {} + else: + # normalize by removing any existing prefixes + source_info = { + ( + k.removeprefix(constants.SOURCE_PREFIX) + if k.startswith(constants.SOURCE_PREFIX) + else k + ): v + for k, v in source_info.items() + } # normalize the table to ensure it has the expected source_info columns # TODO: use simpler function to ensure source_info columns diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 40e9878..14cb191 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -4,7 +4,7 @@ from datetime import datetime, timezone from itertools import repeat from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.context import DataContext @@ -398,12 +398,16 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: ) else: - tag = tag_batch + tag = cast(DictTag, tag_batch) + self._cached_elements.append( ( tag, ArrowPacket( packet_batch.slice(i, 1), + source_info=self._source_info_table.slice( + i, 1 + ).to_pylist()[0], semantic_converter=self._packet_converter, data_context=self._data_context, ), @@ -736,7 +740,7 @@ def process_inputs(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: missing = ( all_results.filter(pc.is_null(pc.field("_exists"))) .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH]) + .drop_columns([constants.INPUT_PACKET_HASH, "_exists"]) ) existing = all_results.filter(pc.is_valid(pc.field("_exists"))).drop( diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index e7a8727..35c6875 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any from orcapod.data.system_constants import orcapod_constants as constants from orcapod.utils import arrow_utils +from collections.abc import Collection if TYPE_CHECKING: import pyarrow as pa diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 700fa3e..1e7865a 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -167,7 +167,7 @@ def split_by_column_groups( def prepare_prefixed_columns( - table: pa.Table, + table: pa.Table | pa.RecordBatch, prefix_info: Collection[str] | Mapping[str, Any | None] | Mapping[str, Mapping[str, Any | None]], @@ -239,7 +239,7 @@ def prepare_prefixed_columns( column_values = pa.array([value] * num_rows, type=pa.large_string()) # if col_name is in existing_source_info, use that column elif col_name 
in existing_columns: - # Use existing source_info column, but convert to large_string + # Use existing prefixed column, but convert to large_string existing_col = table[prefixed_col_name] if existing_col.type == pa.string(): From 84d78abf868a7a12d4c9365ba88a1477a277227a Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 31 Jul 2025 05:08:30 +0000 Subject: [PATCH 154/224] refactor: split operators into separate modules --- src/orcapod/data/operators.py | 635 ------------------------- src/orcapod/data/operators/__init__.py | 11 + src/orcapod/data/operators/base.py | 259 ++++++++++ src/orcapod/data/operators/join.py | 108 +++++ src/orcapod/data/operators/mappers.py | 176 +++++++ src/orcapod/data/operators/semijoin.py | 133 ++++++ 6 files changed, 687 insertions(+), 635 deletions(-) delete mode 100644 src/orcapod/data/operators.py create mode 100644 src/orcapod/data/operators/__init__.py create mode 100644 src/orcapod/data/operators/base.py create mode 100644 src/orcapod/data/operators/join.py create mode 100644 src/orcapod/data/operators/mappers.py create mode 100644 src/orcapod/data/operators/semijoin.py diff --git a/src/orcapod/data/operators.py b/src/orcapod/data/operators.py deleted file mode 100644 index d18de9a..0000000 --- a/src/orcapod/data/operators.py +++ /dev/null @@ -1,635 +0,0 @@ -from orcapod.data.kernels import TrackedKernelBase -from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import ImmutableTableStream -from orcapod.types import TypeSpec -from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs -from abc import abstractmethod -from typing import Any, TYPE_CHECKING -from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection, Mapping -from orcapod.errors import InputValidationError -from orcapod.data.system_constants import orcapod_constants as constants - -if TYPE_CHECKING: - import pyarrow as pa -else: - pa = LazyModule("pyarrow") - - -class Operator(TrackedKernelBase): - """ - Base class for all operators. - Operators are a special type of kernel that can be used to perform operations on streams. - - They are defined as a callable that takes a (possibly empty) collection of streams as the input - and returns a new stream as output (note that output stream is always singular). - """ - - -class NonZeroInputOperator(Operator): - """ - Operators that work with at least one input stream. - This is useful for operators that can take a variable number of (but at least one ) input streams, - such as joins, unions, etc. - """ - - def verify_non_zero_input( - self, - streams: Collection[dp.Stream], - ) -> None: - """ - Check that the inputs to the variable inputs operator are valid. - This method is called before the forward method to ensure that the inputs are valid. - """ - if len(streams) == 0: - raise ValueError( - f"Operator {self.__class__.__name__} requires at least one input stream." - ) - - def validate_inputs(self, *streams: dp.Stream) -> None: - self.verify_non_zero_input(streams) - return self.op_validate_inputs(*streams) - - def forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Forward method for variable inputs operators. - It expects at least one stream as input. 
- """ - return self.op_forward(*streams) - - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - return self.op_output_types(*streams) - - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - """ - Return a structure that represents the identity of this operator. - This is used to ensure that the operator can be uniquely identified in the computational graph. - """ - return self.op_identity_structure(streams) - - @abstractmethod - def op_validate_inputs(self, *streams: dp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - ... - - @abstractmethod - def op_forward(self, *streams: dp.Stream) -> dp.Stream: - """ - This method should be implemented by subclasses to define the specific behavior of the non-zero input operator. - It takes variable number of streams as input and returns a new stream as output. - """ - ... - - @abstractmethod - def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - """ - This method should be implemented by subclasses to return the typespecs of the input and output streams. - It takes at least one stream as input and returns a tuple of typespecs. - """ - ... - - @abstractmethod - def op_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - """ - This method should be implemented by subclasses to return a structure that represents the identity of the operator. - It takes zero or more streams as input and returns a tuple containing the operator name and a set of streams. - If zero, it should return identity of the operator itself. - If one or more, it should return a identity structure approrpiate for the operator invoked on the given streams. - """ - ... - - -class BinaryOperator(Operator): - """ - Base class for all operators. - """ - - def check_binary_inputs( - self, - streams: Collection[dp.Stream], - ) -> None: - """ - Check that the inputs to the binary operator are valid. - This method is called before the forward method to ensure that the inputs are valid. - """ - if len(streams) != 2: - raise ValueError("BinaryOperator requires exactly two input streams.") - - def validate_inputs(self, *streams: dp.Stream) -> None: - self.check_binary_inputs(streams) - left_stream, right_stream = streams - return self.op_validate_inputs(left_stream, right_stream) - - def forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Forward method for binary operators. - It expects exactly two streams as input. - """ - left_stream, right_stream = streams - return self.op_forward(left_stream, right_stream) - - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - left_stream, right_stream = streams - return self.op_output_types(left_stream, right_stream) - - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - """ - Return a structure that represents the identity of this operator. - This is used to ensure that the operator can be uniquely identified in the computational graph. 
- """ - if streams is not None: - left_stream, right_stream = streams - self.op_identity_structure(left_stream, right_stream) - return self.op_identity_structure() - - @abstractmethod - def op_validate_inputs( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - ... - - @abstractmethod - def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: - """ - This method should be implemented by subclasses to define the specific behavior of the binary operator. - It takes two streams as input and returns a new stream as output. - """ - ... - - @abstractmethod - def op_output_types( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> tuple[TypeSpec, TypeSpec]: - """ - This method should be implemented by subclasses to return the typespecs of the input and output streams. - It takes two streams as input and returns a tuple of typespecs. - """ - ... - - @abstractmethod - def op_identity_structure( - self, - left_stream: dp.Stream | None = None, - right_stream: dp.Stream | None = None, - ) -> Any: - """ - This method should be implemented by subclasses to return a structure that represents the identity of the operator. - It takes two streams as input and returns a tuple containing the operator name and a set of streams. - """ - ... - - -class UnaryOperator(Operator): - """ - Base class for all operators. - """ - - def check_unary_input( - self, - streams: Collection[dp.Stream], - ) -> None: - """ - Check that the inputs to the unary operator are valid. - """ - if len(streams) != 1: - raise ValueError("UnaryOperator requires exactly one input stream.") - - def validate_inputs(self, *streams: dp.Stream) -> None: - self.check_unary_input(streams) - stream = streams[0] - return self.op_validate_inputs(stream) - - def forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Forward method for unary operators. - It expects exactly one stream as input. - """ - stream = streams[0] - return self.op_forward(stream) - - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - stream = streams[0] - return self.op_output_types(stream) - - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - """ - Return a structure that represents the identity of this operator. - This is used to ensure that the operator can be uniquely identified in the computational graph. - """ - if streams is not None: - stream = list(streams)[0] - self.op_identity_structure(stream) - return self.op_identity_structure() - - @abstractmethod - def op_validate_inputs(self, stream: dp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - ... - - @abstractmethod - def op_forward(self, stream: dp.Stream) -> dp.Stream: - """ - This method should be implemented by subclasses to define the specific behavior of the binary operator. - It takes two streams as input and returns a new stream as output. - """ - ... - - @abstractmethod - def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - """ - This method should be implemented by subclasses to return the typespecs of the input and output streams. - It takes two streams as input and returns a tuple of typespecs. - """ - ... 
- - @abstractmethod - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: - """ - This method should be implemented by subclasses to return a structure that represents the identity of the operator. - It takes two streams as input and returns a tuple containing the operator name and a set of streams. - """ - ... - - -class Join(NonZeroInputOperator): - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Returns a unique identifier for the kernel. - This is used to identify the kernel in the computational graph. - """ - return (f"{self.__class__.__name__}",) - - def op_validate_inputs(self, *streams: dp.Stream) -> None: - try: - self.op_output_types(*streams) - except Exception as e: - # raise InputValidationError(f"Input streams are not compatible: {e}") from e - raise e - - def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - if len(streams) == 1: - # If only one stream is provided, return its typespecs - return streams[0].types() - - stream = streams[0] - tag_typespec, packet_typespec = stream.types() - for other_stream in streams[1:]: - other_tag_typespec, other_packet_typespec = other_stream.types() - tag_typespec = union_typespecs(tag_typespec, other_tag_typespec) - packet_typespec = intersection_typespecs( - packet_typespec, other_packet_typespec - ) - if packet_typespec: - raise InputValidationError( - f"Packets should not have overlapping keys, but {packet_typespec.keys()} found in {stream} and {other_stream}." - ) - - return tag_typespec, packet_typespec - - def op_forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Joins two streams together based on their tags. - The resulting stream will contain all the tags from both streams. - """ - if len(streams) == 1: - return streams[0] - - COMMON_JOIN_KEY = "_common" - - stream = streams[0] - - tag_keys, _ = [set(k) for k in stream.keys()] - table = stream.as_table(include_source=True) - # trick to get cartesian product - table = table.add_column(0, COMMON_JOIN_KEY, pa.array([0] * len(table))) - - for next_stream in streams[1:]: - next_tag_keys, _ = next_stream.keys() - next_table = next_stream.as_table(include_source=True) - next_table = next_table.add_column( - 0, COMMON_JOIN_KEY, pa.array([0] * len(next_table)) - ) - common_tag_keys = tag_keys.intersection(next_tag_keys) - common_tag_keys.add(COMMON_JOIN_KEY) - - table = table.join( - next_table, keys=list(common_tag_keys), join_type="inner" - ) - tag_keys.update(next_tag_keys) - - # reorder columns to bring tag columns to the front - # TODO: come up with a better algorithm - table = table.drop(COMMON_JOIN_KEY) - reordered_columns = [col for col in table.column_names if col in tag_keys] - reordered_columns += [col for col in table.column_names if col not in tag_keys] - - return ImmutableTableStream( - table.select(reordered_columns), - tag_columns=tuple(tag_keys), - source=self, - upstreams=streams, - ) - - def op_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - return ( - (self.__class__.__name__,) + (set(streams),) if streams is not None else () - ) - - def __repr__(self) -> str: - return "Join()" - - -class SemiJoin(BinaryOperator): - """ - Binary operator that performs a semi-join between two streams. - - A semi-join returns only the entries from the left stream that have - matching entries in the right stream, based on equality of values - in overlapping columns (columns with the same name and compatible types). 
- - If there are no overlapping columns between the streams, the entire - left stream is returned unchanged. - - The output stream preserves the schema of the left stream exactly. - """ - - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Returns a unique identifier for the kernel. - This is used to identify the kernel in the computational graph. - """ - return (f"{self.__class__.__name__}",) - - def op_identity_structure( - self, - left_stream: dp.Stream | None = None, - right_stream: dp.Stream | None = None, - ) -> Any: - """ - Return a structure that represents the identity of this operator. - Unlike Join, SemiJoin depends on the order of streams (left vs right). - """ - id_struct = (self.__class__.__name__,) - if left_stream is not None and right_stream is not None: - # Order matters for semi-join: (left_stream, right_stream) - id_struct += (left_stream, right_stream) - return id_struct - - def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: - """ - Performs a semi-join between left and right streams. - Returns entries from left stream that have matching entries in right stream. - """ - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() - - # Find overlapping columns across all columns (tags + packets) - left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) - right_all_typespec = union_typespecs(right_tag_typespec, right_packet_typespec) - - common_keys = tuple( - intersection_typespecs(left_all_typespec, right_all_typespec).keys() - ) - - # If no overlapping columns, return the left stream unmodified - if not common_keys: - return left_stream - - # include source info for left stream - left_table = left_stream.as_table(include_source=True) - - # Get the right table for matching - right_table = right_stream.as_table() - - # Perform left semi join using PyArrow's built-in functionality - semi_joined_table = left_table.join( - right_table, - keys=list(common_keys), - join_type="left semi", - ) - - return ImmutableTableStream( - semi_joined_table, - tag_columns=tuple(left_tag_typespec.keys()), - source=self, - upstreams=(left_stream, right_stream), - ) - - def op_output_types( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> tuple[TypeSpec, TypeSpec]: - """ - Returns the output types for the semi-join operation. - The output preserves the exact schema of the left stream. - """ - # Semi-join preserves the left stream's schema exactly - return left_stream.types() - - def op_validate_inputs( - self, left_stream: dp.Stream, right_stream: dp.Stream - ) -> None: - """ - Validates that the input streams are compatible for semi-join. - Checks that overlapping columns have compatible types. 
- """ - try: - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() - - # Check that overlapping columns have compatible types across all columns - left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) - right_all_typespec = union_typespecs( - right_tag_typespec, right_packet_typespec - ) - - # intersection_typespecs will raise an error if types are incompatible - intersection_typespecs(left_all_typespec, right_all_typespec) - - except Exception as e: - raise InputValidationError( - f"Input streams are not compatible for semi-join: {e}" - ) from e - - def __repr__(self) -> str: - return "SemiJoin()" - - -class MapPackets(UnaryOperator): - """ - Operator that maps packets in a stream using a user-defined function. - The function is applied to each packet in the stream, and the resulting packets - are returned as a new stream. - """ - - def __init__( - self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs - ): - self.name_map = dict(name_map) - self.drop_unmapped = drop_unmapped - super().__init__(**kwargs) - - def op_forward(self, stream: dp.Stream) -> dp.Stream: - tag_columns, packet_columns = stream.keys() - - if not any(n in packet_columns for n in self.name_map): - # nothing to rename in the packet, return stream as is - return stream - - table = stream.as_table(include_source=True) - - name_map = {tc: tc for tc in tag_columns} # no renaming on tag columns - for c in packet_columns: - if c in self.name_map: - name_map[c] = self.name_map[c] - name_map[f"{constants.SOURCE_PREFIX}{c}"] = ( - f"{constants.SOURCE_PREFIX}{self.name_map[c]}" - ) - else: - name_map[c] = c - - renamed_table = table.rename_columns(name_map) - return ImmutableTableStream( - renamed_table, tag_columns=tag_columns, source=self, upstreams=(stream,) - ) - - def op_validate_inputs(self, stream: dp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - # verify that renamed value does NOT collide with other columns - tag_columns, packet_columns = stream.keys() - relevant_source = [] - relevant_target = [] - for source, target in self.name_map.items(): - if source in packet_columns: - relevant_source.append(source) - relevant_target.append(target) - remaining_packet_columns = set(packet_columns) - set(relevant_source) - overlapping_packet_columns = remaining_packet_columns.intersection( - relevant_target - ) - overlapping_tag_columns = set(tag_columns).intersection(relevant_target) - - if overlapping_packet_columns or overlapping_tag_columns: - message = f"Renaming {self.name_map} would cause collisions with existing columns: " - if overlapping_packet_columns: - message += f"overlapping packet columns: {overlapping_packet_columns}, " - if overlapping_tag_columns: - message += f"overlapping tag columns: {overlapping_tag_columns}." 
- raise InputValidationError(message) - - def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - tag_typespec, packet_typespec = stream.types() - - # Create new packet typespec with renamed keys - new_packet_typespec = { - self.name_map.get(k, k): v for k, v in packet_typespec.items() - } - - return tag_typespec, new_packet_typespec - - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: - return ( - self.__class__.__name__, - self.name_map, - self.drop_unmapped, - ) + ((stream,) if stream is not None else ()) - - -class MapTags(UnaryOperator): - """ - Operator that maps tags in a stream using a user-defined function. - The function is applied to each tag in the stream, and the resulting tags - are returned as a new stream. - """ - - def __init__( - self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs - ): - self.name_map = dict(name_map) - self.drop_unmapped = drop_unmapped - super().__init__(**kwargs) - - def op_forward(self, stream: dp.Stream) -> dp.Stream: - tag_columns, packet_columns = stream.keys() - - if not any(n in tag_columns for n in self.name_map): - # nothing to rename in the tags, return stream as is - return stream - - table = stream.as_table(include_source=True) - - name_map = { - tc: self.name_map.get(tc, tc) for tc in tag_columns - } # rename the tag as necessary - new_tag_columns = [name_map[tc] for tc in tag_columns] - for c in packet_columns: - name_map[c] = c # no renaming on packet columns - - renamed_table = table.rename_columns(name_map) - return ImmutableTableStream( - renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) - ) - - def op_validate_inputs(self, stream: dp.Stream) -> None: - """ - This method should be implemented by subclasses to validate the inputs to the operator. - It takes two streams as input and raises an error if the inputs are not valid. - """ - # verify that renamed value does NOT collide with other columns - tag_columns, packet_columns = stream.keys() - relevant_source = [] - relevant_target = [] - for source, target in self.name_map.items(): - if source in tag_columns: - relevant_source.append(source) - relevant_target.append(target) - remaining_tag_columns = set(tag_columns) - set(relevant_source) - overlapping_tag_columns = remaining_tag_columns.intersection(relevant_target) - overlapping_packet_columns = set(packet_columns).intersection(relevant_target) - - if overlapping_tag_columns or overlapping_packet_columns: - message = f"Renaming {self.name_map} would cause collisions with existing columns: " - if overlapping_tag_columns: - message += f"overlapping tag columns: {overlapping_tag_columns}." - if overlapping_packet_columns: - message += f"overlapping packet columns: {overlapping_packet_columns}." 
- raise InputValidationError(message) - - def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - tag_typespec, packet_typespec = stream.types() - - # Create new packet typespec with renamed keys - new_tag_typespec = {self.name_map.get(k, k): v for k, v in tag_typespec.items()} - - return new_tag_typespec, packet_typespec - - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: - return ( - self.__class__.__name__, - self.name_map, - self.drop_unmapped, - ) + ((stream,) if stream is not None else ()) diff --git a/src/orcapod/data/operators/__init__.py b/src/orcapod/data/operators/__init__.py new file mode 100644 index 0000000..7ba693b --- /dev/null +++ b/src/orcapod/data/operators/__init__.py @@ -0,0 +1,11 @@ +from .join import Join +from .semijoin import SemiJoin +from .mappers import MapTags, MapPackets + + +__all__ = [ + "Join", + "SemiJoin", + "MapTags", + "MapPackets", +] diff --git a/src/orcapod/data/operators/base.py b/src/orcapod/data/operators/base.py new file mode 100644 index 0000000..638cc60 --- /dev/null +++ b/src/orcapod/data/operators/base.py @@ -0,0 +1,259 @@ +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp +from orcapod.types import TypeSpec +from abc import abstractmethod +from typing import Any +from collections.abc import Collection + + +class Operator(TrackedKernelBase): + """ + Base class for all operators. + Operators are a special type of kernel that can be used to perform operations on streams. + + They are defined as a callable that takes a (possibly empty) collection of streams as the input + and returns a new stream as output (note that output stream is always singular). + """ + + +class UnaryOperator(Operator): + """ + Base class for all operators. + """ + + def check_unary_input( + self, + streams: Collection[dp.Stream], + ) -> None: + """ + Check that the inputs to the unary operator are valid. + """ + if len(streams) != 1: + raise ValueError("UnaryOperator requires exactly one input stream.") + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.check_unary_input(streams) + stream = streams[0] + return self.op_validate_inputs(stream) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for unary operators. + It expects exactly one stream as input. + """ + stream = streams[0] + return self.op_forward(stream) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + stream = streams[0] + return self.op_output_types(stream) + + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. + """ + if streams is not None: + stream = list(streams)[0] + self.op_identity_structure(stream) + return self.op_identity_structure() + + @abstractmethod + def op_validate_inputs(self, stream: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... + + @abstractmethod + def op_forward(self, stream: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the binary operator. + It takes two streams as input and returns a new stream as output. + """ + ... 
+ + @abstractmethod + def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes two streams as input and returns a tuple of typespecs. + """ + ... + + @abstractmethod + def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes two streams as input and returns a tuple containing the operator name and a set of streams. + """ + ... + + +class BinaryOperator(Operator): + """ + Base class for all operators. + """ + + def check_binary_inputs( + self, + streams: Collection[dp.Stream], + ) -> None: + """ + Check that the inputs to the binary operator are valid. + This method is called before the forward method to ensure that the inputs are valid. + """ + if len(streams) != 2: + raise ValueError("BinaryOperator requires exactly two input streams.") + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.check_binary_inputs(streams) + left_stream, right_stream = streams + return self.op_validate_inputs(left_stream, right_stream) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for binary operators. + It expects exactly two streams as input. + """ + left_stream, right_stream = streams + return self.op_forward(left_stream, right_stream) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + left_stream, right_stream = streams + return self.op_output_types(left_stream, right_stream) + + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. + """ + if streams is not None: + left_stream, right_stream = streams + self.op_identity_structure(left_stream, right_stream) + return self.op_identity_structure() + + @abstractmethod + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + ... + + @abstractmethod + def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the binary operator. + It takes two streams as input and returns a new stream as output. + """ + ... + + @abstractmethod + def op_output_types( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes two streams as input and returns a tuple of typespecs. + """ + ... + + @abstractmethod + def op_identity_structure( + self, + left_stream: dp.Stream | None = None, + right_stream: dp.Stream | None = None, + ) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes two streams as input and returns a tuple containing the operator name and a set of streams. + """ + ... + + +class NonZeroInputOperator(Operator): + """ + Operators that work with at least one input stream. 
+ This is useful for operators that can take a variable number of (but at least one) input streams, + such as joins, unions, etc. + """ + + def verify_non_zero_input( + self, + streams: Collection[dp.Stream], + ) -> None: + """ + Check that the inputs to the variable inputs operator are valid. + This method is called before the forward method to ensure that the inputs are valid. + """ + if len(streams) == 0: + raise ValueError( + f"Operator {self.__class__.__name__} requires at least one input stream." + ) + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.verify_non_zero_input(streams) + return self.op_validate_inputs(*streams) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Forward method for variable inputs operators. + It expects at least one stream as input. + """ + return self.op_forward(*streams) + + def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + return self.op_output_types(*streams) + + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + """ + Return a structure that represents the identity of this operator. + This is used to ensure that the operator can be uniquely identified in the computational graph. + """ + return self.op_identity_structure(streams) + + @abstractmethod + def op_validate_inputs(self, *streams: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes one or more streams as input and raises an error if the inputs are not valid. + """ + ... + + @abstractmethod + def op_forward(self, *streams: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the non-zero input operator. + It takes a variable number of streams as input and returns a new stream as output. + """ + ... + + @abstractmethod + def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes at least one stream as input and returns a tuple of typespecs. + """ + ... + + @abstractmethod + def op_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + """ + This method should be implemented by subclasses to return a structure that represents the identity of the operator. + It takes zero or more streams as input and returns a tuple containing the operator name and a set of streams. + If zero, it should return the identity of the operator itself. + If one or more, it should return an identity structure appropriate for the operator invoked on the given streams. + """ + ...
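For orientation, the following is a minimal sketch (not part of the patch itself) of how the UnaryOperator contract defined in base.py above is meant to be filled in. The class name Passthrough is hypothetical and used only for illustration; the method signatures and the shape of op_identity_structure follow the patterns used by MapPackets and MapTags later in this patch, and constructing a concrete operator may require additional keyword arguments accepted by TrackedKernelBase that are not shown in this diff.

    from typing import Any

    from orcapod.data.operators.base import UnaryOperator
    from orcapod.protocols import data_protocols as dp
    from orcapod.types import TypeSpec


    class Passthrough(UnaryOperator):
        """Hypothetical unary operator that returns its input stream unchanged."""

        def op_validate_inputs(self, stream: dp.Stream) -> None:
            # A passthrough accepts any single input stream, so there is nothing to check.
            return None

        def op_forward(self, stream: dp.Stream) -> dp.Stream:
            # No transformation is applied; the input stream is handed back as is.
            return stream

        def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]:
            # The output tag/packet typespecs are identical to the input's.
            return stream.types()

        def op_identity_structure(self, stream: dp.Stream | None = None) -> Any:
            # Class name plus the invoked-on stream, mirroring MapPackets/MapTags.
            return (self.__class__.__name__,) + ((stream,) if stream is not None else ())

The optional stream argument to op_identity_structure reflects the two roles described in the docstrings: with no stream it identifies the operator itself, and with a stream it identifies a specific invocation in the computational graph.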
diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py new file mode 100644 index 0000000..a8ab492 --- /dev/null +++ b/src/orcapod/data/operators/join.py @@ -0,0 +1,108 @@ +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp +from orcapod.data.streams import ImmutableTableStream +from orcapod.types import TypeSpec +from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs +from abc import abstractmethod +from typing import Any, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from collections.abc import Collection, Mapping +from orcapod.errors import InputValidationError +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.operators.base import NonZeroInputOperator + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class Join(NonZeroInputOperator): + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Returns a unique identifier for the kernel. + This is used to identify the kernel in the computational graph. + """ + return (f"{self.__class__.__name__}",) + + def op_validate_inputs(self, *streams: dp.Stream) -> None: + try: + self.op_output_types(*streams) + except Exception as e: + # raise InputValidationError(f"Input streams are not compatible: {e}") from e + raise e + + def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + if len(streams) == 1: + # If only one stream is provided, return its typespecs + return streams[0].types() + + stream = streams[0] + tag_typespec, packet_typespec = stream.types() + for other_stream in streams[1:]: + other_tag_typespec, other_packet_typespec = other_stream.types() + tag_typespec = union_typespecs(tag_typespec, other_tag_typespec) + packet_typespec = intersection_typespecs( + packet_typespec, other_packet_typespec + ) + if packet_typespec: + raise InputValidationError( + f"Packets should not have overlapping keys, but {packet_typespec.keys()} found in {stream} and {other_stream}." + ) + + return tag_typespec, packet_typespec + + def op_forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Joins two streams together based on their tags. + The resulting stream will contain all the tags from both streams. 
+ """ + if len(streams) == 1: + return streams[0] + + COMMON_JOIN_KEY = "_common" + + stream = streams[0] + + tag_keys, _ = [set(k) for k in stream.keys()] + table = stream.as_table(include_source=True) + # trick to get cartesian product + table = table.add_column(0, COMMON_JOIN_KEY, pa.array([0] * len(table))) + + for next_stream in streams[1:]: + next_tag_keys, _ = next_stream.keys() + next_table = next_stream.as_table(include_source=True) + next_table = next_table.add_column( + 0, COMMON_JOIN_KEY, pa.array([0] * len(next_table)) + ) + common_tag_keys = tag_keys.intersection(next_tag_keys) + common_tag_keys.add(COMMON_JOIN_KEY) + + table = table.join( + next_table, keys=list(common_tag_keys), join_type="inner" + ) + tag_keys.update(next_tag_keys) + + # reorder columns to bring tag columns to the front + # TODO: come up with a better algorithm + table = table.drop(COMMON_JOIN_KEY) + reordered_columns = [col for col in table.column_names if col in tag_keys] + reordered_columns += [col for col in table.column_names if col not in tag_keys] + + return ImmutableTableStream( + table.select(reordered_columns), + tag_columns=tuple(tag_keys), + source=self, + upstreams=streams, + ) + + def op_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + return ( + (self.__class__.__name__,) + (set(streams),) if streams is not None else () + ) + + def __repr__(self) -> str: + return "Join()" diff --git a/src/orcapod/data/operators/mappers.py b/src/orcapod/data/operators/mappers.py new file mode 100644 index 0000000..4aa35d6 --- /dev/null +++ b/src/orcapod/data/operators/mappers.py @@ -0,0 +1,176 @@ +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp +from orcapod.data.streams import ImmutableTableStream +from orcapod.types import TypeSpec +from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs +from abc import abstractmethod +from typing import Any, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from collections.abc import Collection, Mapping +from orcapod.errors import InputValidationError +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.operators.base import UnaryOperator + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class MapPackets(UnaryOperator): + """ + Operator that maps packets in a stream using a user-defined function. + The function is applied to each packet in the stream, and the resulting packets + are returned as a new stream. 
+ """ + + def __init__( + self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs + ): + self.name_map = dict(name_map) + self.drop_unmapped = drop_unmapped + super().__init__(**kwargs) + + def op_forward(self, stream: dp.Stream) -> dp.Stream: + tag_columns, packet_columns = stream.keys() + + if not any(n in packet_columns for n in self.name_map): + # nothing to rename in the packet, return stream as is + return stream + + table = stream.as_table(include_source=True) + + name_map = {tc: tc for tc in tag_columns} # no renaming on tag columns + for c in packet_columns: + if c in self.name_map: + name_map[c] = self.name_map[c] + name_map[f"{constants.SOURCE_PREFIX}{c}"] = ( + f"{constants.SOURCE_PREFIX}{self.name_map[c]}" + ) + else: + name_map[c] = c + + renamed_table = table.rename_columns(name_map) + return ImmutableTableStream( + renamed_table, tag_columns=tag_columns, source=self, upstreams=(stream,) + ) + + def op_validate_inputs(self, stream: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # verify that renamed value does NOT collide with other columns + tag_columns, packet_columns = stream.keys() + relevant_source = [] + relevant_target = [] + for source, target in self.name_map.items(): + if source in packet_columns: + relevant_source.append(source) + relevant_target.append(target) + remaining_packet_columns = set(packet_columns) - set(relevant_source) + overlapping_packet_columns = remaining_packet_columns.intersection( + relevant_target + ) + overlapping_tag_columns = set(tag_columns).intersection(relevant_target) + + if overlapping_packet_columns or overlapping_tag_columns: + message = f"Renaming {self.name_map} would cause collisions with existing columns: " + if overlapping_packet_columns: + message += f"overlapping packet columns: {overlapping_packet_columns}, " + if overlapping_tag_columns: + message += f"overlapping tag columns: {overlapping_tag_columns}." + raise InputValidationError(message) + + def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, packet_typespec = stream.types() + + # Create new packet typespec with renamed keys + new_packet_typespec = { + self.name_map.get(k, k): v for k, v in packet_typespec.items() + } + + return tag_typespec, new_packet_typespec + + def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.name_map, + self.drop_unmapped, + ) + ((stream,) if stream is not None else ()) + + +class MapTags(UnaryOperator): + """ + Operator that maps tags in a stream using a user-defined function. + The function is applied to each tag in the stream, and the resulting tags + are returned as a new stream. 
+ """ + + def __init__( + self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs + ): + self.name_map = dict(name_map) + self.drop_unmapped = drop_unmapped + super().__init__(**kwargs) + + def op_forward(self, stream: dp.Stream) -> dp.Stream: + tag_columns, packet_columns = stream.keys() + + if not any(n in tag_columns for n in self.name_map): + # nothing to rename in the tags, return stream as is + return stream + + table = stream.as_table(include_source=True) + + name_map = { + tc: self.name_map.get(tc, tc) for tc in tag_columns + } # rename the tag as necessary + new_tag_columns = [name_map[tc] for tc in tag_columns] + for c in packet_columns: + name_map[c] = c # no renaming on packet columns + + renamed_table = table.rename_columns(name_map) + return ImmutableTableStream( + renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) + ) + + def op_validate_inputs(self, stream: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # verify that renamed value does NOT collide with other columns + tag_columns, packet_columns = stream.keys() + relevant_source = [] + relevant_target = [] + for source, target in self.name_map.items(): + if source in tag_columns: + relevant_source.append(source) + relevant_target.append(target) + remaining_tag_columns = set(tag_columns) - set(relevant_source) + overlapping_tag_columns = remaining_tag_columns.intersection(relevant_target) + overlapping_packet_columns = set(packet_columns).intersection(relevant_target) + + if overlapping_tag_columns or overlapping_packet_columns: + message = f"Renaming {self.name_map} would cause collisions with existing columns: " + if overlapping_tag_columns: + message += f"overlapping tag columns: {overlapping_tag_columns}." + if overlapping_packet_columns: + message += f"overlapping packet columns: {overlapping_packet_columns}." 
+ raise InputValidationError(message) + + def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, packet_typespec = stream.types() + + # Create new packet typespec with renamed keys + new_tag_typespec = {self.name_map.get(k, k): v for k, v in tag_typespec.items()} + + return new_tag_typespec, packet_typespec + + def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.name_map, + self.drop_unmapped, + ) + ((stream,) if stream is not None else ()) diff --git a/src/orcapod/data/operators/semijoin.py b/src/orcapod/data/operators/semijoin.py new file mode 100644 index 0000000..d3c2730 --- /dev/null +++ b/src/orcapod/data/operators/semijoin.py @@ -0,0 +1,133 @@ +from orcapod.data.kernels import TrackedKernelBase +from orcapod.protocols import data_protocols as dp +from orcapod.data.streams import ImmutableTableStream +from orcapod.types import TypeSpec +from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs +from abc import abstractmethod +from typing import Any, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from collections.abc import Collection, Mapping +from orcapod.errors import InputValidationError +from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.operators.base import BinaryOperator + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class SemiJoin(BinaryOperator): + """ + Binary operator that performs a semi-join between two streams. + + A semi-join returns only the entries from the left stream that have + matching entries in the right stream, based on equality of values + in overlapping columns (columns with the same name and compatible types). + + If there are no overlapping columns between the streams, the entire + left stream is returned unchanged. + + The output stream preserves the schema of the left stream exactly. + """ + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Returns a unique identifier for the kernel. + This is used to identify the kernel in the computational graph. + """ + return (f"{self.__class__.__name__}",) + + def op_identity_structure( + self, + left_stream: dp.Stream | None = None, + right_stream: dp.Stream | None = None, + ) -> Any: + """ + Return a structure that represents the identity of this operator. + Unlike Join, SemiJoin depends on the order of streams (left vs right). + """ + id_struct = (self.__class__.__name__,) + if left_stream is not None and right_stream is not None: + # Order matters for semi-join: (left_stream, right_stream) + id_struct += (left_stream, right_stream) + return id_struct + + def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: + """ + Performs a semi-join between left and right streams. + Returns entries from left stream that have matching entries in right stream. 
+ """ + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + + # Find overlapping columns across all columns (tags + packets) + left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) + right_all_typespec = union_typespecs(right_tag_typespec, right_packet_typespec) + + common_keys = tuple( + intersection_typespecs(left_all_typespec, right_all_typespec).keys() + ) + + # If no overlapping columns, return the left stream unmodified + if not common_keys: + return left_stream + + # include source info for left stream + left_table = left_stream.as_table(include_source=True) + + # Get the right table for matching + right_table = right_stream.as_table() + + # Perform left semi join using PyArrow's built-in functionality + semi_joined_table = left_table.join( + right_table, + keys=list(common_keys), + join_type="left semi", + ) + + return ImmutableTableStream( + semi_joined_table, + tag_columns=tuple(left_tag_typespec.keys()), + source=self, + upstreams=(left_stream, right_stream), + ) + + def op_output_types( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the output types for the semi-join operation. + The output preserves the exact schema of the left stream. + """ + # Semi-join preserves the left stream's schema exactly + return left_stream.types() + + def op_validate_inputs( + self, left_stream: dp.Stream, right_stream: dp.Stream + ) -> None: + """ + Validates that the input streams are compatible for semi-join. + Checks that overlapping columns have compatible types. + """ + try: + left_tag_typespec, left_packet_typespec = left_stream.types() + right_tag_typespec, right_packet_typespec = right_stream.types() + + # Check that overlapping columns have compatible types across all columns + left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) + right_all_typespec = union_typespecs( + right_tag_typespec, right_packet_typespec + ) + + # intersection_typespecs will raise an error if types are incompatible + intersection_typespecs(left_all_typespec, right_all_typespec) + + except Exception as e: + raise InputValidationError( + f"Input streams are not compatible for semi-join: {e}" + ) from e + + def __repr__(self) -> str: + return "SemiJoin()" From e1792222084b448366c9033077e6ad18b6c9fefa Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 31 Jul 2025 05:10:15 +0000 Subject: [PATCH 155/224] refactor: clean up imports --- src/orcapod/data/operators/mappers.py | 5 +---- src/orcapod/data/operators/semijoin.py | 4 ---- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/orcapod/data/operators/mappers.py b/src/orcapod/data/operators/mappers.py index 4aa35d6..0c8603e 100644 --- a/src/orcapod/data/operators/mappers.py +++ b/src/orcapod/data/operators/mappers.py @@ -1,12 +1,9 @@ -from orcapod.data.kernels import TrackedKernelBase from orcapod.protocols import data_protocols as dp from orcapod.data.streams import ImmutableTableStream from orcapod.types import TypeSpec -from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs -from abc import abstractmethod from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection, Mapping +from collections.abc import Mapping from orcapod.errors import InputValidationError from orcapod.data.system_constants import orcapod_constants as constants from orcapod.data.operators.base import UnaryOperator diff --git a/src/orcapod/data/operators/semijoin.py b/src/orcapod/data/operators/semijoin.py index d3c2730..eedfff0 100644 --- a/src/orcapod/data/operators/semijoin.py +++ b/src/orcapod/data/operators/semijoin.py @@ -1,14 +1,10 @@ -from orcapod.data.kernels import TrackedKernelBase from orcapod.protocols import data_protocols as dp from orcapod.data.streams import ImmutableTableStream from orcapod.types import TypeSpec from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs -from abc import abstractmethod from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection, Mapping from orcapod.errors import InputValidationError -from orcapod.data.system_constants import orcapod_constants as constants from orcapod.data.operators.base import BinaryOperator if TYPE_CHECKING: From f0f621e1a4c201b85ff13450b85ef1f2a3dd1e0d Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Thu, 31 Jul 2025 05:12:06 +0000 Subject: [PATCH 156/224] refactor: remove old cache pod implementation --- src/orcapod/data/pods.py | 103 --------------------------------------- 1 file changed, 103 deletions(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index e777d93..65793cd 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -463,109 +463,6 @@ def __str__(self) -> str: return f"WrappedPod:{self.pod!s}" -class CachedPod2(WrappedPod): - """ - A pod that caches the results of the wrapped pod. - This is useful for pods that are expensive to compute and can benefit from caching. - """ - - # name of the column in the tag store that contains the packet hash - PACKET_HASH_COLUMN = f"{constants.META_PREFIX}packet_hash" - DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" - - def __init__( - self, - pod: dp.Pod, - result_store: ArrowDataStore, - record_path_prefix: tuple[str, ...] = (), - **kwargs, - ): - super().__init__(pod, **kwargs) - self.record_path_prefix = record_path_prefix - self.result_store = result_store - # unset data_context native to the object - - self._pod_hash = self.data_context.object_hasher.hash_to_hex( - self.pod, prefix_hasher_id=True - ) - - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Return the pod ID, which is the function name of the wrapped pod. - This is used to identify the pod in the system. 
- """ - return self.pod.kernel_id + (self._pod_hash,) - - @property - def record_path(self) -> tuple[str, ...]: - """ - Return the path to the record in the result store. - This is used to store the results of the pod. - """ - return self.record_path_prefix + self.kernel_id - - def call( - self, - tag: dp.Tag, - packet: dp.Packet, - skip_record_check: bool = False, - skip_recording: bool = False, - ) -> tuple[dp.Tag, dp.Packet | None]: - # TODO: consider logic for overwriting existing records - output_packet = None - if not skip_record_check: - output_packet = self.get_recorded_output_packet(packet) - if output_packet is None: - tag, output_packet = super().call(tag, packet) - if output_packet is not None and not skip_recording: - self.record_packet(packet, output_packet) - - return tag, output_packet - - def record_packet( - self, - input_packet: dp.Packet, - output_packet: dp.Packet, - skip_duplicates: bool = False, - ) -> dp.Packet: - """ - Record the output packet against the input packet in the result store. - """ - data_table = output_packet.as_table(include_context=True, include_source=True) - - result_flag = self.result_store.add_record( - self.record_path, - input_packet.content_hash(), - data_table, - skip_duplicates=skip_duplicates, - ) - # if result_flag is None: - # # TODO: do more specific error handling - # raise ValueError( - # f"Failed to record packet {input_packet} in result store {self.result_store}" - # ) - # # TODO: make store return retrieved table - return output_packet - - def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | None: - """ - Retrieve the output packet from the result store based on the input packet. - If the output packet is not found, return None. - """ - result_table = self.result_store.get_record_by_id( - self.record_path, input_packet.content_hash() - ) - if result_table is None: - return None - - # note that data context will be loaded from the result store - return ArrowPacket( - result_table, - meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, - ) - - class CachedPod(WrappedPod): """ A pod that caches the results of the wrapped pod. From ba23828946abb4bdbfdc3b5a7a0a1831705cb16c Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 31 Jul 2025 10:15:07 +0000 Subject: [PATCH 157/224] feat: add generated implementation for arrow to/from python conversion --- .../semantic_types/python_arrow_types.py | 604 ++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100644 src/orcapod/semantic_types/python_arrow_types.py diff --git a/src/orcapod/semantic_types/python_arrow_types.py b/src/orcapod/semantic_types/python_arrow_types.py new file mode 100644 index 0000000..4fe7325 --- /dev/null +++ b/src/orcapod/semantic_types/python_arrow_types.py @@ -0,0 +1,604 @@ +import pyarrow as pa +import typing +from typing import get_origin, get_args +import sys + +# Basic type mapping for Python -> Arrow conversion +_PYTHON_TO_ARROW_MAP = { + # Python built-ins + int: pa.int64(), + float: pa.float64(), + str: pa.large_string(), # Use large_string by default for Polars compatibility + bool: pa.bool_(), + bytes: pa.large_binary(), # Use large_binary by default for Polars compatibility + # String representations (for when we get type names as strings) + "int": pa.int64(), + "float": pa.float64(), + "str": pa.large_string(), + "bool": pa.bool_(), + "bytes": pa.large_binary(), + # Specific integer types + "int8": pa.int8(), + "int16": pa.int16(), + "int32": pa.int32(), + "int64": pa.int64(), + "uint8": pa.uint8(), + "uint16": pa.uint16(), + "uint32": pa.uint32(), + "uint64": pa.uint64(), + # Specific float types + "float32": pa.float32(), + "float64": pa.float64(), + # Date/time types + "date": pa.date32(), + "datetime": pa.timestamp("us"), + "timestamp": pa.timestamp("us"), +} + +# Reverse mapping for Arrow -> Python conversion (handles both regular and large variants) +_ARROW_TO_PYTHON_MAP = { + # Integer types + pa.int8(): int, + pa.int16(): int, + pa.int32(): int, + pa.int64(): int, + pa.uint8(): int, + pa.uint16(): int, + pa.uint32(): int, + pa.uint64(): int, + # Float types + pa.float32(): float, + pa.float64(): float, + # String types (both regular and large) + pa.string(): str, + pa.large_string(): str, + # Boolean + pa.bool_(): bool, + # Binary types (both regular and large) + pa.binary(): bytes, + pa.large_binary(): bytes, +} + +# Add numpy types if available +try: + import numpy as np + + _PYTHON_TO_ARROW_MAP.update( + { + np.int8: pa.int8(), + np.int16: pa.int16(), + np.int32: pa.int32(), + np.int64: pa.int64(), + np.uint8: pa.uint8(), + np.uint16: pa.uint16(), + np.uint32: pa.uint32(), + np.uint64: pa.uint64(), + np.float32: pa.float32(), + np.float64: pa.float64(), + np.bool_: pa.bool_(), + } + ) +except ImportError: + pass + + +def python_type_to_arrow(type_hint, semantic_registry=None) -> pa.DataType: + """ + Convert Python type hints to PyArrow data types. 
+ + Args: + type_hint: Python type hint to convert + semantic_registry: Optional semantic type registry to check for semantic types + + Examples: + list[int] -> pa.large_list(pa.int64()) + tuple[int, int] -> pa.list_(pa.int64(), 2) + tuple[int, str] -> pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) + dict[str, int] -> pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) + """ + + # Handle basic types first + if type_hint in _PYTHON_TO_ARROW_MAP: + return _PYTHON_TO_ARROW_MAP[type_hint] + + # Check if this is a registered semantic type + if semantic_registry and hasattr( + semantic_registry, "get_converter_for_python_type" + ): + converter = semantic_registry.get_converter_for_python_type(type_hint) + if converter: + return converter.arrow_struct_type + + # Get the origin (e.g., list, tuple, dict) and args (e.g., int, str) + origin = get_origin(type_hint) + args = get_args(type_hint) + + if origin is None: + # Handle non-generic types that might not be in basic map + if hasattr(type_hint, "__name__"): + type_name = type_hint.__name__ + if type_name in _PYTHON_TO_ARROW_MAP: + return _PYTHON_TO_ARROW_MAP[type_name] + raise ValueError(f"Unsupported type: {type_hint}") + + # Handle list types + if origin is list: + if len(args) != 1: + raise ValueError( + f"list type must have exactly one type argument, got: {args}" + ) + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.large_list(element_type) # Use large_list for Polars compatibility + + # Handle tuple types + elif origin is tuple: + if len(args) == 0: + raise ValueError("Empty tuple type not supported") + + # Check if all elements are the same type + if len(set(args)) == 1: + # Homogeneous tuple: tuple[int, int, int] -> fixed-size list + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.list_( + element_type, len(args) + ) # Fixed-size lists are always regular lists + else: + # Heterogeneous tuple: tuple[int, str] -> struct with indexed fields + fields = [] + for i, arg_type in enumerate(args): + field_type = python_type_to_arrow(arg_type, semantic_registry) + fields.append((f"f{i}", field_type)) + return pa.struct(fields) + + # Handle dict types + elif origin is dict: + if len(args) != 2: + raise ValueError( + f"dict type must have exactly two type arguments, got: {args}" + ) + key_type = python_type_to_arrow(args[0], semantic_registry) + value_type = python_type_to_arrow(args[1], semantic_registry) + + # Use large_list> representation for better compatibility + # This works reliably across Arrow, Polars, Parquet, etc. 
+ key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) + return pa.large_list(key_value_struct) + + # Handle Union types (including Optional) + elif origin is typing.Union: + # Handle Optional[T] which is Union[T, NoneType] + if len(args) == 2 and type(None) in args: + # This is Optional[T] + non_none_type = args[0] if args[1] is type(None) else args[1] + base_type = python_type_to_arrow(non_none_type, semantic_registry) + # PyArrow handles nullability at the field level, so we just return the base type + return base_type + else: + # Complex unions - convert to a union type + union_types = [python_type_to_arrow(arg, semantic_registry) for arg in args] + # PyArrow union types are complex - for now, just use the first type as fallback + # TODO: Implement proper union support when needed + return union_types[0] # Simplified - take first type + + else: + raise ValueError(f"Unsupported generic type: {origin}") + + +def arrow_type_to_python(arrow_type: pa.DataType) -> type: + """ + Convert PyArrow data types back to Python type hints. + + Args: + arrow_type: PyArrow data type to convert + + Returns: + Python type annotation + + Examples: + pa.int64() -> int + pa.large_list(pa.large_string()) -> list[str] + pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) -> dict[str, int] + pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) -> tuple[int, str] + + Raises: + TypeError: If the Arrow type cannot be converted to a Python type + """ + + # Handle basic types + if arrow_type in _ARROW_TO_PYTHON_MAP: + return _ARROW_TO_PYTHON_MAP[arrow_type] + + # Check by Arrow type categories + if pa.types.is_integer(arrow_type): + return int + elif pa.types.is_floating(arrow_type): + return float + elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): + return str + elif pa.types.is_boolean(arrow_type): + return bool + elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): + return bytes + + # Handle complex types + elif ( + pa.types.is_list(arrow_type) + or pa.types.is_large_list(arrow_type) + or pa.types.is_fixed_size_list(arrow_type) + ): + element_type = arrow_type.value_type + + # Check if this is a dict representation: list> + if pa.types.is_struct(element_type): + field_names = [field.name for field in element_type] + + # Dict pattern: must have exactly 'key' and 'value' fields + if set(field_names) == {"key", "value"}: + # Find key and value types + key_field = next(f for f in element_type if f.name == "key") + value_field = next(f for f in element_type if f.name == "value") + + key_python_type = arrow_type_to_python(key_field.type) + value_python_type = arrow_type_to_python(value_field.type) + + return dict[key_python_type, value_python_type] + + # Regular list + element_python_type = arrow_type_to_python(element_type) + + # Check if this is a fixed-size list (homogeneous tuple representation) + if ( + hasattr(arrow_type, "list_size") and arrow_type.list_size > 0 + ) or pa.types.is_fixed_size_list(arrow_type): + # Fixed-size list represents homogeneous tuple + if pa.types.is_fixed_size_list(arrow_type): + size = arrow_type.list_size + else: + size = arrow_type.list_size + return tuple[tuple(element_python_type for _ in range(size))] + else: + # Variable-size list + return list[element_python_type] + + elif pa.types.is_struct(arrow_type): + # Check if this is a heterogeneous tuple representation + field_names = [field.name for field in arrow_type] + + # Tuple pattern: fields named f0, f1, f2, etc. 
+ if all(name.startswith("f") and name[1:].isdigit() for name in field_names): + # Sort by field index to maintain order + sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) + field_types = [arrow_type_to_python(field.type) for field in sorted_fields] + return tuple[tuple(field_types)] + else: + # TODO: Could support NamedTuple or dataclass conversion here + raise TypeError( + f"Cannot convert struct type to Python type hint. " + f"Struct has fields: {field_names}. " + f"Only tuple-like structs (f0, f1, ...) are supported." + ) + + elif pa.types.is_map(arrow_type): + # Handle pa.map_ types (though we prefer list representation) + key_python_type = arrow_type_to_python(arrow_type.key_type) + value_python_type = arrow_type_to_python(arrow_type.item_type) + return dict[key_python_type, value_python_type] + + elif pa.types.is_union(arrow_type): + # Handle union types -> Union[T1, T2, ...] + import typing + + # Get the child types from the union + child_types = [] + for i in range(arrow_type.num_fields): + child_field = arrow_type[i] + child_types.append(arrow_type_to_python(child_field.type)) + + if len(child_types) == 2 and type(None) in child_types: + # This is Optional[T] + non_none_type = next(t for t in child_types if t is not type(None)) + return typing.Optional[non_none_type] + else: + return typing.Union[tuple(child_types)] + + else: + raise TypeError( + f"Cannot convert Arrow type '{arrow_type}' to Python type hint. " + f"Supported types: int, float, str, bool, bytes, list, large_list, fixed_size_list, tuple, dict, struct, map, union. " + f"Arrow type category: {arrow_type}" + ) + + +def parse_type_string(type_string: str): + """ + Parse a type hint from a string representation. + Useful when you have type hints as strings. + + Example: + parse_type_string("list[int]") -> pa.large_list(pa.int64()) + """ + # This is a simplified version - for production use, consider using ast.literal_eval + # or a proper type hint parser + try: + # Try to evaluate the string as a type hint + # Note: This uses eval which can be dangerous - use with trusted input only + import typing + + namespace = { + "list": list, + "tuple": tuple, + "dict": dict, + "int": int, + "str": str, + "float": float, + "bool": bool, + "bytes": bytes, + "Optional": typing.Optional, + "Union": typing.Union, + } + type_hint = eval(type_string, {"__builtins__": {}}, namespace) + return python_type_to_arrow(type_hint) + except Exception as e: + raise ValueError(f"Could not parse type string '{type_string}': {e}") + + +# Helper functions for dict conversion +def dict_to_arrow_list(d: dict) -> list[dict]: + """Convert Python dict to Arrow-compatible list of key-value structs.""" + return [{"key": k, "value": v} for k, v in d.items()] + + +def arrow_list_to_dict(lst: list[dict]) -> dict: + """Convert Arrow list of key-value structs back to Python dict.""" + return {item["key"]: item["value"] for item in lst if item is not None} + + +# Example usage and comprehensive tests +if __name__ == "__main__": + print("=== Complete Python Type Hint ↔ PyArrow Type Converter ===\n") + + # Test basic functionality first + print("Testing basic round-trip:") + try: + # Simple test + python_type = dict[str, int] + arrow_type = python_type_to_arrow(python_type) + recovered_type = arrow_type_to_python(arrow_type) + print(f"✓ {python_type} -> {arrow_type} -> {recovered_type}") + print(f" Match: {recovered_type == python_type}") + except Exception as e: + print(f"✗ Basic test failed: {e}") + + print("\n" + "=" * 60) + print("Testing 
Python -> Arrow conversion:") + + # Test cases for Python -> Arrow + python_to_arrow_tests = [ + # Basic types + (int, pa.int64()), + (str, pa.large_string()), + (float, pa.float64()), + (bool, pa.bool_()), + # Lists (both regular and large) + (list[int], pa.large_list(pa.int64())), + (list[str], pa.large_list(pa.large_string())), + (list[float], pa.large_list(pa.float64())), + # Homogeneous tuples (always use regular fixed-size lists) + (tuple[int, int], pa.list_(pa.int64(), 2)), + (tuple[str, str, str], pa.list_(pa.large_string(), 3)), + # Heterogeneous tuples + (tuple[int, str], pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), + ( + tuple[int, str, float], + pa.struct( + [("f0", pa.int64()), ("f1", pa.large_string()), ("f2", pa.float64())] + ), + ), + # Dict types - using large_list> for Polars compatibility + ( + dict[str, int], + pa.large_list( + pa.struct([("key", pa.large_string()), ("value", pa.int64())]) + ), + ), + ( + dict[int, str], + pa.large_list( + pa.struct([("key", pa.int64()), ("value", pa.large_string())]) + ), + ), + # Nested types + (list[list[int]], pa.large_list(pa.large_list(pa.int64()))), + ( + list[tuple[int, str]], + pa.large_list(pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), + ), + ] + + for python_type, expected_arrow_type in python_to_arrow_tests: + try: + result = python_type_to_arrow(python_type) + success = result == expected_arrow_type + status = "✓" if success else "✗" + print(f"{status} {python_type} -> {result}") + if not success: + print(f" Expected: {expected_arrow_type}") + except Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing Arrow -> Python type conversion:") + + arrow_to_python_tests = [ + # Basic types (both regular and large variants) + (pa.int64(), int), + (pa.string(), str), + (pa.large_string(), str), + (pa.float64(), float), + (pa.bool_(), bool), + (pa.binary(), bytes), + (pa.large_binary(), bytes), + # Lists (both regular and large) + (pa.list_(pa.int64(), -1), list[int]), + (pa.large_list(pa.int64()), list[int]), + (pa.list_(pa.string(), -1), list[str]), + (pa.large_list(pa.large_string()), list[str]), + # Fixed-size lists (homogeneous tuples) + (pa.list_(pa.int64(), 3), tuple[int, int, int]), + (pa.list_(pa.large_string(), 2), tuple[str, str]), + # Dict representation: both regular and large list variants + ( + pa.list_(pa.struct([("key", pa.string()), ("value", pa.int64())]), -1), + dict[str, int], + ), + ( + pa.large_list( + pa.struct([("key", pa.large_string()), ("value", pa.int64())]) + ), + dict[str, int], + ), + ( + pa.list_(pa.struct([("key", pa.int64()), ("value", pa.string())]), -1), + dict[int, str], + ), + ( + pa.large_list( + pa.struct([("key", pa.int64()), ("value", pa.large_string())]) + ), + dict[int, str], + ), + # Heterogeneous tuples: struct + (pa.struct([("f0", pa.int64()), ("f1", pa.string())]), tuple[int, str]), + (pa.struct([("f0", pa.int64()), ("f1", pa.large_string())]), tuple[int, str]), + ( + pa.struct([("f0", pa.int64()), ("f1", pa.string()), ("f2", pa.float64())]), + tuple[int, str, float], + ), + # Maps (if encountered) + (pa.map_(pa.string(), pa.int64()), dict[str, int]), + (pa.map_(pa.large_string(), pa.int64()), dict[str, int]), + # Nested structures + (pa.list_(pa.list_(pa.int64(), -1), -1), list[list[int]]), + (pa.large_list(pa.large_list(pa.int64())), list[list[int]]), + ] + + for arrow_type, expected_python_type in arrow_to_python_tests: + try: + result = arrow_type_to_python(arrow_type) + success = result == 
expected_python_type + status = "✓" if success else "✗" + print(f"{status} {arrow_type} -> {result}") + if not success: + print(f" Expected: {expected_python_type}") + except Exception as e: + print(f"✗ {arrow_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing round-trip conversion:") + + round_trip_tests = [ + dict[str, int], + list[int], + tuple[int, str], + tuple[str, str, str], + list[dict[str, int]], + list[list[str]], + tuple[int, float, bool], + ] + + for python_type in round_trip_tests: + try: + # Python -> Arrow -> Python + arrow_type = python_type_to_arrow(python_type) + recovered_python_type = arrow_type_to_python(arrow_type) + success = recovered_python_type == python_type + status = "✓" if success else "✗" + print(f"{status} {python_type} -> {arrow_type} -> {recovered_python_type}") + if not success: + print(f" Round-trip failed!") + except Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing string parsing:") + + string_tests = [ + "list[int]", + "tuple[int, str]", + "dict[str, int]", + "list[dict[str, float]]", + ] + + for type_str in string_tests: + try: + result = parse_type_string(type_str) + print(f"✓ '{type_str}' -> {result}") + except Exception as e: + print(f"✗ '{type_str}' -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing practical data conversion:") + + # Test actual data conversion + try: + # Create some test data + test_data = [ + {"name": "Alice", "scores": {"math": 95, "english": 87}}, + {"name": "Bob", "scores": {"math": 78, "english": 92}}, + ] + + # Create schema with nested dict using large_list representation + dict_type = python_type_to_arrow(dict[str, int]) + schema = pa.schema([("name", pa.large_string()), ("scores", dict_type)]) + + print(f"Dict type representation: {dict_type}") + + # Convert Python dicts to the expected list format + converted_data = [] + for record in test_data: + converted_record = record.copy() + if "scores" in converted_record: + # Convert dict to list of key-value structs + scores_dict = converted_record["scores"] + converted_record["scores"] = dict_to_arrow_list(scores_dict) + converted_data.append(converted_record) + + # Create Arrow table - need to handle the conversion properly + try: + table = pa.table(converted_data, schema=schema) + except Exception as table_error: + # If direct conversion fails, convert each column separately + print(f" Direct table creation failed: {table_error}") + print(" Trying column-by-column conversion...") + + # Convert each field separately + arrays = [] + for field in schema: + field_name = field.name + field_type = field.type + + # Extract column data + column_data = [record.get(field_name) for record in converted_data] + + # Create array with explicit type + array = pa.array(column_data, type=field_type) + arrays.append(array) + + # Create table from arrays + table = pa.table(arrays, schema=schema) + print(f"✓ Created PyArrow table with large_list representation") + + # Convert back to Python and reconstruct dicts + result_data = table.to_pylist() + for record in result_data: + if "scores" in record and record["scores"]: + # Convert list of key-value structs back to dict + record["scores"] = arrow_list_to_dict(record["scores"]) + + print(f"✓ Round-trip successful: {result_data[0]['scores']}") + + except Exception as e: + print(f"✗ Practical conversion test failed: {e}") + + print(f"\n{'=' * 60}") + print("All tests completed!") From 193374466d6fc95ed0616a94acce1ff3b2c6dc07 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 31 Jul 2025 10:43:00 +0000 Subject: [PATCH 158/224] wip: working implementation of arrow python conversion --- .../semantic_types/python_arrow_types.py | 465 +++++++++++++++++- 1 file changed, 464 insertions(+), 1 deletion(-) diff --git a/src/orcapod/semantic_types/python_arrow_types.py b/src/orcapod/semantic_types/python_arrow_types.py index 4fe7325..71ed7e7 100644 --- a/src/orcapod/semantic_types/python_arrow_types.py +++ b/src/orcapod/semantic_types/python_arrow_types.py @@ -353,7 +353,308 @@ def arrow_list_to_dict(lst: list[dict]) -> dict: return {item["key"]: item["value"] for item in lst if item is not None} -# Example usage and comprehensive tests +def python_dicts_to_arrow_table(data: list[dict], schema: dict[str, type]) -> pa.Table: + """ + Convert list of Python dictionaries to PyArrow table with proper type conversion. + + Args: + data: List of Python dictionaries + schema: Dictionary mapping field names to Python type hints + + Returns: + PyArrow table with proper types + + Examples: + data = [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] + schema = {"x": int, "y": list[int]} + -> PyArrow table with x: int64, y: large_list + + data = [{"name": "Alice", "scores": {"math": 95, "english": 87}}] + schema = {"name": str, "scores": dict[str, int]} + -> PyArrow table with name: large_string, scores: large_list> + """ + if not data: + raise ValueError("Cannot create table from empty data list") + + if not schema: + raise ValueError("Schema cannot be empty") + + # Convert schema to Arrow schema + arrow_fields = [] + for field_name, python_type in schema.items(): + arrow_type = python_type_to_arrow(python_type) + arrow_fields.append(pa.field(field_name, arrow_type)) + + arrow_schema = pa.schema(arrow_fields) + + # Convert data with proper type transformations + converted_data = [] + for record in data: + converted_record = {} + for field_name, python_type in schema.items(): + value = record.get(field_name) + if value is not None: + converted_value = _convert_python_value_for_arrow(value, python_type) + converted_record[field_name] = converted_value + else: + converted_record[field_name] = None + converted_data.append(converted_record) + + # Create table with explicit schema + try: + table = pa.table(converted_data, schema=arrow_schema) + return table + except Exception as e: + # Fallback: create each column separately + arrays = [] + for field in arrow_schema: + field_name = field.name + field_type = field.type + + # Extract column data + column_data = [record.get(field_name) for record in converted_data] + + # Create array with explicit type + array = pa.array(column_data, type=field_type) + arrays.append(array) + + return pa.table(arrays, schema=arrow_schema) + + +def _convert_python_value_for_arrow(value, python_type): + """ + Convert a Python value to Arrow-compatible format based on expected type. 
+ + Args: + value: Python value to convert + python_type: Expected Python type hint + + Returns: + Value in Arrow-compatible format + """ + origin = get_origin(python_type) + args = get_args(python_type) + + # Handle basic types - no conversion needed + if python_type in {int, float, str, bool, bytes} or origin is None: + return value + + # Handle Optional types + if origin is typing.Union and len(args) == 2 and type(None) in args: + if value is None: + return None + non_none_type = args[0] if args[1] is type(None) else args[1] + return _convert_python_value_for_arrow(value, non_none_type) + + # Handle list types + elif origin is list: + if not isinstance(value, (list, tuple)): + raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") + element_type = args[0] + return [_convert_python_value_for_arrow(item, element_type) for item in value] + + # Handle tuple types + elif origin is tuple: + if not isinstance(value, (list, tuple)): + raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") + + if len(set(args)) == 1: + # Homogeneous tuple - convert to list + element_type = args[0] + return [ + _convert_python_value_for_arrow(item, element_type) for item in value + ] + else: + # Heterogeneous tuple - convert to struct dict + if len(value) != len(args): + raise ValueError( + f"Tuple length mismatch: expected {len(args)}, got {len(value)}" + ) + struct_dict = {} + for i, (item, item_type) in enumerate(zip(value, args)): + struct_dict[f"f{i}"] = _convert_python_value_for_arrow(item, item_type) + return struct_dict + + # Handle dict types + elif origin is dict: + if not isinstance(value, dict): + raise TypeError(f"Expected dict for {python_type}, got {type(value)}") + + key_type, value_type = args + # Convert dict to list of key-value structs + key_value_list = [] + for k, v in value.items(): + converted_key = _convert_python_value_for_arrow(k, key_type) + converted_value = _convert_python_value_for_arrow(v, value_type) + key_value_list.append({"key": converted_key, "value": converted_value}) + return key_value_list + + else: + # For unsupported types, return as-is and let Arrow handle it + return value + + +def arrow_table_to_python_dicts(table: pa.Table) -> list[dict]: + """ + Convert PyArrow table back to list of Python dictionaries with proper type conversion. + + Args: + table: PyArrow table to convert + + Returns: + List of Python dictionaries with proper Python types + + Examples: + Arrow table with x: int64, y: large_list + -> [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] + + Arrow table with scores: large_list> + -> [{"name": "Alice", "scores": {"math": 95, "english": 87}}] + """ + # Convert table to list of raw dictionaries + raw_dicts = table.to_pylist() + + # Convert each dictionary with proper type transformations + converted_dicts = [] + for raw_dict in raw_dicts: + converted_dict = {} + for field_name, value in raw_dict.items(): + if value is not None: + # Get the Arrow field type + field = table.schema.field(field_name) + arrow_type = field.type + + # Convert based on Arrow type + converted_value = _convert_arrow_value_to_python(value, arrow_type) + converted_dict[field_name] = converted_value + else: + converted_dict[field_name] = None + converted_dicts.append(converted_dict) + + return converted_dicts + + +def _convert_arrow_value_to_python(value, arrow_type): + """ + Convert Arrow value back to proper Python type. 
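+    This is the inverse of _convert_python_value_for_arrow: key/value struct
+    lists are rebuilt into dicts, fixed-size lists and f0/f1/... structs are
+    rebuilt into tuples, and scalar values pass through unchanged.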
+ + Args: + value: Value from Arrow table (as returned by to_pylist()) + arrow_type: PyArrow type of the field + + Returns: + Value converted to proper Python type + """ + # Handle basic types - no conversion needed + if ( + pa.types.is_integer(arrow_type) + or pa.types.is_floating(arrow_type) + or pa.types.is_boolean(arrow_type) + or pa.types.is_string(arrow_type) + or pa.types.is_large_string(arrow_type) + or pa.types.is_binary(arrow_type) + or pa.types.is_large_binary(arrow_type) + ): + return value + + # Handle list types (including large_list and fixed_size_list) + elif ( + pa.types.is_list(arrow_type) + or pa.types.is_large_list(arrow_type) + or pa.types.is_fixed_size_list(arrow_type) + ): + if value is None: + return None + + element_type = arrow_type.value_type + + # Check if this is a dict representation: list> + if pa.types.is_struct(element_type): + field_names = [field.name for field in element_type] + if set(field_names) == {"key", "value"}: + # This is a dict - convert list of key-value structs to dict + result_dict = {} + for item in value: + if item is not None: + key_field = element_type.field("key") + value_field = element_type.field("value") + + converted_key = _convert_arrow_value_to_python( + item["key"], key_field.type + ) + converted_value = _convert_arrow_value_to_python( + item["value"], value_field.type + ) + result_dict[converted_key] = converted_value + return result_dict + + # Regular list - convert each element + converted_list = [] + for item in value: + converted_item = _convert_arrow_value_to_python(item, element_type) + converted_list.append(converted_item) + + # For fixed-size lists, convert to tuple if all elements are same type + if pa.types.is_fixed_size_list(arrow_type): + return tuple(converted_list) + else: + return converted_list + + # Handle struct types + elif pa.types.is_struct(arrow_type): + if value is None: + return None + + field_names = [field.name for field in arrow_type] + + # Check if this is a tuple representation (f0, f1, f2, ...) 
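+        # e.g. {"f0": 1, "f1": "a"} -> (1, "a")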
+ if all(name.startswith("f") and name[1:].isdigit() for name in field_names): + # Convert struct to tuple + sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) + tuple_values = [] + for field in sorted_fields: + field_value = value.get(field.name) + converted_value = _convert_arrow_value_to_python( + field_value, field.type + ) + tuple_values.append(converted_value) + return tuple(tuple_values) + else: + # Regular struct - convert each field + converted_struct = {} + for field in arrow_type: + field_name = field.name + field_value = value.get(field_name) + converted_value = _convert_arrow_value_to_python( + field_value, field.type + ) + converted_struct[field_name] = converted_value + return converted_struct + + # Handle map types + elif pa.types.is_map(arrow_type): + if value is None: + return None + + # Maps are returned as list of {'key': k, 'value': v} dicts + result_dict = {} + key_type = arrow_type.key_type + item_type = arrow_type.item_type + + for item in value: + if item is not None: + converted_key = _convert_arrow_value_to_python(item["key"], key_type) + converted_value = _convert_arrow_value_to_python( + item["value"], item_type + ) + result_dict[converted_key] = converted_value + return result_dict + + else: + # For unsupported types, return as-is + return value + + if __name__ == "__main__": print("=== Complete Python Type Hint ↔ PyArrow Type Converter ===\n") @@ -369,6 +670,129 @@ def arrow_list_to_dict(lst: list[dict]) -> dict: except Exception as e: print(f"✗ Basic test failed: {e}") + print("\n" + "=" * 60) + print("Testing complex nested structures:") + + complex_nested_tests = [ + # Nested dictionaries + ( + dict[str, dict[str, int]], + pa.large_list( + pa.struct( + [ + ("key", pa.large_string()), + ( + "value", + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.int64())] + ) + ), + ), + ] + ) + ), + ), + # Mixed complex types in tuples + ( + tuple[dict[str, int], list[str]], + pa.struct( + [ + ( + "f0", + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.int64())] + ) + ), + ), + ("f1", pa.large_list(pa.large_string())), + ] + ), + ), + # Complex value types in dicts + ( + dict[str, list[int]], + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.large_list(pa.int64()))] + ) + ), + ), + # Triple nesting + ( + list[dict[str, list[int]]], + pa.large_list( + pa.large_list( + pa.struct( + [ + ("key", pa.large_string()), + ("value", pa.large_list(pa.int64())), + ] + ) + ) + ), + ), + # Complex tuple with nested structures + ( + tuple[list[int], dict[str, float], str], + pa.struct( + [ + ("f0", pa.large_list(pa.int64())), + ( + "f1", + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.float64())] + ) + ), + ), + ("f2", pa.large_string()), + ] + ), + ), + ] + + for python_type, expected_arrow_type in complex_nested_tests: + try: + result = python_type_to_arrow(python_type) + success = result == expected_arrow_type + status = "✓" if success else "✗" + print(f"{status} {python_type}") + print(f" -> {result}") + if not success: + print(f" Expected: {expected_arrow_type}") + except Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing complex nested round-trips:") + + complex_round_trip_tests = [ + dict[str, dict[str, int]], + tuple[dict[str, int], list[str]], + dict[str, list[int]], + list[dict[str, list[int]]], + tuple[list[int], dict[str, float], str], + dict[str, tuple[int, str]], + list[tuple[dict[str, int], list[str]]], + ] + 
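+    # Each hint is mapped Python -> Arrow -> Python; the recovered hint must
+    # compare equal to the original for the round-trip to pass.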
+ for python_type in complex_round_trip_tests: + try: + # Python -> Arrow -> Python + arrow_type = python_type_to_arrow(python_type) + recovered_python_type = arrow_type_to_python(arrow_type) + success = recovered_python_type == python_type + status = "✓" if success else "✗" + print(f"{status} {python_type}") + print(f" -> {arrow_type}") + print(f" -> {recovered_python_type}") + if not success: + print(f" Round-trip failed!") + except Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + print("\n" + "=" * 60) print("Testing Python -> Arrow conversion:") @@ -600,5 +1024,44 @@ def arrow_list_to_dict(lst: list[dict]) -> dict: except Exception as e: print(f"✗ Practical conversion test failed: {e}") + print("Testing edge cases and limitations:") + + edge_case_tests = [ + # Complex key types - these are challenging but let's see what happens + "dict[tuple[str, int], str]", # tuple keys + "dict[str, dict[int, list[str]]]", # deeply nested + "Optional[dict[str, int]]", # optional complex types + ] + + for type_str in edge_case_tests: + try: + # Parse and convert + namespace = { + "list": list, + "tuple": tuple, + "dict": dict, + "int": int, + "str": str, + "float": float, + "bool": bool, + "bytes": bytes, + "Optional": typing.Optional, + "Union": typing.Union, + } + python_type = eval(type_str, {"__builtins__": {}}, namespace) + arrow_type = python_type_to_arrow(python_type) + recovered_type = arrow_type_to_python(arrow_type) + + success = recovered_type == python_type + status = "✓" if success else "⚠" + print(f"{status} {type_str}") + print(f" -> {arrow_type}") + print(f" -> {recovered_type}") + if not success: + print(f" Note: Complex key types may have limitations") + + except Exception as e: + print(f"✗ {type_str} -> ERROR: {e}") + print(f"\n{'=' * 60}") print("All tests completed!") From f87d313cecb048d507e426961041f13b592523d7 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 31 Jul 2025 19:15:26 +0000 Subject: [PATCH 159/224] feat: working complete arrow python converters --- src/orcapod/semantic_types/__init__.py | 0 .../semantic_types/complete_converter.py | 1317 +++++++++++++++++ .../semantic_types/complete_converter_test.py | 628 ++++++++ src/orcapod/semantic_types/schemas.py | 357 +++++ .../semantic_types/struct_converters.py | 307 ++++ .../semantic_types/table_converters.py | 318 ++++ 6 files changed, 2927 insertions(+) create mode 100644 src/orcapod/semantic_types/__init__.py create mode 100644 src/orcapod/semantic_types/complete_converter.py create mode 100644 src/orcapod/semantic_types/complete_converter_test.py create mode 100644 src/orcapod/semantic_types/schemas.py create mode 100644 src/orcapod/semantic_types/struct_converters.py create mode 100644 src/orcapod/semantic_types/table_converters.py diff --git a/src/orcapod/semantic_types/__init__.py b/src/orcapod/semantic_types/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/semantic_types/complete_converter.py b/src/orcapod/semantic_types/complete_converter.py new file mode 100644 index 0000000..f4177d9 --- /dev/null +++ b/src/orcapod/semantic_types/complete_converter.py @@ -0,0 +1,1317 @@ +import pyarrow as pa +from typing import get_origin, get_args, Any +import typing +from collections.abc import Collection, Sequence, Mapping, Iterable, Set +import sys +from orcapod.semantic_types.struct_converters import SemanticTypeRegistry + +# Basic type mapping for Python -> Arrow conversion +_PYTHON_TO_ARROW_MAP = { + # Python built-ins + int: pa.int64(), + float: pa.float64(), + str: pa.large_string(), # Use large_string by default for Polars compatibility + bool: pa.bool_(), + bytes: pa.large_binary(), # Use large_binary by default for Polars compatibility + # String representations (for when we get type names as strings) + "int": pa.int64(), + "float": pa.float64(), + "str": pa.large_string(), + "bool": pa.bool_(), + "bytes": pa.large_binary(), + # Specific integer types + "int8": pa.int8(), + "int16": pa.int16(), + "int32": pa.int32(), + "int64": pa.int64(), + "uint8": pa.uint8(), + "uint16": pa.uint16(), + "uint32": pa.uint32(), + "uint64": pa.uint64(), + # Specific float types + "float32": pa.float32(), + "float64": pa.float64(), + # Date/time types + "date": pa.date32(), + "datetime": pa.timestamp("us"), + "timestamp": pa.timestamp("us"), +} + +# Reverse mapping for Arrow -> Python conversion (handles both regular and large variants) +_ARROW_TO_PYTHON_MAP = { + # Integer types + pa.int8(): int, + pa.int16(): int, + pa.int32(): int, + pa.int64(): int, + pa.uint8(): int, + pa.uint16(): int, + pa.uint32(): int, + pa.uint64(): int, + # Float types + pa.float32(): float, + pa.float64(): float, + # String types (both regular and large) + pa.string(): str, + pa.large_string(): str, + # Boolean + pa.bool_(): bool, + # Binary types (both regular and large) + pa.binary(): bytes, + pa.large_binary(): bytes, +} + +# Add numpy types if available +try: + import numpy as np + + _PYTHON_TO_ARROW_MAP.update( + { + np.int8: pa.int8(), + np.int16: pa.int16(), + np.int32: pa.int32(), + np.int64: pa.int64(), + np.uint8: pa.uint8(), + np.uint16: pa.uint16(), + np.uint32: pa.uint32(), + np.uint64: pa.uint64(), + np.float32: pa.float32(), + np.float64: pa.float64(), + np.bool_: pa.bool_(), + } + ) +except ImportError: + pass + + +def python_type_to_arrow( + type_hint, semantic_registry: SemanticTypeRegistry | None = None +) -> pa.DataType: + """ + Convert Python type hints 
to PyArrow data types. + + Args: + type_hint: Python type hint to convert + semantic_registry: Optional semantic type registry to check for semantic types + + Examples: + list[int] -> pa.large_list(pa.int64()) + tuple[int, int] -> pa.list_(pa.int64(), 2) + tuple[int, str] -> pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) + dict[str, int] -> pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) + """ + + # Handle basic types first + if type_hint in _PYTHON_TO_ARROW_MAP: + return _PYTHON_TO_ARROW_MAP[type_hint] + + # Check if this is a registered semantic type + if semantic_registry is not None: + converter = semantic_registry.get_converter_for_python_type(type_hint) + if converter: + return converter.arrow_struct_type + + # Get the origin (e.g., list, tuple, dict) and args (e.g., int, str) + origin = get_origin(type_hint) + args = get_args(type_hint) + + if origin is None: + # Handle non-generic types that might not be in basic map + if hasattr(type_hint, "__name__"): + type_name = type_hint.__name__ + if type_name in _PYTHON_TO_ARROW_MAP: + return _PYTHON_TO_ARROW_MAP[type_name] + raise ValueError(f"Unsupported type: {type_hint}") + + # Handle list types + if origin is list: + if len(args) != 1: + raise ValueError( + f"list type must have exactly one type argument, got: {args}" + ) + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.large_list(element_type) # Use large_list for Polars compatibility + + # Handle tuple types + elif origin is tuple: + if len(args) == 0: + raise ValueError("Empty tuple type not supported") + + # Check if all elements are the same type + if len(set(args)) == 1: + # Homogeneous tuple: tuple[int, int, int] -> fixed-size list + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.list_( + element_type, len(args) + ) # Fixed-size lists are always regular lists + else: + # Heterogeneous tuple: tuple[int, str] -> struct with indexed fields + fields = [] + for i, arg_type in enumerate(args): + field_type = python_type_to_arrow(arg_type, semantic_registry) + fields.append((f"f{i}", field_type)) + return pa.struct(fields) + + # Handle dict types + elif origin is dict: + if len(args) != 2: + raise ValueError( + f"dict type must have exactly two type arguments, got: {args}" + ) + key_type = python_type_to_arrow(args[0], semantic_registry) + value_type = python_type_to_arrow(args[1], semantic_registry) + + # Use large_list> representation for better compatibility + # This works reliably across Arrow, Polars, Parquet, etc. 
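+        # e.g. dict[str, int] -> large_list<struct<key: large_string, value: int64>>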
+ key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) + return pa.large_list(key_value_struct) + + # Handle abstract base classes and collections + elif origin in {Collection, Sequence, Iterable}: + # Treat as list - most common concrete implementation + if len(args) != 1: + raise ValueError( + f"{origin.__name__} type must have exactly one type argument, got: {args}" + ) + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.large_list(element_type) + + elif origin is Set or origin is set: + # Sets -> lists (Arrow doesn't have native set type) + if len(args) != 1: + raise ValueError( + f"set type must have exactly one type argument, got: {args}" + ) + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.large_list(element_type) + + elif origin is Mapping: + # Mapping -> dict representation + if len(args) != 2: + raise ValueError( + f"Mapping type must have exactly two type arguments, got: {args}" + ) + key_type = python_type_to_arrow(args[0], semantic_registry) + value_type = python_type_to_arrow(args[1], semantic_registry) + key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) + return pa.large_list(key_value_struct) + elif origin is typing.Union: + # Handle Optional[T] which is Union[T, NoneType] + if len(args) == 2 and type(None) in args: + # This is Optional[T] + non_none_type = args[0] if args[1] is type(None) else args[1] + base_type = python_type_to_arrow(non_none_type, semantic_registry) + # PyArrow handles nullability at the field level, so we just return the base type + return base_type + else: + # Complex unions - convert to a union type + union_types = [python_type_to_arrow(arg, semantic_registry) for arg in args] + # PyArrow union types are complex - for now, just use the first type as fallback + # TODO: Implement proper union support when needed + return union_types[0] # Simplified - take first type + + else: + raise ValueError(f"Unsupported generic type: {origin}") + + +def arrow_type_to_python(arrow_type: pa.DataType) -> type: + """ + Convert PyArrow data types back to Python type hints. 
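+    Regular and large variants (string/large_string, binary/large_binary,
+    list/large_list) map to the same Python types.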
+ + Args: + arrow_type: PyArrow data type to convert + + Returns: + Python type annotation + + Examples: + pa.int64() -> int + pa.large_list(pa.large_string()) -> list[str] + pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) -> dict[str, int] + pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) -> tuple[int, str] + + Raises: + TypeError: If the Arrow type cannot be converted to a Python type + """ + + # Handle basic types + if arrow_type in _ARROW_TO_PYTHON_MAP: + return _ARROW_TO_PYTHON_MAP[arrow_type] + + # Check by Arrow type categories + if pa.types.is_integer(arrow_type): + return int + elif pa.types.is_floating(arrow_type): + return float + elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): + return str + elif pa.types.is_boolean(arrow_type): + return bool + elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): + return bytes + + # Handle complex types + elif ( + pa.types.is_list(arrow_type) + or pa.types.is_large_list(arrow_type) + or pa.types.is_fixed_size_list(arrow_type) + ): + element_type = arrow_type.value_type + + # Check if this is a dict representation: list> + if pa.types.is_struct(element_type): + field_names = [field.name for field in element_type] + + # Dict pattern: must have exactly 'key' and 'value' fields + if set(field_names) == {"key", "value"}: + # Find key and value types + key_field = next(f for f in element_type if f.name == "key") + value_field = next(f for f in element_type if f.name == "value") + + key_python_type = arrow_type_to_python(key_field.type) + value_python_type = arrow_type_to_python(value_field.type) + + return dict[key_python_type, value_python_type] + + # Regular list + element_python_type = arrow_type_to_python(element_type) + + # Check if this is a fixed-size list (homogeneous tuple representation) + if pa.types.is_fixed_size_list(arrow_type): + # Fixed-size list -> homogeneous tuple + size = arrow_type.list_size + return tuple[tuple(element_python_type for _ in range(size))] + else: + # Variable-size list -> list + return list[element_python_type] + + elif pa.types.is_struct(arrow_type): + # Check if this is a heterogeneous tuple representation + field_names = [field.name for field in arrow_type] + + # Tuple pattern: fields named f0, f1, f2, etc. + if all(name.startswith("f") and name[1:].isdigit() for name in field_names): + # Sort by field index to maintain order + sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) + field_types = [arrow_type_to_python(field.type) for field in sorted_fields] + return tuple[tuple(field_types)] + else: + # TODO: Could support NamedTuple or dataclass conversion here + raise TypeError( + f"Cannot convert struct type to Python type hint. " + f"Struct has fields: {field_names}. " + f"Only tuple-like structs (f0, f1, ...) are supported." + ) + + elif pa.types.is_map(arrow_type): + # Handle pa.map_ types (though we prefer list representation) + key_python_type = arrow_type_to_python(arrow_type.key_type) + value_python_type = arrow_type_to_python(arrow_type.item_type) + return dict[key_python_type, value_python_type] + + elif pa.types.is_union(arrow_type): + # Handle union types -> Union[T1, T2, ...] 
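+        # e.g. a union with int64 and large_string children is recovered as Union[int, str]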
+ import typing + + # Get the child types from the union + child_types = [] + for i in range(arrow_type.num_fields): + child_field = arrow_type[i] + child_types.append(arrow_type_to_python(child_field.type)) + + if len(child_types) == 2 and type(None) in child_types: + # This is Optional[T] + non_none_type = next(t for t in child_types if t is not type(None)) + return typing.Optional[non_none_type] + else: + return typing.Union[tuple(child_types)] + + else: + raise TypeError( + f"Cannot convert Arrow type '{arrow_type}' to Python type hint. " + f"Supported types: int, float, str, bool, bytes, list, large_list, fixed_size_list, tuple, dict, struct, map, union. " + f"Arrow type category: {arrow_type}" + ) + + +def parse_type_string(type_string: str): + """ + Parse a type hint from a string representation. + Useful when you have type hints as strings. + + Example: + parse_type_string("list[int]") -> pa.large_list(pa.int64()) + """ + # This is a simplified version - for production use, consider using ast.literal_eval + # or a proper type hint parser + try: + # Try to evaluate the string as a type hint + # Note: This uses eval which can be dangerous - use with trusted input only + import typing + + namespace = { + "list": list, + "tuple": tuple, + "dict": dict, + "int": int, + "str": str, + "float": float, + "bool": bool, + "bytes": bytes, + "Optional": typing.Optional, + "Union": typing.Union, + } + type_hint = eval(type_string, {"__builtins__": {}}, namespace) + return python_type_to_arrow(type_hint) + except Exception as e: + raise ValueError(f"Could not parse type string '{type_string}': {e}") + + +def infer_schema_from_data( + data: list[dict], semantic_registry: SemanticTypeRegistry | None = None +) -> dict[str, type]: + """ + Infer schema from sample data (best effort). + + Args: + data: List of sample dictionaries + semantic_registry: Optional semantic type registry for detecting semantic types + + Returns: + Dictionary mapping field names to inferred Python types + + Note: This is best-effort inference and may not handle all edge cases. + For production use, explicit schemas are recommended. 
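+
+    Example:
+        infer_schema_from_data([{"x": 1, "tags": ["a", "b"]}])
+        -> {"x": int, "tags": list[str]}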
+ """ + if not data: + return {} + + schema = {} + + # Get all possible field names + all_fields = set() + for record in data: + all_fields.update(record.keys()) + + # Infer type for each field + for field_name in all_fields: + field_values = [ + record.get(field_name) + for record in data + if field_name in record and record[field_name] is not None + ] + + if not field_values: + schema[field_name] = str # Default fallback instead of Any + continue + + # Get types of all values + value_types = {type(v) for v in field_values} + + if len(value_types) == 1: + # All values have same type + value_type = next(iter(value_types)) + + # Check if this is a semantic type first + if semantic_registry: + converter = semantic_registry.get_converter_for_python_type(value_type) + if converter: + schema[field_name] = value_type + continue + + # For containers, try to infer element types + if value_type is list and field_values: + # Infer list element type from first non-empty list + for lst in field_values: + if lst: # non-empty list + element_types = {type(elem) for elem in lst} + if len(element_types) == 1: + element_type = next(iter(element_types)) + schema[field_name] = list[element_type] + else: + # Mixed types - use str as fallback instead of Any + schema[field_name] = list[str] + break + else: + schema[field_name] = list[str] # Default fallback instead of Any + + elif value_type in {set, frozenset} and field_values: + # Infer set element type from first non-empty set + for s in field_values: + if s: # non-empty set + element_types = {type(elem) for elem in s} + if len(element_types) == 1: + element_type = next(iter(element_types)) + schema[field_name] = set[element_type] + else: + schema[field_name] = set[ + str + ] # Mixed types - fallback to str + break + else: + schema[field_name] = set[str] # All sets empty - fallback to str + + elif value_type is dict and field_values: + # Infer dict types from first non-empty dict + for d in field_values: + if d: # non-empty dict + key_types = {type(k) for k in d.keys()} + value_types = {type(v) for v in d.values()} + + if len(key_types) == 1 and len(value_types) == 1: + key_type = next(iter(key_types)) + val_type = next(iter(value_types)) + schema[field_name] = dict[key_type, val_type] + else: + # Mixed types - use most common types or fallback to str + key_type = ( + str if str in key_types else next(iter(key_types)) + ) + val_type = ( + str if str in value_types else next(iter(value_types)) + ) + schema[field_name] = dict[key_type, val_type] + break + else: + schema[field_name] = dict[ + str, str + ] # Default fallback instead of Any + + else: + schema[field_name] = value_type + + else: + # Mixed types - use str as fallback instead of Any + schema[field_name] = str + + return schema + + +def arrow_list_to_set(lst: list) -> set: + """Convert Arrow list back to Python set (removes duplicates).""" + return set(lst) if lst is not None else set() + + +def dict_to_arrow_list(d: dict) -> list[dict]: + """Convert Python dict to Arrow-compatible list of key-value structs.""" + return [{"key": k, "value": v} for k, v in d.items()] + + +def arrow_list_to_dict(lst: list[dict]) -> dict: + """Convert Arrow list of key-value structs back to Python dict.""" + return {item["key"]: item["value"] for item in lst if item is not None} + + +def python_dicts_to_arrow_table( + data: list[dict], + schema: dict[str, type] | None = None, + semantic_registry: SemanticTypeRegistry | None = None, +) -> pa.Table: + """ + Convert list of Python dictionaries to PyArrow table with proper 
type conversion. + + Args: + data: List of Python dictionaries + schema: Dictionary mapping field names to Python type hints (optional) + semantic_registry: Optional semantic type registry for complex Python objects + + Returns: + PyArrow table with proper types + + Examples: + # Basic usage + data = [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] + schema = {"x": int, "y": list[int]} + + # With semantic types + from pathlib import Path + data = [{"name": "Alice", "file": Path("/home/alice/data.csv")}] + schema = {"name": str, "file": Path} + table = python_dicts_to_arrow_table(data, schema, semantic_registry) + """ + if not data: + raise ValueError("Cannot create table from empty data list") + + # Auto-infer schema if not provided + if schema is None: + schema = infer_schema_from_data(data, semantic_registry) + print(f"Auto-inferred schema: {schema}") + + if not schema: + raise ValueError("Schema cannot be empty (and could not be inferred)") + + # Convert schema to Arrow schema (with semantic type support) + arrow_fields = [] + for field_name, python_type in schema.items(): + arrow_type = python_type_to_arrow(python_type, semantic_registry) + arrow_fields.append(pa.field(field_name, arrow_type)) + + arrow_schema = pa.schema(arrow_fields) + + # Convert data with proper type transformations (with semantic type support) + converted_data = [] + for record in data: + converted_record = {} + for field_name, python_type in schema.items(): + value = record.get(field_name) + if value is not None: + converted_value = _convert_python_value_for_arrow( + value, python_type, semantic_registry + ) + converted_record[field_name] = converted_value + else: + converted_record[field_name] = None + converted_data.append(converted_record) + + # Create table with explicit schema + try: + table = pa.table(converted_data, schema=arrow_schema) + return table + except Exception as e: + # Fallback: create each column separately + arrays = [] + for field in arrow_schema: + field_name = field.name + field_type = field.type + + # Extract column data + column_data = [record.get(field_name) for record in converted_data] + + # Create array with explicit type + array = pa.array(column_data, type=field_type) + arrays.append(array) + + return pa.table(arrays, schema=arrow_schema) + + +def _convert_python_value_for_arrow( + value, python_type, semantic_registry: SemanticTypeRegistry | None = None +): + """ + Convert a Python value to Arrow-compatible format based on expected type. 
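+    Values whose type is registered as a semantic type are converted to their
+    struct-dict form first; sets are serialized as sorted lists when their
+    elements are comparable, and other containers are handled recursively.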
+ + Args: + value: Python value to convert + python_type: Expected Python type hint + semantic_registry: Optional semantic type registry + + Returns: + Value in Arrow-compatible format + """ + # First, check if this is a semantic type + if semantic_registry and hasattr( + semantic_registry, "get_converter_for_python_type" + ): + converter = semantic_registry.get_converter_for_python_type(python_type) + if converter: + # Convert using semantic type converter + return converter.python_to_struct_dict(value) + + # Fall back to standard type conversion + origin = get_origin(python_type) + args = get_args(python_type) + + # Handle basic types - no conversion needed + if python_type in {int, float, str, bool, bytes} or origin is None: + return value + + # Handle Optional types + if origin is typing.Union and len(args) == 2 and type(None) in args: + if value is None: + return None + non_none_type = args[0] if args[1] is type(None) else args[1] + return _convert_python_value_for_arrow(value, non_none_type, semantic_registry) + + # Handle abstract collections + elif origin is list or origin in {Collection, Sequence, Iterable}: + if not isinstance(value, (list, tuple)): + raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") + element_type = args[0] if args else Any + return [ + _convert_python_value_for_arrow(item, element_type, semantic_registry) + for item in value + ] + + # Handle set types + elif origin is set or origin is Set: + if not isinstance(value, (set, frozenset, list, tuple)): + raise TypeError( + f"Expected set/list/tuple for {python_type}, got {type(value)}" + ) + element_type = args[0] if args else Any + + # Convert set to sorted list for deterministic ordering + if isinstance(value, (set, frozenset)): + try: + # Sort if elements are comparable + value_list = sorted(value) + except TypeError: + # If elements aren't comparable (e.g., mixed types), convert to list as-is + # This maintains some order but isn't guaranteed to be deterministic + value_list = list(value) + else: + # Already a list/tuple, keep as-is + value_list = list(value) + + return [ + _convert_python_value_for_arrow(item, element_type, semantic_registry) + for item in value_list + ] + + # Handle mapping types + elif origin is dict or origin is Mapping: + if not isinstance(value, dict): + raise TypeError(f"Expected dict for {python_type}, got {type(value)}") + + key_type, value_type = (args[0], args[1]) if len(args) >= 2 else (Any, Any) + # Convert dict to list of key-value structs + key_value_list = [] + for k, v in value.items(): + converted_key = _convert_python_value_for_arrow( + k, key_type, semantic_registry + ) + converted_value = _convert_python_value_for_arrow( + v, value_type, semantic_registry + ) + key_value_list.append({"key": converted_key, "value": converted_value}) + return key_value_list + + # Handle tuple types + elif origin is tuple: + if not isinstance(value, (list, tuple)): + raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") + + if len(set(args)) == 1: + # Homogeneous tuple - convert to list + element_type = args[0] + return [ + _convert_python_value_for_arrow(item, element_type, semantic_registry) + for item in value + ] + else: + # Heterogeneous tuple - convert to struct dict + if len(value) != len(args): + raise ValueError( + f"Tuple length mismatch: expected {len(args)}, got {len(value)}" + ) + struct_dict = {} + for i, (item, item_type) in enumerate(zip(value, args)): + struct_dict[f"f{i}"] = _convert_python_value_for_arrow( + item, item_type, 
semantic_registry + ) + return struct_dict + + # Handle dict types + elif origin is dict: + if not isinstance(value, dict): + raise TypeError(f"Expected dict for {python_type}, got {type(value)}") + + key_type, value_type = args + # Convert dict to list of key-value structs + key_value_list = [] + for k, v in value.items(): + converted_key = _convert_python_value_for_arrow( + k, key_type, semantic_registry + ) + converted_value = _convert_python_value_for_arrow( + v, value_type, semantic_registry + ) + key_value_list.append({"key": converted_key, "value": converted_value}) + return key_value_list + + else: + # For unsupported types, return as-is and let Arrow handle it + return value + + +def arrow_table_to_python_dicts( + table: pa.Table, semantic_registry: SemanticTypeRegistry | None = None +) -> list[dict]: + """ + Convert PyArrow table back to list of Python dictionaries with proper type conversion. + + Args: + table: PyArrow table to convert + semantic_registry: Optional semantic type registry for complex Python objects + + Returns: + List of Python dictionaries with proper Python types + + Examples: + Arrow table with x: int64, y: large_list + -> [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] + + Arrow table with semantic types (Path stored as struct) + -> [{"name": "Alice", "file": Path("/home/alice/data.csv")}] + """ + # Convert table to list of raw dictionaries + raw_dicts = table.to_pylist() + + # Convert each dictionary with proper type transformations + converted_dicts = [] + for raw_dict in raw_dicts: + converted_dict = {} + for field_name, value in raw_dict.items(): + if value is not None: + # Get the Arrow field type + field = table.schema.field(field_name) + arrow_type = field.type + + # Convert based on Arrow type (with semantic type support) + converted_value = _convert_arrow_value_to_python( + value, arrow_type, semantic_registry + ) + converted_dict[field_name] = converted_value + else: + converted_dict[field_name] = None + converted_dicts.append(converted_dict) + + return converted_dicts + + +def _convert_arrow_value_to_python( + value, arrow_type, semantic_registry: SemanticTypeRegistry | None = None +): + """ + Convert Arrow value back to proper Python type. 
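+    Struct values that match a registered semantic type are rebuilt through
+    the registry; all other values fall back to the generic container rules.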
+ + Args: + value: Value from Arrow table (as returned by to_pylist()) + arrow_type: PyArrow type of the field + semantic_registry: Optional semantic type registry + + Returns: + Value converted to proper Python type + """ + # First, check if this is a semantic struct type + if semantic_registry and pa.types.is_struct(arrow_type): + converter = semantic_registry.get_converter_for_struct_type(arrow_type) + if converter and isinstance(value, dict): + # Convert using semantic type converter + return converter.struct_dict_to_python(value) + + # Fall back to standard type conversion + # Handle basic types - no conversion needed + if ( + pa.types.is_integer(arrow_type) + or pa.types.is_floating(arrow_type) + or pa.types.is_boolean(arrow_type) + or pa.types.is_string(arrow_type) + or pa.types.is_large_string(arrow_type) + or pa.types.is_binary(arrow_type) + or pa.types.is_large_binary(arrow_type) + ): + return value + + # Handle list types (including large_list and fixed_size_list) + elif ( + pa.types.is_list(arrow_type) + or pa.types.is_large_list(arrow_type) + or pa.types.is_fixed_size_list(arrow_type) + ): + if value is None: + return None + + element_type = arrow_type.value_type + + # Check if this is a dict representation: list> + if pa.types.is_struct(element_type): + field_names = [field.name for field in element_type] + if set(field_names) == {"key", "value"}: + # This is a dict - convert list of key-value structs to dict + result_dict = {} + for item in value: + if item is not None: + key_field = element_type.field("key") + value_field = element_type.field("value") + + converted_key = _convert_arrow_value_to_python( + item["key"], key_field.type, semantic_registry + ) + converted_value = _convert_arrow_value_to_python( + item["value"], value_field.type, semantic_registry + ) + result_dict[converted_key] = converted_value + return result_dict + + # Regular list - convert each element + converted_list = [] + for item in value: + converted_item = _convert_arrow_value_to_python( + item, element_type, semantic_registry + ) + converted_list.append(converted_item) + + # For fixed-size lists, convert to tuple if all elements are same type + if pa.types.is_fixed_size_list(arrow_type): + return tuple(converted_list) + else: + return converted_list + + # Handle struct types + elif pa.types.is_struct(arrow_type): + if value is None: + return None + + field_names = [field.name for field in arrow_type] + + # Check if this is a tuple representation (f0, f1, f2, ...) 
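+        # e.g. {"f0": 1, "f1": "a", "f2": 2.0} -> (1, "a", 2.0)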
+ if all(name.startswith("f") and name[1:].isdigit() for name in field_names): + # Convert struct to tuple + sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) + tuple_values = [] + for field in sorted_fields: + field_value = value.get(field.name) + converted_value = _convert_arrow_value_to_python( + field_value, field.type, semantic_registry + ) + tuple_values.append(converted_value) + return tuple(tuple_values) + else: + # Regular struct - convert each field (could be semantic type handled above) + converted_struct = {} + for field in arrow_type: + field_name = field.name + field_value = value.get(field_name) + converted_value = _convert_arrow_value_to_python( + field_value, field.type, semantic_registry + ) + converted_struct[field_name] = converted_value + return converted_struct + + # Handle map types + elif pa.types.is_map(arrow_type): + if value is None: + return None + + # Maps are returned as list of {'key': k, 'value': v} dicts + result_dict = {} + key_type = arrow_type.key_type + item_type = arrow_type.item_type + + for item in value: + if item is not None: + converted_key = _convert_arrow_value_to_python( + item["key"], key_type, semantic_registry + ) + converted_value = _convert_arrow_value_to_python( + item["value"], item_type, semantic_registry + ) + result_dict[converted_key] = converted_value + return result_dict + + else: + # For unsupported types, return as-is + return value + + +if __name__ == "__main__": + print("=== Complete Python Type Hint ↔ PyArrow Type Converter ===\n") + + # Test basic functionality first + print("Testing basic round-trip:") + try: + # Simple test + python_type = dict[str, int] + arrow_type = python_type_to_arrow(python_type) + recovered_type = arrow_type_to_python(arrow_type) + print(f"✓ {python_type} -> {arrow_type} -> {recovered_type}") + print(f" Match: {recovered_type == python_type}") + except Exception as e: + print(f"✗ Basic test failed: {e}") + + print("\n" + "=" * 60) + print("Testing complex nested structures:") + + complex_nested_tests = [ + # Nested dictionaries + ( + dict[str, dict[str, int]], + pa.large_list( + pa.struct( + [ + ("key", pa.large_string()), + ( + "value", + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.int64())] + ) + ), + ), + ] + ) + ), + ), + # Mixed complex types in tuples + ( + tuple[dict[str, int], list[str]], + pa.struct( + [ + ( + "f0", + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.int64())] + ) + ), + ), + ("f1", pa.large_list(pa.large_string())), + ] + ), + ), + # Complex value types in dicts + ( + dict[str, list[int]], + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.large_list(pa.int64()))] + ) + ), + ), + # Triple nesting + ( + list[dict[str, list[int]]], + pa.large_list( + pa.large_list( + pa.struct( + [ + ("key", pa.large_string()), + ("value", pa.large_list(pa.int64())), + ] + ) + ) + ), + ), + # Complex tuple with nested structures + ( + tuple[list[int], dict[str, float], str], + pa.struct( + [ + ("f0", pa.large_list(pa.int64())), + ( + "f1", + pa.large_list( + pa.struct( + [("key", pa.large_string()), ("value", pa.float64())] + ) + ), + ), + ("f2", pa.large_string()), + ] + ), + ), + ] + + for python_type, expected_arrow_type in complex_nested_tests: + try: + result = python_type_to_arrow(python_type) + success = result == expected_arrow_type + status = "✓" if success else "✗" + print(f"{status} {python_type}") + print(f" -> {result}") + if not success: + print(f" Expected: {expected_arrow_type}") + except 
Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing complex nested round-trips:") + + complex_round_trip_tests = [ + dict[str, dict[str, int]], + tuple[dict[str, int], list[str]], + dict[str, list[int]], + list[dict[str, list[int]]], + tuple[list[int], dict[str, float], str], + dict[str, tuple[int, str]], + list[tuple[dict[str, int], list[str]]], + ] + + for python_type in complex_round_trip_tests: + try: + # Python -> Arrow -> Python + arrow_type = python_type_to_arrow(python_type) + recovered_python_type = arrow_type_to_python(arrow_type) + success = recovered_python_type == python_type + status = "✓" if success else "✗" + print(f"{status} {python_type}") + print(f" -> {arrow_type}") + print(f" -> {recovered_python_type}") + if not success: + print(f" Round-trip failed!") + except Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing Python -> Arrow conversion:") + + # Test cases for Python -> Arrow + python_to_arrow_tests = [ + # Basic types + (int, pa.int64()), + (str, pa.large_string()), + (float, pa.float64()), + (bool, pa.bool_()), + # Lists (both regular and large) + (list[int], pa.large_list(pa.int64())), + (list[str], pa.large_list(pa.large_string())), + (list[float], pa.large_list(pa.float64())), + # Homogeneous tuples (always use regular fixed-size lists) + (tuple[int, int], pa.list_(pa.int64(), 2)), + (tuple[str, str, str], pa.list_(pa.large_string(), 3)), + # Heterogeneous tuples + (tuple[int, str], pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), + ( + tuple[int, str, float], + pa.struct( + [("f0", pa.int64()), ("f1", pa.large_string()), ("f2", pa.float64())] + ), + ), + # Dict types - using large_list> for Polars compatibility + ( + dict[str, int], + pa.large_list( + pa.struct([("key", pa.large_string()), ("value", pa.int64())]) + ), + ), + ( + dict[int, str], + pa.large_list( + pa.struct([("key", pa.int64()), ("value", pa.large_string())]) + ), + ), + # Nested types + (list[list[int]], pa.large_list(pa.large_list(pa.int64()))), + ( + list[tuple[int, str]], + pa.large_list(pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), + ), + ] + + for python_type, expected_arrow_type in python_to_arrow_tests: + try: + result = python_type_to_arrow(python_type) + success = result == expected_arrow_type + status = "✓" if success else "✗" + print(f"{status} {python_type} -> {result}") + if not success: + print(f" Expected: {expected_arrow_type}") + except Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing Arrow -> Python type conversion:") + + arrow_to_python_tests = [ + # Basic types (both regular and large variants) + (pa.int64(), int), + (pa.string(), str), + (pa.large_string(), str), + (pa.float64(), float), + (pa.bool_(), bool), + (pa.binary(), bytes), + (pa.large_binary(), bytes), + # Lists (both regular and large) + (pa.list_(pa.int64(), -1), list[int]), + (pa.large_list(pa.int64()), list[int]), + (pa.list_(pa.string(), -1), list[str]), + (pa.large_list(pa.large_string()), list[str]), + # Fixed-size lists (homogeneous tuples) + (pa.list_(pa.int64(), 3), tuple[int, int, int]), + (pa.list_(pa.large_string(), 2), tuple[str, str]), + # Dict representation: both regular and large list variants + ( + pa.list_(pa.struct([("key", pa.string()), ("value", pa.int64())]), -1), + dict[str, int], + ), + ( + pa.large_list( + pa.struct([("key", pa.large_string()), ("value", pa.int64())]) + ), + dict[str, int], + ), + ( + 
pa.list_(pa.struct([("key", pa.int64()), ("value", pa.string())]), -1), + dict[int, str], + ), + ( + pa.large_list( + pa.struct([("key", pa.int64()), ("value", pa.large_string())]) + ), + dict[int, str], + ), + # Heterogeneous tuples: struct + (pa.struct([("f0", pa.int64()), ("f1", pa.string())]), tuple[int, str]), + (pa.struct([("f0", pa.int64()), ("f1", pa.large_string())]), tuple[int, str]), + ( + pa.struct([("f0", pa.int64()), ("f1", pa.string()), ("f2", pa.float64())]), + tuple[int, str, float], + ), + # Maps (if encountered) + (pa.map_(pa.string(), pa.int64()), dict[str, int]), + (pa.map_(pa.large_string(), pa.int64()), dict[str, int]), + # Nested structures + (pa.list_(pa.list_(pa.int64(), -1), -1), list[list[int]]), + (pa.large_list(pa.large_list(pa.int64())), list[list[int]]), + ] + + for arrow_type, expected_python_type in arrow_to_python_tests: + try: + result = arrow_type_to_python(arrow_type) + success = result == expected_python_type + status = "✓" if success else "✗" + print(f"{status} {arrow_type} -> {result}") + if not success: + print(f" Expected: {expected_python_type}") + except Exception as e: + print(f"✗ {arrow_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing round-trip conversion:") + + round_trip_tests = [ + dict[str, int], + list[int], + tuple[int, str], + tuple[str, str, str], + list[dict[str, int]], + list[list[str]], + tuple[int, float, bool], + ] + + for python_type in round_trip_tests: + try: + # Python -> Arrow -> Python + arrow_type = python_type_to_arrow(python_type) + recovered_python_type = arrow_type_to_python(arrow_type) + success = recovered_python_type == python_type + status = "✓" if success else "✗" + print(f"{status} {python_type} -> {arrow_type} -> {recovered_python_type}") + if not success: + print(f" Round-trip failed!") + except Exception as e: + print(f"✗ {python_type} -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing string parsing:") + + string_tests = [ + "list[int]", + "tuple[int, str]", + "dict[str, int]", + "list[dict[str, float]]", + ] + + for type_str in string_tests: + try: + result = parse_type_string(type_str) + print(f"✓ '{type_str}' -> {result}") + except Exception as e: + print(f"✗ '{type_str}' -> ERROR: {e}") + + print("\n" + "=" * 60) + print("Testing practical data conversion:") + + # Test actual data conversion + try: + # Create some test data + test_data = [ + {"name": "Alice", "scores": {"math": 95, "english": 87}}, + {"name": "Bob", "scores": {"math": 78, "english": 92}}, + ] + + # Create schema with nested dict using large_list representation + dict_type = python_type_to_arrow(dict[str, int]) + schema = pa.schema([("name", pa.large_string()), ("scores", dict_type)]) + + print(f"Dict type representation: {dict_type}") + + # Convert Python dicts to the expected list format + converted_data = [] + for record in test_data: + converted_record = record.copy() + if "scores" in converted_record: + # Convert dict to list of key-value structs + scores_dict = converted_record["scores"] + converted_record["scores"] = dict_to_arrow_list(scores_dict) + converted_data.append(converted_record) + + # Create Arrow table - need to handle the conversion properly + try: + table = pa.table(converted_data, schema=schema) + except Exception as table_error: + # If direct conversion fails, convert each column separately + print(f" Direct table creation failed: {table_error}") + print(" Trying column-by-column conversion...") + + # Convert each field separately + arrays = [] + for field in schema: + field_name = 
field.name + field_type = field.type + + # Extract column data + column_data = [record.get(field_name) for record in converted_data] + + # Create array with explicit type + array = pa.array(column_data, type=field_type) + arrays.append(array) + + # Create table from arrays + table = pa.table(arrays, schema=schema) + print(f"✓ Created PyArrow table with large_list representation") + + # Convert back to Python and reconstruct dicts + result_data = table.to_pylist() + for record in result_data: + if "scores" in record and record["scores"]: + # Convert list of key-value structs back to dict + record["scores"] = arrow_list_to_dict(record["scores"]) + + print(f"✓ Round-trip successful: {result_data[0]['scores']}") + + except Exception as e: + print(f"✗ Practical conversion test failed: {e}") + + print("Testing edge cases and limitations:") + + edge_case_tests = [ + # Complex key types - these are challenging but let's see what happens + "dict[tuple[str, int], str]", # tuple keys + "dict[str, dict[int, list[str]]]", # deeply nested + "Optional[dict[str, int]]", # optional complex types + ] + + for type_str in edge_case_tests: + try: + # Parse and convert + namespace = { + "list": list, + "tuple": tuple, + "dict": dict, + "int": int, + "str": str, + "float": float, + "bool": bool, + "bytes": bytes, + "Optional": typing.Optional, + "Union": typing.Union, + } + python_type = eval(type_str, {"__builtins__": {}}, namespace) + arrow_type = python_type_to_arrow(python_type) + recovered_type = arrow_type_to_python(arrow_type) + + success = recovered_type == python_type + status = "✓" if success else "⚠" + print(f"{status} {type_str}") + print(f" -> {arrow_type}") + print(f" -> {recovered_type}") + if not success: + print(f" Note: Complex key types may have limitations") + + except Exception as e: + print(f"✗ {type_str} -> ERROR: {e}") + + print(f"\n{'=' * 60}") + print("All tests completed!") diff --git a/src/orcapod/semantic_types/complete_converter_test.py b/src/orcapod/semantic_types/complete_converter_test.py new file mode 100644 index 0000000..b1939cd --- /dev/null +++ b/src/orcapod/semantic_types/complete_converter_test.py @@ -0,0 +1,628 @@ +""" +Comprehensive test suite for Python Type Hint ↔ PyArrow Type Converter +with full semantic type support. 
+ +This test suite validates: +- Basic type conversions +- Complex nested structures +- Set handling with deterministic ordering +- Dictionary representations +- Semantic type integration +- Schema inference +- Round-trip conversion fidelity +- Error handling and edge cases +""" + +import pyarrow as pa +import typing +from typing import Any, Optional, Union +from collections.abc import Collection, Sequence, Set, Mapping +from pathlib import Path +import tempfile +import uuid +from datetime import datetime, date +import json + +# Import the converter functions +# (In real usage, these would be imported from your module) +from orcapod.semantic_types.complete_converter import ( + python_type_to_arrow, + arrow_type_to_python, + python_dicts_to_arrow_table, + arrow_table_to_python_dicts, + infer_schema_from_data, + dict_to_arrow_list, + arrow_list_to_dict, + arrow_list_to_set, +) + + +# Mock Semantic Type System for Testing +class MockSemanticRegistry: + """Mock semantic registry that supports Path, UUID, and custom types.""" + + def __init__(self): + self.converters = { + Path: MockPathConverter(), + uuid.UUID: MockUUIDConverter(), # Use uuid.UUID directly + "CustomData": MockCustomDataConverter(), + } + + def get_converter_for_python_type(self, python_type): + # Handle direct type lookups + if python_type in self.converters: + return self.converters[python_type] + + # Handle subclass relationships - add safety check + for registered_type, converter in self.converters.items(): + try: + if ( + isinstance(registered_type, type) + and isinstance(python_type, type) + and issubclass(python_type, registered_type) + ): + return converter + except TypeError: + # Handle cases where issubclass fails (e.g., with generic types) + continue + + # Handle string-based lookups (for custom types) + type_name = getattr(python_type, "__name__", str(python_type)) + if type_name in self.converters: + return self.converters[type_name] + + return None + + def get_converter_for_struct_type(self, struct_type): + if not pa.types.is_struct(struct_type): + return None + + field_names = {f.name for f in struct_type} + + # Path struct detection + if field_names == {"semantic_type", "path"}: + return self.converters[Path] + + # UUID struct detection + if field_names == {"semantic_type", "uuid_str"}: + return self.converters[uuid.UUID] + + # Custom data struct detection + if field_names == {"semantic_type", "data", "metadata"}: + return self.converters["CustomData"] + + return None + + +class MockPathConverter: + """Mock converter for pathlib.Path objects.""" + + @property + def arrow_struct_type(self): + return pa.struct([("semantic_type", pa.string()), ("path", pa.large_string())]) + + def python_to_struct_dict(self, value): + if not isinstance(value, Path): + raise TypeError(f"Expected Path, got {type(value)}") + return {"semantic_type": "path", "path": str(value)} + + def struct_dict_to_python(self, struct_dict): + if struct_dict.get("semantic_type") != "path": + raise ValueError("Not a path semantic type") + return Path(struct_dict["path"]) + + +class MockUUIDConverter: + """Mock converter for UUID objects.""" + + @property + def arrow_struct_type(self): + return pa.struct([("semantic_type", pa.string()), ("uuid_str", pa.string())]) + + def python_to_struct_dict(self, value): + if not isinstance(value, uuid.UUID): + raise TypeError(f"Expected UUID, got {type(value)}") + return {"semantic_type": "uuid", "uuid_str": str(value)} + + def struct_dict_to_python(self, struct_dict): + if struct_dict.get("semantic_type") != 
"uuid": + raise ValueError("Not a uuid semantic type") + return uuid.UUID(struct_dict["uuid_str"]) + + +class CustomData: + """Custom data class for testing complex semantic types.""" + + def __init__(self, data: dict, metadata: dict = None): + self.data = data + self.metadata = metadata or {} + + def __eq__(self, other): + if not isinstance(other, CustomData): + return False + return self.data == other.data and self.metadata == other.metadata + + def __repr__(self): + return f"CustomData(data={self.data}, metadata={self.metadata})" + + +class MockCustomDataConverter: + """Mock converter for CustomData objects.""" + + @property + def arrow_struct_type(self): + return pa.struct( + [ + ("semantic_type", pa.string()), + ("data", pa.large_string()), # JSON serialized + ("metadata", pa.large_string()), # JSON serialized + ] + ) + + def python_to_struct_dict(self, value): + if not isinstance(value, CustomData): + raise TypeError(f"Expected CustomData, got {type(value)}") + return { + "semantic_type": "custom_data", + "data": json.dumps(value.data), + "metadata": json.dumps(value.metadata), + } + + def struct_dict_to_python(self, struct_dict): + if struct_dict.get("semantic_type") != "custom_data": + raise ValueError("Not a custom_data semantic type") + + data = json.loads(struct_dict["data"]) + metadata = json.loads(struct_dict["metadata"]) + return CustomData(data, metadata) + + +def run_comprehensive_tests(): + """Run comprehensive test suite for the type converter.""" + + print("🚀 COMPREHENSIVE PYTHON ↔ ARROW TYPE CONVERTER TEST SUITE") + print("=" * 80) + + # Initialize mock semantic registry + semantic_registry = MockSemanticRegistry() + + # Test counters + total_tests = 0 + passed_tests = 0 + + def test_case(name: str, test_func): + """Helper to run individual test cases.""" + nonlocal total_tests, passed_tests + total_tests += 1 + + print(f"\n📋 Testing: {name}") + try: + test_func() + print(f" ✅ PASSED") + passed_tests += 1 + except Exception as e: + print(f" ❌ FAILED: {e}") + import traceback + + traceback.print_exc() + + # Test 1: Basic Type Conversions + def test_basic_types(): + basic_tests = [ + (int, pa.int64()), + (str, pa.large_string()), + (float, pa.float64()), + (bool, pa.bool_()), + (bytes, pa.large_binary()), + ] + + for python_type, expected_arrow_type in basic_tests: + result = python_type_to_arrow(python_type) + assert result == expected_arrow_type, ( + f"Expected {expected_arrow_type}, got {result}" + ) + + # Test reverse conversion + recovered_type = arrow_type_to_python(result) + assert recovered_type == python_type, ( + f"Round-trip failed: {python_type} -> {result} -> {recovered_type}" + ) + + test_case("Basic Type Conversions", test_basic_types) + + # Test 2: Complex Nested Structures + def test_complex_nested(): + complex_tests = [ + # Nested dictionaries + dict[str, dict[str, int]], + # Mixed tuples with complex types (remove set[float] as it gets converted to list[float]) + tuple[dict[str, int], list[str]], + # Deep nesting + list[dict[str, list[tuple[int, str]]]], + # Complex mappings + dict[str, tuple[list[int], list[str]]], # Changed set[str] to list[str] + ] + + for complex_type in complex_tests: + arrow_type = python_type_to_arrow(complex_type) + recovered_type = arrow_type_to_python(arrow_type) + assert recovered_type == complex_type, ( + f"Complex round-trip failed: {complex_type} -> {arrow_type} -> {recovered_type}" + ) + + test_case("Complex Nested Structures", test_complex_nested) + + # Test 3: Set Handling with Deterministic Ordering + def 
test_set_deterministic_ordering(): + # Test data with sets that should be sorted deterministically + set_data = [ + {"tags": {3, 1, 4, 1, 5}, "name": "Alice"}, # Duplicate should be removed + {"tags": {9, 2, 6, 5, 3}, "name": "Bob"}, + {"tags": {"python", "arrow", "data"}, "name": "Charlie"}, # String set + ] + + # Test with numeric sets + numeric_schema = {"tags": set[int], "name": str} + table1 = python_dicts_to_arrow_table(set_data[:2], numeric_schema) + result1 = arrow_table_to_python_dicts(table1) + + # Verify deterministic ordering (should be sorted) + assert result1[0]["tags"] == [1, 3, 4, 5], ( + f"Expected sorted [1, 3, 4, 5], got {result1[0]['tags']}" + ) + assert result1[1]["tags"] == [2, 3, 5, 6, 9], ( + f"Expected sorted [2, 3, 5, 6, 9], got {result1[1]['tags']}" + ) + + # Test with string sets + string_schema = {"tags": set[str], "name": str} + table2 = python_dicts_to_arrow_table([set_data[2]], string_schema) + result2 = arrow_table_to_python_dicts(table2) + + # Verify alphabetical ordering - note: sets become lists in round-trip + expected_sorted = sorted(["python", "arrow", "data"]) + assert result2[0]["tags"] == expected_sorted, ( + f"Expected {expected_sorted}, got {result2[0]['tags']}" + ) + + # Test that we can convert back to set if needed + recovered_set = set(result2[0]["tags"]) + original_set = {"python", "arrow", "data"} + assert recovered_set == original_set, "Set contents should be preserved" + + test_case("Set Deterministic Ordering", test_set_deterministic_ordering) + + # Test 4: Abstract Collection Types + def test_abstract_collections(): + abstract_tests = [ + (Collection[int], pa.large_list(pa.int64())), + (Sequence[str], pa.large_list(pa.large_string())), + (Set[float], pa.large_list(pa.float64())), + ( + Mapping[str, int], + pa.large_list( + pa.struct([("key", pa.large_string()), ("value", pa.int64())]) + ), + ), + ] + + for python_type, expected_arrow_type in abstract_tests: + result = python_type_to_arrow(python_type) + assert result == expected_arrow_type, ( + f"Abstract type conversion failed for {python_type}" + ) + + test_case("Abstract Collection Types", test_abstract_collections) + + # Test 5: Semantic Type Integration + def test_semantic_types(): + # Create test data with various semantic types + test_uuid = uuid.uuid4() + custom_data = CustomData( + data={"key": "value", "count": 42}, + metadata={"created": "2024-01-01", "version": 1}, + ) + + semantic_data = [ + { + "id": 1, + "name": "Alice", + "file_path": Path("/home/alice/data.csv"), + "unique_id": test_uuid, + "custom": custom_data, + "tags": ["analysis", "data"], + }, + { + "id": 2, + "name": "Bob", + "file_path": Path("/home/bob/results.json"), + "unique_id": uuid.uuid4(), + "custom": CustomData({"type": "result"}, {"source": "experiment"}), + "tags": ["results", "ml"], + }, + ] + + semantic_schema = { + "id": int, + "name": str, + "file_path": Path, + "unique_id": uuid.UUID, # Use uuid.UUID directly + "custom": CustomData, + "tags": list[str], + } + + # Convert to Arrow table with semantic types + table = python_dicts_to_arrow_table( + semantic_data, semantic_schema, semantic_registry + ) + + # Verify schema contains semantic struct types + schema_types = {field.name: field.type for field in table.schema} + assert pa.types.is_struct(schema_types["file_path"]), ( + "Path should be converted to struct" + ) + assert pa.types.is_struct(schema_types["unique_id"]), ( + "UUID should be converted to struct" + ) + assert pa.types.is_struct(schema_types["custom"]), ( + "CustomData should be 
converted to struct" + ) + + # Test round-trip conversion + recovered_data = arrow_table_to_python_dicts(table, semantic_registry) + + # Verify semantic types were properly reconstructed + assert isinstance(recovered_data[0]["file_path"], Path), ( + "Path not properly recovered" + ) + assert isinstance(recovered_data[0]["unique_id"], uuid.UUID), ( + "UUID not properly recovered" + ) + assert isinstance(recovered_data[0]["custom"], CustomData), ( + "CustomData not properly recovered" + ) + + # Verify values are correct + assert str(recovered_data[0]["file_path"]) == "/home/alice/data.csv" + assert recovered_data[0]["unique_id"] == test_uuid + assert recovered_data[0]["custom"] == custom_data + + test_case("Semantic Type Integration", test_semantic_types) + + # Test 6: Schema Inference + def test_schema_inference(): + # Test data with mixed types for inference - make sure data matches what we expect + inference_data = [ + { + "name": "Alice", + "age": 25, + "scores": [95, 87, 92], + "active": True, + "metadata": {"role": "admin", "level": "5"}, # Make level a string + "tags": {"python", "data", "ml"}, + }, + { + "name": "Bob", + "age": 30, + "scores": [78, 85], + "active": False, + "metadata": {"role": "user", "level": "2"}, # Make level a string + "tags": {"javascript", "web"}, + }, + ] + + # Test inference without semantic types + inferred_schema = infer_schema_from_data(inference_data) + print(f"Inferred schema: {inferred_schema}") + + expected_types = { + "name": str, + "age": int, + "scores": list[int], + "active": bool, + "metadata": dict[str, str], # Now all values are strings + "tags": set[str], + } + + for field, expected_type in expected_types.items(): + assert field in inferred_schema, f"Field {field} not in inferred schema" + # For complex types, just check the origin matches + if hasattr(expected_type, "__origin__"): + assert inferred_schema[field].__origin__ == expected_type.__origin__, ( + f"Field {field}: expected {expected_type.__origin__}, got {inferred_schema[field].__origin__}" + ) + else: + assert inferred_schema[field] == expected_type, ( + f"Field {field}: expected {expected_type}, got {inferred_schema[field]}" + ) + + # Test table creation with inferred schema + table = python_dicts_to_arrow_table(inference_data) # No explicit schema + recovered = arrow_table_to_python_dicts(table) + + # Verify basic round-trip works + assert len(recovered) == 2 + assert recovered[0]["name"] == "Alice" + assert recovered[0]["age"] == 25 + assert recovered[0]["metadata"]["role"] == "admin" + + test_case("Schema Inference", test_schema_inference) + + # Test 7: Optional and Union Types + def test_optional_union_types(): + # Test Optional types + optional_data = [ + {"name": "Alice", "middle_name": "Marie", "age": 25}, + {"name": "Bob", "middle_name": None, "age": 30}, # None value + ] + + optional_schema = { + "name": str, + "middle_name": Optional[str], # Should handle None values + "age": int, + } + + table = python_dicts_to_arrow_table(optional_data, optional_schema) + recovered = arrow_table_to_python_dicts(table) + + assert recovered[0]["middle_name"] == "Marie" + assert recovered[1]["middle_name"] is None # None should be preserved + + test_case("Optional and Union Types", test_optional_union_types) + + # Test 8: Error Handling and Edge Cases + def test_error_handling(): + # Test with empty data + try: + python_dicts_to_arrow_table([]) + assert False, "Should raise error for empty data" + except ValueError as e: + assert "empty data list" in str(e) + + # Test with unsupported type 
+ try: + python_type_to_arrow(complex) # complex numbers not supported + assert False, "Should raise error for unsupported type" + except ValueError: + pass # Expected + + # Test with mismatched data and schema - this should fail gracefully + mismatch_data = [{"name": "Alice", "age": "twenty-five"}] # age as string + mismatch_schema = {"name": str, "age": int} # expects int + + # This should raise an error due to type mismatch + try: + table = python_dicts_to_arrow_table(mismatch_data, mismatch_schema) + assert False, "Should raise error for type mismatch" + except (ValueError, pa.ArrowInvalid) as e: + # Expected - conversion should fail for incompatible types + assert "convert" in str(e).lower() or "invalid" in str(e).lower() + + test_case("Error Handling and Edge Cases", test_error_handling) + + # Test 9: Large Data Performance Test + def test_large_data_performance(): + import time + + # Generate larger dataset + large_data = [] + for i in range(1000): + large_data.append( + { + "id": i, + "name": f"User_{i}", + "scores": [i % 100, (i * 2) % 100, (i * 3) % 100], + "metadata": { + "group": str(i % 10), + "active": str(i % 2 == 0), + }, # Convert to strings + "tags": {f"tag_{i % 5}", f"category_{i % 3}"}, + } + ) + + schema = { + "id": int, + "name": str, + "scores": list[int], + "metadata": dict[str, str], # Change from Any to str + "tags": set[str], + } + + # Time the conversion + start_time = time.time() + table = python_dicts_to_arrow_table(large_data, schema) + conversion_time = time.time() - start_time + + # Time the round-trip + start_time = time.time() + recovered = arrow_table_to_python_dicts(table) + recovery_time = time.time() - start_time + + print(f" 📊 Performance: {len(large_data)} records") + print(f" Conversion: {conversion_time:.3f}s") + print(f" Recovery: {recovery_time:.3f}s") + + # Verify correctness on sample + assert len(recovered) == 1000 + assert recovered[0]["id"] == 0 + assert recovered[999]["id"] == 999 + assert isinstance(recovered[0]["tags"], list) # Sets become lists + + test_case("Large Data Performance", test_large_data_performance) + + # Test 10: File I/O Round-trip Test + def test_file_io_roundtrip(): + # Test saving to and loading from Parquet file + test_data = [ + { + "name": "Alice", + "path": Path("/tmp/alice.txt"), + "scores": {"math": 95, "english": 87}, + "tags": {"student", "honor_roll"}, + }, + { + "name": "Bob", + "path": Path("/tmp/bob.txt"), + "scores": {"math": 78, "english": 92}, + "tags": {"student", "debate_team"}, + }, + ] + + schema = {"name": str, "path": Path, "scores": dict[str, int], "tags": set[str]} + + # Convert to Arrow table + table = python_dicts_to_arrow_table(test_data, schema, semantic_registry) + + # Write to temporary Parquet file + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f: + temp_path = f.name + + try: + # Write to Parquet + import pyarrow.parquet as pq + + pq.write_table(table, temp_path) + + # Read back from Parquet + loaded_table = pq.read_table(temp_path) + + # Convert back to Python + recovered_data = arrow_table_to_python_dicts( + loaded_table, semantic_registry + ) + + # Verify data integrity + assert len(recovered_data) == 2 + assert isinstance(recovered_data[0]["path"], Path) + assert str(recovered_data[0]["path"]) == "/tmp/alice.txt" + assert recovered_data[0]["scores"]["math"] == 95 + + print(f" 💾 Successfully wrote and read {temp_path}") + + finally: + # Clean up + import os + + if os.path.exists(temp_path): + os.unlink(temp_path) + + test_case("File I/O Round-trip", 
test_file_io_roundtrip) + + # Print final results + print("\n" + "=" * 80) + print(f"🏁 TEST RESULTS: {passed_tests}/{total_tests} tests passed") + + if passed_tests == total_tests: + print("🎉 ALL TESTS PASSED! The converter is working perfectly.") + else: + failed = total_tests - passed_tests + print(f"⚠️ {failed} test(s) failed. Please review the failures above.") + + print("=" * 80) + + return passed_tests == total_tests + + +if __name__ == "__main__": + success = run_comprehensive_tests() + exit(0 if success else 1) diff --git a/src/orcapod/semantic_types/schemas.py b/src/orcapod/semantic_types/schemas.py new file mode 100644 index 0000000..57f0551 --- /dev/null +++ b/src/orcapod/semantic_types/schemas.py @@ -0,0 +1,357 @@ +from typing import Self +from orcapod.types.core import DataType, TypeSpec +from orcapod.types.semantic_types import ( + SemanticType, + SemanticTypeRegistry, + PythonArrowConverter, +) +import pyarrow as pa +import datetime + +# This mapping is expected to be stable +# Be sure to test this assumption holds true +DEFAULT_ARROW_TYPE_LUT = { + int: pa.int64(), + float: pa.float64(), + str: pa.large_string(), + bool: pa.bool_(), +} + + +def python_to_arrow_type(python_type: type) -> pa.DataType: + if python_type in DEFAULT_ARROW_TYPE_LUT: + return DEFAULT_ARROW_TYPE_LUT[python_type] + raise TypeError(f"Converstion of python type {python_type} is not supported yet") + + +def arrow_to_python_type(arrow_type: pa.DataType) -> type: + if pa.types.is_integer(arrow_type): + return int + elif pa.types.is_floating(arrow_type): + return float + elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): + return str + elif pa.types.is_boolean(arrow_type): + return bool + elif pa.types.is_date(arrow_type): + return datetime.date + elif pa.types.is_timestamp(arrow_type): + return datetime.datetime + elif pa.types.is_binary(arrow_type): + return bytes + else: + raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") + + +class PythonSchema(dict[str, DataType]): + """ + A schema for Python data types, mapping string keys to Python types. + + This is used to define the expected structure of data packets in OrcaPod. + + Attributes + ---------- + keys : str + The keys of the schema. + values : type + The types corresponding to each key. + + Examples + -------- + >>> schema = PythonSchema(name=str, age=int) + >>> print(schema) + {'name': , 'age': } + """ + + def copy(self) -> "PythonSchema": + return PythonSchema(self) + + def to_semantic_schema( + self, semantic_type_registry: SemanticTypeRegistry + ) -> "SemanticSchema": + """ + Convert the Python schema to a semantic schema using the provided semantic type registry. + + Parameters + ---------- + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. + + Returns + ------- + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + + Examples + -------- + >>> python_schema = PythonSchema(name=str, age=int) + >>> semantic_schema = python_schema.to_semantic_schema(registry) + >>> print(semantic_schema) + {'name': (str, None), 'age': (int, None)} + """ + return SemanticSchema.from_typespec(self, semantic_type_registry) + + def to_arrow_schema( + self, + semantic_type_registry: SemanticTypeRegistry | None = None, + converters: dict[str, PythonArrowConverter] | None = None, + ) -> pa.Schema: + """ + Convert the Python schema to an Arrow schema. + If converters are provided, they are used to convert the schema. 
Note that + no validation is performed on the converters, so they must be compatible with the schema. + """ + if converters is not None: + # If converters are provided, use them to convert the schema + fields = [] + for field_name, python_type in self.items(): + if field_name in converters: + converter = converters[field_name] + arrow_type = converter.arrow_type + metadata = None + if converter.semantic_type_name is not None: + metadata = { + b"semantic_type": converter.semantic_type_name.encode( + "utf-8" + ) + } + else: + arrow_type = python_to_arrow_type(python_type) + metadata = None + fields.append(pa.field(field_name, arrow_type, metadata=metadata)) + return pa.schema(fields) + + if semantic_type_registry is None: + raise ValueError( + "semantic_type_registry must be provided if converters are not" + ) + # Otherwise, convert using the semantic type registry + return self.to_semantic_schema(semantic_type_registry).to_arrow_schema() + + @classmethod + def from_semantic_schema(cls, semantic_schema: "SemanticSchema") -> Self: + """ + Create a PythonSchema from a SemanticSchema. + + Parameters + ---------- + semantic_schema : SemanticSchema + The semantic schema to convert. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + return cls(semantic_schema.get_python_types()) + + @classmethod + def from_arrow_schema( + cls, + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry | None = None, + converters: dict[str, PythonArrowConverter] | None = None, + ) -> Self: + """ + Create a PythonSchema from an Arrow schema. + + Parameters + ---------- + arrow_schema : pa.Schema + The Arrow schema to convert. + semantic_type_registry : SemanticTypeRegistry + The registry containing semantic type information. + skip_system_columns : bool, optional + Whether to skip system columns (default is True). + converters : dict[str, PythonArrowConverter], optional + A dictionary of converters to use for converting the schema. If provided, the schema will be + converted using the converters. If not provided, the schema will be converted using the semantic type + registry. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + if converters is not None: + # If converters are provided, use them to convert the schema + python_types = {} + for field in arrow_schema: + # TODO: consider performing validation of semantic type + if field.name in converters: + converter = converters[field.name] + python_types[field.name] = converter.python_type + else: + python_types[field.name] = arrow_to_python_type(field.type) + return cls(python_types) + + if semantic_type_registry is None: + raise ValueError( + "semantic_type_registry must be provided if converters are not" + ) + semantic_schema = SemanticSchema.from_arrow_schema( + arrow_schema, + semantic_type_registry, + ) + return cls(semantic_schema.get_python_types()) + + +class SemanticSchema(dict[str, type | SemanticType]): + """ + A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. + + This is used to define the expected structure of data packets with semantic types in OrcaPod. + + Attributes + ---------- + keys : str + The keys of the schema. + values : type | SemanticType + Either type for simple fields or SemanticType for semantic fields. 
+ + Examples + -------- + >>> schema = SemanticSchema(image=SemanticType('path'), age=int) + >>> print(schema) + {"image": SemanticType(name='path'), "age": })} + """ + + def get_semantic_fields(self) -> dict[str, SemanticType]: + """ + Get a dictionary of semantic fields in the schema. + + Returns + ------- + dict[str, SemanticType] + A dictionary mapping keys to their corresponding SemanticType. + """ + return {k: v for k, v in self.items() if isinstance(v, SemanticType)} + + def get_python_types(self) -> dict[str, type]: + """ + Get the Python types for all keys in the schema. + + Returns + ------- + dict[str, type] + A dictionary mapping keys to their corresponding Python types. + """ + return { + k: v.get_default_python_type() if isinstance(v, SemanticType) else v + for k, v in self.items() + } + + def get_arrow_types(self) -> dict[str, tuple[pa.DataType, str | None]]: + """ + Get the Arrow types for all keys in the schema. + + Returns + ------- + dict[str, tuple[pa.DataType, str|None]] + A dictionary mapping keys to tuples of Arrow types. If the field has a semantic type, + the second element of the tuple is the semantic type name; otherwise, it is None. + """ + return { + k: (v.get_default_arrow_type(), v.name) + if isinstance(v, SemanticType) + else (python_to_arrow_type(v), None) + for k, v in self.items() + } + + def to_arrow_schema(self) -> pa.Schema: + """ + Get the Arrow schema, which is a PythonSchema representation of the semantic schema. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + fields = [] + for k, (arrow_type, semantic_type_name) in self.get_arrow_types().items(): + if semantic_type_name is not None: + field = pa.field( + k, + arrow_type, + metadata={b"semantic_type": semantic_type_name.encode("utf-8")}, + ) + else: + field = pa.field(k, arrow_type) + fields.append(field) + + return pa.schema(fields) + + def to_python_schema(self) -> PythonSchema: + """ + Get the Python schema, which is a PythonSchema representation of the semantic schema. + + Returns + ------- + PythonSchema + A new schema mapping keys to Python types. + """ + return PythonSchema.from_semantic_schema(self) + + @classmethod + def from_arrow_schema( + cls, + arrow_schema: pa.Schema, + semantic_type_registry: SemanticTypeRegistry, + ) -> Self: + """ + Create a SemanticSchema from an Arrow schema. + + Parameters + ---------- + arrow_schema : pa.Schema + The Arrow schema to convert. + + Returns + ------- + SemanticSchema + A new schema mapping keys to tuples of Python types and optional semantic type identifiers. 
+ """ + + semantic_schema = {} + for field in arrow_schema: + field_type = None + if field.metadata is not None: + semantic_type_name = field.metadata.get(b"semantic_type", b"").decode() + if semantic_type_name: + semantic_type = semantic_type_registry.get_semantic_type( + semantic_type_name + ) + if semantic_type is None: + raise ValueError( + f"Semantic type '{semantic_type_name}' not found in registry" + ) + if not semantic_type.supports_arrow_type(field.type): + raise ValueError( + f"Semantic type '{semantic_type.name}' does not support Arrow field of type '{field.type}'" + ) + field_type = semantic_type + + if ( + field_type is None + ): # was not set to semantic type, so fallback to simple conversion + field_type = arrow_to_python_type(field.type) + + semantic_schema[field.name] = field_type + return cls(semantic_schema) + + @classmethod + def from_typespec( + cls, + typespec: TypeSpec, + semantic_type_registry: SemanticTypeRegistry, + ) -> Self: + semantic_schema = {} + for key, python_type in typespec.items(): + semantic_type = semantic_type_registry.get_semantic_type_for_python_type( + python_type + ) + if semantic_type is not None: + semantic_schema[key] = semantic_type + else: + semantic_schema[key] = python_type + return cls(semantic_schema) diff --git a/src/orcapod/semantic_types/struct_converters.py b/src/orcapod/semantic_types/struct_converters.py new file mode 100644 index 0000000..b5ab182 --- /dev/null +++ b/src/orcapod/semantic_types/struct_converters.py @@ -0,0 +1,307 @@ +""" +Struct-based semantic type system for OrcaPod. + +This replaces the metadata-based approach with explicit struct fields, +making semantic types visible in schemas and preserved through operations. +""" + +from typing import Any, Protocol +from pathlib import Path +import pyarrow as pa +from collections.abc import Collection + + +# Core protocols +class StructConverter(Protocol): + """Protocol for converting between Python objects and semantic structs.""" + + @property + def semantic_type_name(self) -> str: + """The semantic type name this converter handles.""" + ... + + @property + def python_type(self) -> type: + """The Python type this converter can handle.""" + ... + + @property + def arrow_struct_type(self) -> pa.StructType: + """The Arrow struct type this converter produces.""" + ... + + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + """Convert Python value to struct dictionary.""" + ... + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + """Convert struct dictionary back to Python value.""" + ... + + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type.""" + ... + + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + """Check if this converter can handle the given struct type.""" + ... 
+ + +# Path-specific implementation +class PathStructConverter: + """Converter for pathlib.Path objects to/from semantic structs.""" + + def __init__(self): + self._semantic_type_name = "path" + self._python_type = Path + + # Define the Arrow struct type for paths + self._arrow_struct_type = pa.struct( + [ + pa.field("semantic_type", pa.string()), + pa.field("path", pa.large_string()), + ] + ) + + @property + def semantic_type_name(self) -> str: + return self._semantic_type_name + + @property + def python_type(self) -> type: + return self._python_type + + @property + def arrow_struct_type(self) -> pa.StructType: + return self._arrow_struct_type + + def python_to_struct_dict(self, value: Path) -> dict[str, Any]: + """Convert Path to struct dictionary.""" + if not isinstance(value, Path): + raise TypeError(f"Expected Path, got {type(value)}") + + return { + "semantic_type": self._semantic_type_name, + "path": str(value), + } + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Path: + """Convert struct dictionary back to Path.""" + if struct_dict.get("semantic_type") != self._semantic_type_name: + raise ValueError( + f"Expected semantic_type '{self._semantic_type_name}', " + f"got '{struct_dict.get('semantic_type')}'" + ) + + path_str = struct_dict.get("path") + if path_str is None: + raise ValueError("Missing 'path' field in struct") + + return Path(path_str) + + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type.""" + return issubclass(python_type, Path) + + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + """Check if this converter can handle the given struct type.""" + # Check if struct has the expected fields + field_names = [field.name for field in struct_type] + expected_fields = {"semantic_type", "path"} + + if set(field_names) != expected_fields: + return False + + # Check field types + field_types = {field.name: field.type for field in struct_type} + + return ( + field_types["semantic_type"] == pa.string() + and field_types["path"] == pa.large_string() + ) + + def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: + """Check if a struct dictionary represents this semantic type.""" + return struct_dict.get("semantic_type") == self._semantic_type_name + + +# Registry for managing semantic type converters +class SemanticTypeRegistry: + """Registry that manages struct-based semantic type converters.""" + + def __init__(self, converters: Collection[StructConverter] | None = None): + self._python_to_converter: dict[type, StructConverter] = {} + self._name_to_converter: dict[str, StructConverter] = {} + self._struct_type_to_converter: dict[pa.StructType, StructConverter] = {} + + if converters: + for converter in converters: + self.register_converter(converter) + + def register_converter(self, converter: StructConverter) -> None: + """Register a semantic type converter.""" + # Register by Python type + python_type = converter.python_type + if python_type in self._python_to_converter: + existing = self._python_to_converter[python_type] + raise ValueError( + f"Python type {python_type} already registered with converter " + f"for semantic type '{existing.semantic_type_name}'" + ) + self._python_to_converter[python_type] = converter + + # Register by semantic type name + name = converter.semantic_type_name + if name in self._name_to_converter: + raise ValueError(f"Semantic type '{name}' already registered") + self._name_to_converter[name] = converter + + # Register by 
struct type + struct_type = converter.arrow_struct_type + self._struct_type_to_converter[struct_type] = converter + + def get_converter_for_python_type( + self, python_type: type + ) -> StructConverter | None: + """Get converter for a Python type.""" + # Direct lookup first + converter = self._python_to_converter.get(python_type) + if converter: + return converter + + # Check for subclass relationships + for registered_type, converter in self._python_to_converter.items(): + if issubclass(python_type, registered_type): + return converter + + return None + + def get_converter_for_semantic_type( + self, semantic_type_name: str + ) -> StructConverter | None: + """Get converter by semantic type name.""" + return self._name_to_converter.get(semantic_type_name) + + def get_converter_for_struct_type( + self, struct_type: pa.StructType + ) -> StructConverter | None: + """Get converter for an Arrow struct type.""" + # Direct lookup first + converter = self._struct_type_to_converter.get(struct_type) + if converter: + return converter + + # Check if any converter can handle this struct type + for converter in self._name_to_converter.values(): + if converter.can_handle_struct_type(struct_type): + return converter + + return None + + def is_semantic_struct_type(self, struct_type: pa.StructType) -> bool: + """Check if a struct type represents a semantic type.""" + return self.get_converter_for_struct_type(struct_type) is not None + + def has_python_type(self, python_type: type) -> bool: + """Check if a Python type is registered.""" + return self.get_converter_for_python_type(python_type) is not None + + def has_semantic_type(self, semantic_type_name: str) -> bool: + """Check if a semantic type is registered.""" + return semantic_type_name in self._name_to_converter + + def list_semantic_types(self) -> list[str]: + """Get all registered semantic type names.""" + return list(self._name_to_converter.keys()) + + def list_python_types(self) -> list[type]: + """Get all registered Python types.""" + return list(self._python_to_converter.keys()) + + +# Conversion utilities +class SemanticStructConverter: + """Main converter class for working with semantic structs.""" + + def __init__(self, registry: SemanticTypeRegistry): + self.registry = registry + + def python_to_struct_dict(self, value: Any) -> dict[str, Any] | None: + """Convert Python value to struct dict if it's a semantic type.""" + converter = self.registry.get_converter_for_python_type(type(value)) + if converter: + return converter.python_to_struct_dict(value) + return None + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + """Convert struct dict back to Python value.""" + semantic_type = struct_dict.get("semantic_type") + if not semantic_type: + raise ValueError("Struct dict missing 'semantic_type' field") + + converter = self.registry.get_converter_for_semantic_type(semantic_type) + if not converter: + raise ValueError(f"No converter found for semantic type '{semantic_type}'") + + return converter.struct_dict_to_python(struct_dict) + + def is_semantic_struct_dict(self, struct_dict: dict[str, Any]) -> bool: + """Check if a dict represents a semantic struct.""" + semantic_type = struct_dict.get("semantic_type") + if not semantic_type: + return False + return self.registry.has_semantic_type(semantic_type) + + def python_to_arrow_array(self, values: list[Any]) -> pa.Array: + """Convert list of Python values to Arrow array of structs.""" + if not values: + raise ValueError("Cannot convert empty list") + + # Check if first value is 
a semantic type + first_converter = self.registry.get_converter_for_python_type(type(values[0])) + if not first_converter: + raise ValueError(f"No semantic type converter for {type(values[0])}") + + # Convert all values to struct dicts + struct_dicts = [] + for value in values: + converter = self.registry.get_converter_for_python_type(type(value)) + if converter is None or converter != first_converter: + raise ValueError("All values must be the same semantic type") + struct_dicts.append(converter.python_to_struct_dict(value)) + + # Create Arrow array + return pa.array(struct_dicts, type=first_converter.arrow_struct_type) + + def arrow_array_to_python(self, array: pa.Array) -> list[Any]: + """Convert Arrow struct array back to list of Python values.""" + if not pa.types.is_struct(array.type): + raise ValueError(f"Expected struct array, got {array.type}") + + converter = self.registry.get_converter_for_struct_type(array.type) + if not converter: + raise ValueError(f"No converter found for struct type {array.type}") + + # Convert each struct to Python value + python_values = [] + for i in range(len(array)): + struct_scalar = array[i] + if struct_scalar.is_valid: + struct_dict = struct_scalar.as_py() + python_values.append(converter.struct_dict_to_python(struct_dict)) + else: + python_values.append(None) + + return python_values + + +# Default registry with Path support +def create_default_registry() -> SemanticTypeRegistry: + """Create default registry with built-in semantic types.""" + registry = SemanticTypeRegistry() + registry.register_converter(PathStructConverter()) + return registry + + +# Global default registry +DEFAULT_REGISTRY = create_default_registry() diff --git a/src/orcapod/semantic_types/table_converters.py b/src/orcapod/semantic_types/table_converters.py new file mode 100644 index 0000000..c1f9265 --- /dev/null +++ b/src/orcapod/semantic_types/table_converters.py @@ -0,0 +1,318 @@ +""" +Schema system for struct-based semantic types. + +This replaces the metadata-based schema handling with explicit struct types +in the Arrow schema itself. +""" + +from collections.abc import Mapping +from typing import Any, Self +import pyarrow as pa + +from orcapod.types import TypeSpec +from .struct_converters import ( + StructConverter, + SemanticTypeRegistry, + SemanticStructConverter, +) + + +class SemanticSchema: + """Schema that handles semantic types as explicit struct fields.""" + + def __init__(self, python_schema: TypeSpec, registry: SemanticTypeRegistry): + """ + Create a semantic schema. 
+ + Args: + schema_dict: Mapping of field names to Python types + registry: Semantic type registry to use + """ + self.python_schema = dict(python_schema) + # TODO: integrate with data context system + self.registry = registry # or DEFAULT_REGISTRY + self.converter = SemanticStructConverter(self.registry) + + def to_arrow_schema(self) -> pa.Schema: + """Convert to Arrow schema with semantic types as structs.""" + fields = [] + + for field_name, python_type in self.python_schema.items(): + # Check if this is a semantic type + converter = self.registry.get_converter_for_python_type(python_type) + + if converter: + # Use the struct type for semantic types + arrow_type = converter.arrow_struct_type + else: + # Use standard Arrow types for regular types + arrow_type = self._python_to_arrow_type(python_type) + + fields.append(pa.field(field_name, arrow_type)) + + return pa.schema(fields) + + def _python_to_arrow_type(self, python_type: type) -> pa.DataType: + """Convert Python type to Arrow type for non-semantic types.""" + type_mapping = { + int: pa.int64(), + float: pa.float64(), + str: pa.large_string(), + bool: pa.bool_(), + bytes: pa.binary(), + } + + if python_type in type_mapping: + return type_mapping[python_type] + else: + raise TypeError(f"Unsupported Python type: {python_type}") + + @classmethod + def from_arrow_schema( + cls, arrow_schema: pa.Schema, registry: SemanticTypeRegistry + ) -> "SemanticSchema": + """Create SemanticSchema from Arrow schema.""" + schema_dict = {} + + for field in arrow_schema: + if pa.types.is_struct(field.type): + # Check if this is a semantic struct + converter = registry.get_converter_for_struct_type(field.type) + if converter: + schema_dict[field.name] = converter.python_type + else: + # Regular struct - not supported yet + # TODO: support by constructing typed dictionary + raise ValueError( + f"Non-semantic struct types not supported: {field.type}" + ) + else: + # Regular Arrow type + schema_dict[field.name] = cls._arrow_to_python_type(field.type) + + return cls(schema_dict, registry) + + @staticmethod + def _arrow_to_python_type(arrow_type: pa.DataType) -> type: + """Convert Arrow type to Python type.""" + if pa.types.is_integer(arrow_type): + return int + elif pa.types.is_floating(arrow_type): + return float + elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): + return str + elif pa.types.is_boolean(arrow_type): + return bool + elif pa.types.is_binary(arrow_type): + return bytes + else: + raise TypeError(f"Unsupported Arrow type: {arrow_type}") + + def get_semantic_fields(self) -> dict[str, type]: + """Get fields that are semantic types.""" + semantic_fields = {} + for field_name, python_type in self.python_schema.items(): + if self.registry.has_python_type(python_type): + semantic_fields[field_name] = python_type + return semantic_fields + + def get_regular_fields(self) -> dict[str, type]: + """Get fields that are regular (non-semantic) types.""" + regular_fields = {} + for field_name, python_type in self.python_schema.items(): + if not self.registry.has_python_type(python_type): + regular_fields[field_name] = python_type + return regular_fields + + +class SchemaSemanticTableConverter: + """Schema-specific semantic converter that pre-resolves semantic type converters for efficiency. + + This converter is optimized for batch processing of data with a consistent schema. + It pre-resolves all semantic type converters during initialization to avoid + repeated registry lookups during data conversion. 
+ """ + + def __init__(self, schema: SemanticSchema): + """ + Create converter for a specific schema. + + Args: + schema: Semantic schema defining field types and semantic mappings + """ + self.schema = schema + + # Pre-resolve converters for each semantic field (performance optimization) + self.field_converters: dict[str, StructConverter] = {} + self.semantic_fields = set() + self.regular_fields = set() + + for field_name, python_type in schema.python_schema.items(): + converter = self.schema.registry.get_converter_for_python_type(python_type) + if converter: + self.field_converters[field_name] = converter + self.semantic_fields.add(field_name) + else: + self.regular_fields.add(field_name) + + @classmethod + def from_python_schema( + cls, python_schema: TypeSpec, registry: SemanticTypeRegistry + ) -> Self: + """Factory method to create converter from schema.""" + return cls(SemanticSchema(python_schema, registry)) + + @classmethod + def from_arrow_schema( + cls, arrow_schema: "pa.Schema", registry: SemanticTypeRegistry + ) -> Self: + return cls(SemanticSchema.from_arrow_schema(arrow_schema, registry)) + + def python_dict_to_struct_dict( + self, data_dict: Mapping[str, Any] + ) -> dict[str, Any]: + """Convert Python dict to struct dict for semantic fields.""" + result = dict(data_dict) + + for field_name, converter in self.field_converters.items(): + if field_name in result and result[field_name] is not None: + result[field_name] = converter.python_to_struct_dict(result[field_name]) + + return result + + def struct_dict_to_python_dict( + self, struct_dict: Mapping[str, Any] + ) -> dict[str, Any]: + """Convert struct dict back to Python dict for semantic fields.""" + result = dict(struct_dict) + + for field_name, converter in self.field_converters.items(): + if field_name in result and result[field_name] is not None: + if isinstance(result[field_name], dict): + result[field_name] = converter.struct_dict_to_python( + result[field_name] + ) + + return result + + def python_dicts_to_arrow_table(self, data_dicts: list[dict[str, Any]]) -> pa.Table: + """Convert list of Python dicts to Arrow table with semantic structs.""" + if not data_dicts: + raise ValueError("Cannot create table from empty list") + + # Process each field using pre-resolved converters + arrow_data = {} + + for field_name in self.schema.python_schema.keys(): + values = [d.get(field_name) for d in data_dicts] + + if field_name in self.field_converters: + # Semantic field - convert to structs using pre-resolved converter + converter = self.field_converters[field_name] + struct_dicts = [] + for value in values: + if value is not None: + struct_dicts.append(converter.python_to_struct_dict(value)) + else: + struct_dicts.append(None) + arrow_data[field_name] = pa.array( + struct_dicts, type=converter.arrow_struct_type + ) + else: + # Regular field + arrow_data[field_name] = pa.array(values) + + return pa.table(arrow_data, schema=self.schema.to_arrow_schema()) + + def arrow_table_to_python_dicts(self, table: pa.Table) -> list[dict[str, Any]]: + """Convert Arrow table back to list of Python dicts.""" + # Convert table to list of dictionaries + raw_dicts = table.to_pylist() + + # Process each dictionary to convert structs back to Python objects + python_dicts = [] + for raw_dict in raw_dicts: + python_dict = {} + for field_name, value in raw_dict.items(): + if field_name in self.field_converters and isinstance(value, dict): + # Convert semantic struct back to Python object using pre-resolved converter + converter = 
self.field_converters[field_name] + python_dict[field_name] = converter.struct_dict_to_python(value) + else: + # Regular value + python_dict[field_name] = value + python_dicts.append(python_dict) + + return python_dicts + + def python_dict_to_arrow_table(self, data_dict: dict[str, Any]) -> pa.Table: + """Convert single Python dict to Arrow table.""" + return self.python_dicts_to_arrow_table([data_dict]) + + +class SemanticTableConverter: + """General-purpose converter for working with semantic types without pre-defined schema.""" + + def __init__(self, registry: SemanticTypeRegistry): + self.registry = registry + self.struct_converter = SemanticStructConverter(self.registry) + + def python_dict_to_arrow_table( + self, data_dict: dict[str, Any], schema: SemanticSchema | None = None + ) -> pa.Table: + """Convert dictionary of Python values to Arrow table.""" + if schema is None: + # Infer schema from data + schema_dict = {key: type(value) for key, value in data_dict.items()} + schema = SemanticSchema(schema_dict, self.registry) + + # Use schema-specific converter for efficiency + converter = SchemaSemanticTableConverter(schema) + return converter.python_dict_to_arrow_table(data_dict) + + def arrow_table_to_python_dicts(self, table: pa.Table) -> list[dict[str, Any]]: + """Convert Arrow table back to list of Python dictionaries.""" + # Infer schema from Arrow table + schema = SemanticSchema.from_arrow_schema(table.schema, self.registry) + + # Use schema-specific converter for efficiency + converter = SchemaSemanticTableConverter(schema) + return converter.arrow_table_to_python_dicts(table) + + def python_dicts_to_arrow_table( + self, dicts: list[dict[str, Any]], schema: SemanticSchema | None = None + ) -> pa.Table: + """Convert list of Python dictionaries to Arrow table.""" + if not dicts: + raise ValueError("Cannot create table from empty list") + + if schema is None: + # Infer schema from first dictionary + schema_dict = {key: type(value) for key, value in dicts[0].items()} + schema = SemanticSchema(schema_dict, self.registry) + + # Use schema-specific converter for efficiency + converter = SchemaSemanticTableConverter(schema) + return converter.python_dicts_to_arrow_table(dicts) + + +# Utility functions for working with semantic tables +def create_semantic_table( + data: dict[str, Any] | list[dict[str, Any]], + registry: SemanticTypeRegistry, +) -> pa.Table: + """Convenience function to create Arrow table with semantic types.""" + converter = SemanticTableConverter(registry) + + if isinstance(data, dict): + return converter.python_dict_to_arrow_table(data) + else: + return converter.python_dicts_to_arrow_table(data) + + +def extract_python_data( + table: pa.Table, registry: SemanticTypeRegistry +) -> list[dict[str, Any]]: + """Convenience function to extract Python data from semantic table.""" + converter = SemanticTableConverter(registry) + return converter.arrow_table_to_python_dicts(table) From 15472950c2d0e60c4272299a6eacbc5ca6c7ec90 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Thu, 31 Jul 2025 19:16:32 +0000 Subject: [PATCH 160/224] build: add jsonschema build dependency --- pyproject.toml | 1 + uv.lock | 178 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d0bbe4f..6af8bd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dev = [ "deltalake>=1.0.2", "httpie>=3.2.4", "ipykernel>=6.29.5", + "jsonschema>=4.25.0", "pyarrow-stubs>=20.0.0.20250716", "pyiceberg>=0.9.1", "pytest>=8.3.5", diff --git a/uv.lock b/uv.lock index 03d0483..637997f 100644 --- a/uv.lock +++ b/uv.lock @@ -117,6 +117,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" }, ] +[[package]] +name = "attrs" +version = "25.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/1367933a8532ee6ff8d63537de4f1177af4bff9f3e829baf7331f595bb24/attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b", size = 812032, upload-time = "2025-03-13T11:10:22.779Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815, upload-time = "2025-03-13T11:10:21.14Z" }, +] + [[package]] name = "beartype" version = "0.21.0" @@ -723,6 +732,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, ] +[[package]] +name = "jsonschema" +version = "4.25.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/00/a297a868e9d0784450faa7365c2172a7d6110c763e30ba861867c32ae6a9/jsonschema-4.25.0.tar.gz", hash = "sha256:e63acf5c11762c0e6672ffb61482bdf57f0876684d8d249c0fe2d730d48bc55f", size = 356830, upload-time = "2025-07-18T15:39:45.11Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl", hash = "sha256:24c2e8da302de79c8b9382fee3e76b355e44d2a4364bb207159ce10b517bd716", size = 89184, upload-time = "2025-07-18T15:39:42.956Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bf/ce/46fbd9c8119cfc3581ee5643ea49464d168028cfb5caff5fc0596d0cf914/jsonschema_specifications-2025.4.1.tar.gz", hash = "sha256:630159c9f4dbea161a6a2205c3011cc4f18ff381b189fff48bb39b9bf26ae608", size = 15513, upload-time = "2025-04-23T12:34:07.418Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl", hash = 
"sha256:4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af", size = 18437, upload-time = "2025-04-23T12:34:05.422Z" }, +] + [[package]] name = "jupyter-client" version = "8.6.3" @@ -1216,6 +1252,7 @@ dev = [ { name = "deltalake" }, { name = "httpie" }, { name = "ipykernel" }, + { name = "jsonschema" }, { name = "pyarrow-stubs" }, { name = "pyiceberg" }, { name = "pytest" }, @@ -1245,6 +1282,7 @@ dev = [ { name = "deltalake", specifier = ">=1.0.2" }, { name = "httpie", specifier = ">=3.2.4" }, { name = "ipykernel", specifier = ">=6.29.5" }, + { name = "jsonschema", specifier = ">=4.25.0" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pytest", specifier = ">=8.3.5" }, @@ -1937,6 +1975,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/67/e60968d3b0e077495a8fee89cf3f2373db98e528288a48f1ee44967f6e8c/redis-6.2.0-py3-none-any.whl", hash = "sha256:c8ddf316ee0aab65f04a11229e94a64b2618451dab7a67cb2f77eb799d872d5e", size = 278659, upload-time = "2025-05-28T05:01:16.955Z" }, ] +[[package]] +name = "referencing" +version = "0.36.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2f/db/98b5c277be99dd18bfd91dd04e1b759cad18d1a338188c936e92f921c7e2/referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa", size = 74744, upload-time = "2025-01-25T08:48:16.138Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0", size = 26775, upload-time = "2025-01-25T08:48:14.241Z" }, +] + [[package]] name = "requests" version = "2.32.3" @@ -1983,6 +2035,132 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424, upload-time = "2024-11-01T16:43:55.817Z" }, ] +[[package]] +name = "rpds-py" +version = "0.26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/aa/4456d84bbb54adc6a916fb10c9b374f78ac840337644e4a5eda229c81275/rpds_py-0.26.0.tar.gz", hash = "sha256:20dae58a859b0906f0685642e591056f1e787f3a8b39c8e8749a45dc7d26bdb0", size = 27385, upload-time = "2025-07-01T15:57:13.958Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/31/1459645f036c3dfeacef89e8e5825e430c77dde8489f3b99eaafcd4a60f5/rpds_py-0.26.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4c70c70f9169692b36307a95f3d8c0a9fcd79f7b4a383aad5eaa0e9718b79b37", size = 372466, upload-time = "2025-07-01T15:53:40.55Z" }, + { url = "https://files.pythonhosted.org/packages/dd/ff/3d0727f35836cc8773d3eeb9a46c40cc405854e36a8d2e951f3a8391c976/rpds_py-0.26.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:777c62479d12395bfb932944e61e915741e364c843afc3196b694db3d669fcd0", size = 357825, upload-time = "2025-07-01T15:53:42.247Z" }, + { url = "https://files.pythonhosted.org/packages/bf/ce/badc5e06120a54099ae287fa96d82cbb650a5f85cf247ffe19c7b157fd1f/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ec671691e72dff75817386aa02d81e708b5a7ec0dec6669ec05213ff6b77e1bd", size = 381530, upload-time = "2025-07-01T15:53:43.585Z" }, + { url = "https://files.pythonhosted.org/packages/1e/a5/fa5d96a66c95d06c62d7a30707b6a4cfec696ab8ae280ee7be14e961e118/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6a1cb5d6ce81379401bbb7f6dbe3d56de537fb8235979843f0d53bc2e9815a79", size = 396933, upload-time = "2025-07-01T15:53:45.78Z" }, + { url = "https://files.pythonhosted.org/packages/00/a7/7049d66750f18605c591a9db47d4a059e112a0c9ff8de8daf8fa0f446bba/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f789e32fa1fb6a7bf890e0124e7b42d1e60d28ebff57fe806719abb75f0e9a3", size = 513973, upload-time = "2025-07-01T15:53:47.085Z" }, + { url = "https://files.pythonhosted.org/packages/0e/f1/528d02c7d6b29d29fac8fd784b354d3571cc2153f33f842599ef0cf20dd2/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c55b0a669976cf258afd718de3d9ad1b7d1fe0a91cd1ab36f38b03d4d4aeaaf", size = 402293, upload-time = "2025-07-01T15:53:48.117Z" }, + { url = "https://files.pythonhosted.org/packages/15/93/fde36cd6e4685df2cd08508f6c45a841e82f5bb98c8d5ecf05649522acb5/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c70d9ec912802ecfd6cd390dadb34a9578b04f9bcb8e863d0a7598ba5e9e7ccc", size = 383787, upload-time = "2025-07-01T15:53:50.874Z" }, + { url = "https://files.pythonhosted.org/packages/69/f2/5007553aaba1dcae5d663143683c3dfd03d9395289f495f0aebc93e90f24/rpds_py-0.26.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3021933c2cb7def39d927b9862292e0f4c75a13d7de70eb0ab06efed4c508c19", size = 416312, upload-time = "2025-07-01T15:53:52.046Z" }, + { url = "https://files.pythonhosted.org/packages/8f/a7/ce52c75c1e624a79e48a69e611f1c08844564e44c85db2b6f711d76d10ce/rpds_py-0.26.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8a7898b6ca3b7d6659e55cdac825a2e58c638cbf335cde41f4619e290dd0ad11", size = 558403, upload-time = "2025-07-01T15:53:53.192Z" }, + { url = "https://files.pythonhosted.org/packages/79/d5/e119db99341cc75b538bf4cb80504129fa22ce216672fb2c28e4a101f4d9/rpds_py-0.26.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:12bff2ad9447188377f1b2794772f91fe68bb4bbfa5a39d7941fbebdbf8c500f", size = 588323, upload-time = "2025-07-01T15:53:54.336Z" }, + { url = "https://files.pythonhosted.org/packages/93/94/d28272a0b02f5fe24c78c20e13bbcb95f03dc1451b68e7830ca040c60bd6/rpds_py-0.26.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:191aa858f7d4902e975d4cf2f2d9243816c91e9605070aeb09c0a800d187e323", size = 554541, upload-time = "2025-07-01T15:53:55.469Z" }, + { url = "https://files.pythonhosted.org/packages/93/e0/8c41166602f1b791da892d976057eba30685486d2e2c061ce234679c922b/rpds_py-0.26.0-cp310-cp310-win32.whl", hash = "sha256:b37a04d9f52cb76b6b78f35109b513f6519efb481d8ca4c321f6a3b9580b3f45", size = 220442, upload-time = "2025-07-01T15:53:56.524Z" }, + { url = "https://files.pythonhosted.org/packages/87/f0/509736bb752a7ab50fb0270c2a4134d671a7b3038030837e5536c3de0e0b/rpds_py-0.26.0-cp310-cp310-win_amd64.whl", hash = "sha256:38721d4c9edd3eb6670437d8d5e2070063f305bfa2d5aa4278c51cedcd508a84", size = 231314, upload-time = "2025-07-01T15:53:57.842Z" }, + { url = "https://files.pythonhosted.org/packages/09/4c/4ee8f7e512030ff79fda1df3243c88d70fc874634e2dbe5df13ba4210078/rpds_py-0.26.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:9e8cb77286025bdb21be2941d64ac6ca016130bfdcd228739e8ab137eb4406ed", size = 372610, upload-time = "2025-07-01T15:53:58.844Z" }, + { url = "https://files.pythonhosted.org/packages/fa/9d/3dc16be00f14fc1f03c71b1d67c8df98263ab2710a2fbd65a6193214a527/rpds_py-0.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e09330b21d98adc8ccb2dbb9fc6cb434e8908d4c119aeaa772cb1caab5440a0", size = 358032, upload-time = "2025-07-01T15:53:59.985Z" }, + { url = "https://files.pythonhosted.org/packages/e7/5a/7f1bf8f045da2866324a08ae80af63e64e7bfaf83bd31f865a7b91a58601/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c9c1b92b774b2e68d11193dc39620d62fd8ab33f0a3c77ecdabe19c179cdbc1", size = 381525, upload-time = "2025-07-01T15:54:01.162Z" }, + { url = "https://files.pythonhosted.org/packages/45/8a/04479398c755a066ace10e3d158866beb600867cacae194c50ffa783abd0/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:824e6d3503ab990d7090768e4dfd9e840837bae057f212ff9f4f05ec6d1975e7", size = 397089, upload-time = "2025-07-01T15:54:02.319Z" }, + { url = "https://files.pythonhosted.org/packages/72/88/9203f47268db488a1b6d469d69c12201ede776bb728b9d9f29dbfd7df406/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ad7fd2258228bf288f2331f0a6148ad0186b2e3643055ed0db30990e59817a6", size = 514255, upload-time = "2025-07-01T15:54:03.38Z" }, + { url = "https://files.pythonhosted.org/packages/f5/b4/01ce5d1e853ddf81fbbd4311ab1eff0b3cf162d559288d10fd127e2588b5/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0dc23bbb3e06ec1ea72d515fb572c1fea59695aefbffb106501138762e1e915e", size = 402283, upload-time = "2025-07-01T15:54:04.923Z" }, + { url = "https://files.pythonhosted.org/packages/34/a2/004c99936997bfc644d590a9defd9e9c93f8286568f9c16cdaf3e14429a7/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d80bf832ac7b1920ee29a426cdca335f96a2b5caa839811803e999b41ba9030d", size = 383881, upload-time = "2025-07-01T15:54:06.482Z" }, + { url = "https://files.pythonhosted.org/packages/05/1b/ef5fba4a8f81ce04c427bfd96223f92f05e6cd72291ce9d7523db3b03a6c/rpds_py-0.26.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0919f38f5542c0a87e7b4afcafab6fd2c15386632d249e9a087498571250abe3", size = 415822, upload-time = "2025-07-01T15:54:07.605Z" }, + { url = "https://files.pythonhosted.org/packages/16/80/5c54195aec456b292f7bd8aa61741c8232964063fd8a75fdde9c1e982328/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d422b945683e409000c888e384546dbab9009bb92f7c0b456e217988cf316107", size = 558347, upload-time = "2025-07-01T15:54:08.591Z" }, + { url = "https://files.pythonhosted.org/packages/f2/1c/1845c1b1fd6d827187c43afe1841d91678d7241cbdb5420a4c6de180a538/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:77a7711fa562ba2da1aa757e11024ad6d93bad6ad7ede5afb9af144623e5f76a", size = 587956, upload-time = "2025-07-01T15:54:09.963Z" }, + { url = "https://files.pythonhosted.org/packages/2e/ff/9e979329dd131aa73a438c077252ddabd7df6d1a7ad7b9aacf6261f10faa/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238e8c8610cb7c29460e37184f6799547f7e09e6a9bdbdab4e8edb90986a2318", size = 554363, upload-time = "2025-07-01T15:54:11.073Z" }, + { url = "https://files.pythonhosted.org/packages/00/8b/d78cfe034b71ffbe72873a136e71acc7a831a03e37771cfe59f33f6de8a2/rpds_py-0.26.0-cp311-cp311-win32.whl", hash = 
"sha256:893b022bfbdf26d7bedb083efeea624e8550ca6eb98bf7fea30211ce95b9201a", size = 220123, upload-time = "2025-07-01T15:54:12.382Z" }, + { url = "https://files.pythonhosted.org/packages/94/c1/3c8c94c7dd3905dbfde768381ce98778500a80db9924731d87ddcdb117e9/rpds_py-0.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:87a5531de9f71aceb8af041d72fc4cab4943648d91875ed56d2e629bef6d4c03", size = 231732, upload-time = "2025-07-01T15:54:13.434Z" }, + { url = "https://files.pythonhosted.org/packages/67/93/e936fbed1b734eabf36ccb5d93c6a2e9246fbb13c1da011624b7286fae3e/rpds_py-0.26.0-cp311-cp311-win_arm64.whl", hash = "sha256:de2713f48c1ad57f89ac25b3cb7daed2156d8e822cf0eca9b96a6f990718cc41", size = 221917, upload-time = "2025-07-01T15:54:14.559Z" }, + { url = "https://files.pythonhosted.org/packages/ea/86/90eb87c6f87085868bd077c7a9938006eb1ce19ed4d06944a90d3560fce2/rpds_py-0.26.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:894514d47e012e794f1350f076c427d2347ebf82f9b958d554d12819849a369d", size = 363933, upload-time = "2025-07-01T15:54:15.734Z" }, + { url = "https://files.pythonhosted.org/packages/63/78/4469f24d34636242c924626082b9586f064ada0b5dbb1e9d096ee7a8e0c6/rpds_py-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc921b96fa95a097add244da36a1d9e4f3039160d1d30f1b35837bf108c21136", size = 350447, upload-time = "2025-07-01T15:54:16.922Z" }, + { url = "https://files.pythonhosted.org/packages/ad/91/c448ed45efdfdade82348d5e7995e15612754826ea640afc20915119734f/rpds_py-0.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e1157659470aa42a75448b6e943c895be8c70531c43cb78b9ba990778955582", size = 384711, upload-time = "2025-07-01T15:54:18.101Z" }, + { url = "https://files.pythonhosted.org/packages/ec/43/e5c86fef4be7f49828bdd4ecc8931f0287b1152c0bb0163049b3218740e7/rpds_py-0.26.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:521ccf56f45bb3a791182dc6b88ae5f8fa079dd705ee42138c76deb1238e554e", size = 400865, upload-time = "2025-07-01T15:54:19.295Z" }, + { url = "https://files.pythonhosted.org/packages/55/34/e00f726a4d44f22d5c5fe2e5ddd3ac3d7fd3f74a175607781fbdd06fe375/rpds_py-0.26.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9def736773fd56b305c0eef698be5192c77bfa30d55a0e5885f80126c4831a15", size = 517763, upload-time = "2025-07-01T15:54:20.858Z" }, + { url = "https://files.pythonhosted.org/packages/52/1c/52dc20c31b147af724b16104500fba13e60123ea0334beba7b40e33354b4/rpds_py-0.26.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cdad4ea3b4513b475e027be79e5a0ceac8ee1c113a1a11e5edc3c30c29f964d8", size = 406651, upload-time = "2025-07-01T15:54:22.508Z" }, + { url = "https://files.pythonhosted.org/packages/2e/77/87d7bfabfc4e821caa35481a2ff6ae0b73e6a391bb6b343db2c91c2b9844/rpds_py-0.26.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82b165b07f416bdccf5c84546a484cc8f15137ca38325403864bfdf2b5b72f6a", size = 386079, upload-time = "2025-07-01T15:54:23.987Z" }, + { url = "https://files.pythonhosted.org/packages/e3/d4/7f2200c2d3ee145b65b3cddc4310d51f7da6a26634f3ac87125fd789152a/rpds_py-0.26.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d04cab0a54b9dba4d278fe955a1390da3cf71f57feb78ddc7cb67cbe0bd30323", size = 421379, upload-time = "2025-07-01T15:54:25.073Z" }, + { url = "https://files.pythonhosted.org/packages/ae/13/9fdd428b9c820869924ab62236b8688b122baa22d23efdd1c566938a39ba/rpds_py-0.26.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:79061ba1a11b6a12743a2b0f72a46aa2758613d454aa6ba4f5a265cc48850158", size = 562033, upload-time = "2025-07-01T15:54:26.225Z" }, + { url = "https://files.pythonhosted.org/packages/f3/e1/b69686c3bcbe775abac3a4c1c30a164a2076d28df7926041f6c0eb5e8d28/rpds_py-0.26.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:f405c93675d8d4c5ac87364bb38d06c988e11028a64b52a47158a355079661f3", size = 591639, upload-time = "2025-07-01T15:54:27.424Z" }, + { url = "https://files.pythonhosted.org/packages/5c/c9/1e3d8c8863c84a90197ac577bbc3d796a92502124c27092413426f670990/rpds_py-0.26.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dafd4c44b74aa4bed4b250f1aed165b8ef5de743bcca3b88fc9619b6087093d2", size = 557105, upload-time = "2025-07-01T15:54:29.93Z" }, + { url = "https://files.pythonhosted.org/packages/9f/c5/90c569649057622959f6dcc40f7b516539608a414dfd54b8d77e3b201ac0/rpds_py-0.26.0-cp312-cp312-win32.whl", hash = "sha256:3da5852aad63fa0c6f836f3359647870e21ea96cf433eb393ffa45263a170d44", size = 223272, upload-time = "2025-07-01T15:54:31.128Z" }, + { url = "https://files.pythonhosted.org/packages/7d/16/19f5d9f2a556cfed454eebe4d354c38d51c20f3db69e7b4ce6cff904905d/rpds_py-0.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:cf47cfdabc2194a669dcf7a8dbba62e37a04c5041d2125fae0233b720da6f05c", size = 234995, upload-time = "2025-07-01T15:54:32.195Z" }, + { url = "https://files.pythonhosted.org/packages/83/f0/7935e40b529c0e752dfaa7880224771b51175fce08b41ab4a92eb2fbdc7f/rpds_py-0.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:20ab1ae4fa534f73647aad289003f1104092890849e0266271351922ed5574f8", size = 223198, upload-time = "2025-07-01T15:54:33.271Z" }, + { url = "https://files.pythonhosted.org/packages/6a/67/bb62d0109493b12b1c6ab00de7a5566aa84c0e44217c2d94bee1bd370da9/rpds_py-0.26.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:696764a5be111b036256c0b18cd29783fab22154690fc698062fc1b0084b511d", size = 363917, upload-time = "2025-07-01T15:54:34.755Z" }, + { url = "https://files.pythonhosted.org/packages/4b/f3/34e6ae1925a5706c0f002a8d2d7f172373b855768149796af87bd65dcdb9/rpds_py-0.26.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e6c15d2080a63aaed876e228efe4f814bc7889c63b1e112ad46fdc8b368b9e1", size = 350073, upload-time = "2025-07-01T15:54:36.292Z" }, + { url = "https://files.pythonhosted.org/packages/75/83/1953a9d4f4e4de7fd0533733e041c28135f3c21485faaef56a8aadbd96b5/rpds_py-0.26.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390e3170babf42462739a93321e657444f0862c6d722a291accc46f9d21ed04e", size = 384214, upload-time = "2025-07-01T15:54:37.469Z" }, + { url = "https://files.pythonhosted.org/packages/48/0e/983ed1b792b3322ea1d065e67f4b230f3b96025f5ce3878cc40af09b7533/rpds_py-0.26.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7da84c2c74c0f5bc97d853d9e17bb83e2dcafcff0dc48286916001cc114379a1", size = 400113, upload-time = "2025-07-01T15:54:38.954Z" }, + { url = "https://files.pythonhosted.org/packages/69/7f/36c0925fff6f660a80be259c5b4f5e53a16851f946eb080351d057698528/rpds_py-0.26.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4c5fe114a6dd480a510b6d3661d09d67d1622c4bf20660a474507aaee7eeeee9", size = 515189, upload-time = "2025-07-01T15:54:40.57Z" }, + { url = "https://files.pythonhosted.org/packages/13/45/cbf07fc03ba7a9b54662c9badb58294ecfb24f828b9732970bd1a431ed5c/rpds_py-0.26.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:3100b3090269f3a7ea727b06a6080d4eb7439dca4c0e91a07c5d133bb1727ea7", size = 406998, upload-time = "2025-07-01T15:54:43.025Z" }, + { url = "https://files.pythonhosted.org/packages/6c/b0/8fa5e36e58657997873fd6a1cf621285ca822ca75b4b3434ead047daa307/rpds_py-0.26.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c03c9b0c64afd0320ae57de4c982801271c0c211aa2d37f3003ff5feb75bb04", size = 385903, upload-time = "2025-07-01T15:54:44.752Z" }, + { url = "https://files.pythonhosted.org/packages/4b/f7/b25437772f9f57d7a9fbd73ed86d0dcd76b4c7c6998348c070d90f23e315/rpds_py-0.26.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5963b72ccd199ade6ee493723d18a3f21ba7d5b957017607f815788cef50eaf1", size = 419785, upload-time = "2025-07-01T15:54:46.043Z" }, + { url = "https://files.pythonhosted.org/packages/a7/6b/63ffa55743dfcb4baf2e9e77a0b11f7f97ed96a54558fcb5717a4b2cd732/rpds_py-0.26.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9da4e873860ad5bab3291438525cae80169daecbfafe5657f7f5fb4d6b3f96b9", size = 561329, upload-time = "2025-07-01T15:54:47.64Z" }, + { url = "https://files.pythonhosted.org/packages/2f/07/1f4f5e2886c480a2346b1e6759c00278b8a69e697ae952d82ae2e6ee5db0/rpds_py-0.26.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5afaddaa8e8c7f1f7b4c5c725c0070b6eed0228f705b90a1732a48e84350f4e9", size = 590875, upload-time = "2025-07-01T15:54:48.9Z" }, + { url = "https://files.pythonhosted.org/packages/cc/bc/e6639f1b91c3a55f8c41b47d73e6307051b6e246254a827ede730624c0f8/rpds_py-0.26.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4916dc96489616a6f9667e7526af8fa693c0fdb4f3acb0e5d9f4400eb06a47ba", size = 556636, upload-time = "2025-07-01T15:54:50.619Z" }, + { url = "https://files.pythonhosted.org/packages/05/4c/b3917c45566f9f9a209d38d9b54a1833f2bb1032a3e04c66f75726f28876/rpds_py-0.26.0-cp313-cp313-win32.whl", hash = "sha256:2a343f91b17097c546b93f7999976fd6c9d5900617aa848c81d794e062ab302b", size = 222663, upload-time = "2025-07-01T15:54:52.023Z" }, + { url = "https://files.pythonhosted.org/packages/e0/0b/0851bdd6025775aaa2365bb8de0697ee2558184c800bfef8d7aef5ccde58/rpds_py-0.26.0-cp313-cp313-win_amd64.whl", hash = "sha256:0a0b60701f2300c81b2ac88a5fb893ccfa408e1c4a555a77f908a2596eb875a5", size = 234428, upload-time = "2025-07-01T15:54:53.692Z" }, + { url = "https://files.pythonhosted.org/packages/ed/e8/a47c64ed53149c75fb581e14a237b7b7cd18217e969c30d474d335105622/rpds_py-0.26.0-cp313-cp313-win_arm64.whl", hash = "sha256:257d011919f133a4746958257f2c75238e3ff54255acd5e3e11f3ff41fd14256", size = 222571, upload-time = "2025-07-01T15:54:54.822Z" }, + { url = "https://files.pythonhosted.org/packages/89/bf/3d970ba2e2bcd17d2912cb42874107390f72873e38e79267224110de5e61/rpds_py-0.26.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:529c8156d7506fba5740e05da8795688f87119cce330c244519cf706a4a3d618", size = 360475, upload-time = "2025-07-01T15:54:56.228Z" }, + { url = "https://files.pythonhosted.org/packages/82/9f/283e7e2979fc4ec2d8ecee506d5a3675fce5ed9b4b7cb387ea5d37c2f18d/rpds_py-0.26.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f53ec51f9d24e9638a40cabb95078ade8c99251945dad8d57bf4aabe86ecee35", size = 346692, upload-time = "2025-07-01T15:54:58.561Z" }, + { url = "https://files.pythonhosted.org/packages/e3/03/7e50423c04d78daf391da3cc4330bdb97042fc192a58b186f2d5deb7befd/rpds_py-0.26.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab504c4d654e4a29558eaa5bb8cea5fdc1703ea60a8099ffd9c758472cf913f", size = 379415, 
upload-time = "2025-07-01T15:54:59.751Z" }, + { url = "https://files.pythonhosted.org/packages/57/00/d11ee60d4d3b16808432417951c63df803afb0e0fc672b5e8d07e9edaaae/rpds_py-0.26.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fd0641abca296bc1a00183fe44f7fced8807ed49d501f188faa642d0e4975b83", size = 391783, upload-time = "2025-07-01T15:55:00.898Z" }, + { url = "https://files.pythonhosted.org/packages/08/b3/1069c394d9c0d6d23c5b522e1f6546b65793a22950f6e0210adcc6f97c3e/rpds_py-0.26.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69b312fecc1d017b5327afa81d4da1480f51c68810963a7336d92203dbb3d4f1", size = 512844, upload-time = "2025-07-01T15:55:02.201Z" }, + { url = "https://files.pythonhosted.org/packages/08/3b/c4fbf0926800ed70b2c245ceca99c49f066456755f5d6eb8863c2c51e6d0/rpds_py-0.26.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c741107203954f6fc34d3066d213d0a0c40f7bb5aafd698fb39888af277c70d8", size = 402105, upload-time = "2025-07-01T15:55:03.698Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b0/db69b52ca07413e568dae9dc674627a22297abb144c4d6022c6d78f1e5cc/rpds_py-0.26.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc3e55a7db08dc9a6ed5fb7103019d2c1a38a349ac41901f9f66d7f95750942f", size = 383440, upload-time = "2025-07-01T15:55:05.398Z" }, + { url = "https://files.pythonhosted.org/packages/4c/e1/c65255ad5b63903e56b3bb3ff9dcc3f4f5c3badde5d08c741ee03903e951/rpds_py-0.26.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e851920caab2dbcae311fd28f4313c6953993893eb5c1bb367ec69d9a39e7ed", size = 412759, upload-time = "2025-07-01T15:55:08.316Z" }, + { url = "https://files.pythonhosted.org/packages/e4/22/bb731077872377a93c6e93b8a9487d0406c70208985831034ccdeed39c8e/rpds_py-0.26.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dfbf280da5f876d0b00c81f26bedce274e72a678c28845453885a9b3c22ae632", size = 556032, upload-time = "2025-07-01T15:55:09.52Z" }, + { url = "https://files.pythonhosted.org/packages/e0/8b/393322ce7bac5c4530fb96fc79cc9ea2f83e968ff5f6e873f905c493e1c4/rpds_py-0.26.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:1cc81d14ddfa53d7f3906694d35d54d9d3f850ef8e4e99ee68bc0d1e5fed9a9c", size = 585416, upload-time = "2025-07-01T15:55:11.216Z" }, + { url = "https://files.pythonhosted.org/packages/49/ae/769dc372211835bf759319a7aae70525c6eb523e3371842c65b7ef41c9c6/rpds_py-0.26.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dca83c498b4650a91efcf7b88d669b170256bf8017a5db6f3e06c2bf031f57e0", size = 554049, upload-time = "2025-07-01T15:55:13.004Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f9/4c43f9cc203d6ba44ce3146246cdc38619d92c7bd7bad4946a3491bd5b70/rpds_py-0.26.0-cp313-cp313t-win32.whl", hash = "sha256:4d11382bcaf12f80b51d790dee295c56a159633a8e81e6323b16e55d81ae37e9", size = 218428, upload-time = "2025-07-01T15:55:14.486Z" }, + { url = "https://files.pythonhosted.org/packages/7e/8b/9286b7e822036a4a977f2f1e851c7345c20528dbd56b687bb67ed68a8ede/rpds_py-0.26.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ff110acded3c22c033e637dd8896e411c7d3a11289b2edf041f86663dbc791e9", size = 231524, upload-time = "2025-07-01T15:55:15.745Z" }, + { url = "https://files.pythonhosted.org/packages/55/07/029b7c45db910c74e182de626dfdae0ad489a949d84a468465cd0ca36355/rpds_py-0.26.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:da619979df60a940cd434084355c514c25cf8eb4cf9a508510682f6c851a4f7a", size = 364292, upload-time = 
"2025-07-01T15:55:17.001Z" }, + { url = "https://files.pythonhosted.org/packages/13/d1/9b3d3f986216b4d1f584878dca15ce4797aaf5d372d738974ba737bf68d6/rpds_py-0.26.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ea89a2458a1a75f87caabefe789c87539ea4e43b40f18cff526052e35bbb4fdf", size = 350334, upload-time = "2025-07-01T15:55:18.922Z" }, + { url = "https://files.pythonhosted.org/packages/18/98/16d5e7bc9ec715fa9668731d0cf97f6b032724e61696e2db3d47aeb89214/rpds_py-0.26.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feac1045b3327a45944e7dcbeb57530339f6b17baff154df51ef8b0da34c8c12", size = 384875, upload-time = "2025-07-01T15:55:20.399Z" }, + { url = "https://files.pythonhosted.org/packages/f9/13/aa5e2b1ec5ab0e86a5c464d53514c0467bec6ba2507027d35fc81818358e/rpds_py-0.26.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b818a592bd69bfe437ee8368603d4a2d928c34cffcdf77c2e761a759ffd17d20", size = 399993, upload-time = "2025-07-01T15:55:21.729Z" }, + { url = "https://files.pythonhosted.org/packages/17/03/8021810b0e97923abdbab6474c8b77c69bcb4b2c58330777df9ff69dc559/rpds_py-0.26.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a8b0dd8648709b62d9372fc00a57466f5fdeefed666afe3fea5a6c9539a0331", size = 516683, upload-time = "2025-07-01T15:55:22.918Z" }, + { url = "https://files.pythonhosted.org/packages/dc/b1/da8e61c87c2f3d836954239fdbbfb477bb7b54d74974d8f6fcb34342d166/rpds_py-0.26.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6d3498ad0df07d81112aa6ec6c95a7e7b1ae00929fb73e7ebee0f3faaeabad2f", size = 408825, upload-time = "2025-07-01T15:55:24.207Z" }, + { url = "https://files.pythonhosted.org/packages/38/bc/1fc173edaaa0e52c94b02a655db20697cb5fa954ad5a8e15a2c784c5cbdd/rpds_py-0.26.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24a4146ccb15be237fdef10f331c568e1b0e505f8c8c9ed5d67759dac58ac246", size = 387292, upload-time = "2025-07-01T15:55:25.554Z" }, + { url = "https://files.pythonhosted.org/packages/7c/eb/3a9bb4bd90867d21916f253caf4f0d0be7098671b6715ad1cead9fe7bab9/rpds_py-0.26.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a9a63785467b2d73635957d32a4f6e73d5e4df497a16a6392fa066b753e87387", size = 420435, upload-time = "2025-07-01T15:55:27.798Z" }, + { url = "https://files.pythonhosted.org/packages/cd/16/e066dcdb56f5632713445271a3f8d3d0b426d51ae9c0cca387799df58b02/rpds_py-0.26.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:de4ed93a8c91debfd5a047be327b7cc8b0cc6afe32a716bbbc4aedca9e2a83af", size = 562410, upload-time = "2025-07-01T15:55:29.057Z" }, + { url = "https://files.pythonhosted.org/packages/60/22/ddbdec7eb82a0dc2e455be44c97c71c232983e21349836ce9f272e8a3c29/rpds_py-0.26.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:caf51943715b12af827696ec395bfa68f090a4c1a1d2509eb4e2cb69abbbdb33", size = 590724, upload-time = "2025-07-01T15:55:30.719Z" }, + { url = "https://files.pythonhosted.org/packages/2c/b4/95744085e65b7187d83f2fcb0bef70716a1ea0a9e5d8f7f39a86e5d83424/rpds_py-0.26.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:4a59e5bc386de021f56337f757301b337d7ab58baa40174fb150accd480bc953", size = 558285, upload-time = "2025-07-01T15:55:31.981Z" }, + { url = "https://files.pythonhosted.org/packages/37/37/6309a75e464d1da2559446f9c811aa4d16343cebe3dbb73701e63f760caa/rpds_py-0.26.0-cp314-cp314-win32.whl", hash = "sha256:92c8db839367ef16a662478f0a2fe13e15f2227da3c1430a782ad0f6ee009ec9", size = 223459, upload-time = 
"2025-07-01T15:55:33.312Z" }, + { url = "https://files.pythonhosted.org/packages/d9/6f/8e9c11214c46098b1d1391b7e02b70bb689ab963db3b19540cba17315291/rpds_py-0.26.0-cp314-cp314-win_amd64.whl", hash = "sha256:b0afb8cdd034150d4d9f53926226ed27ad15b7f465e93d7468caaf5eafae0d37", size = 236083, upload-time = "2025-07-01T15:55:34.933Z" }, + { url = "https://files.pythonhosted.org/packages/47/af/9c4638994dd623d51c39892edd9d08e8be8220a4b7e874fa02c2d6e91955/rpds_py-0.26.0-cp314-cp314-win_arm64.whl", hash = "sha256:ca3f059f4ba485d90c8dc75cb5ca897e15325e4e609812ce57f896607c1c0867", size = 223291, upload-time = "2025-07-01T15:55:36.202Z" }, + { url = "https://files.pythonhosted.org/packages/4d/db/669a241144460474aab03e254326b32c42def83eb23458a10d163cb9b5ce/rpds_py-0.26.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:5afea17ab3a126006dc2f293b14ffc7ef3c85336cf451564a0515ed7648033da", size = 361445, upload-time = "2025-07-01T15:55:37.483Z" }, + { url = "https://files.pythonhosted.org/packages/3b/2d/133f61cc5807c6c2fd086a46df0eb8f63a23f5df8306ff9f6d0fd168fecc/rpds_py-0.26.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:69f0c0a3df7fd3a7eec50a00396104bb9a843ea6d45fcc31c2d5243446ffd7a7", size = 347206, upload-time = "2025-07-01T15:55:38.828Z" }, + { url = "https://files.pythonhosted.org/packages/05/bf/0e8fb4c05f70273469eecf82f6ccf37248558526a45321644826555db31b/rpds_py-0.26.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:801a71f70f9813e82d2513c9a96532551fce1e278ec0c64610992c49c04c2dad", size = 380330, upload-time = "2025-07-01T15:55:40.175Z" }, + { url = "https://files.pythonhosted.org/packages/d4/a8/060d24185d8b24d3923322f8d0ede16df4ade226a74e747b8c7c978e3dd3/rpds_py-0.26.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:df52098cde6d5e02fa75c1f6244f07971773adb4a26625edd5c18fee906fa84d", size = 392254, upload-time = "2025-07-01T15:55:42.015Z" }, + { url = "https://files.pythonhosted.org/packages/b9/7b/7c2e8a9ee3e6bc0bae26bf29f5219955ca2fbb761dca996a83f5d2f773fe/rpds_py-0.26.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bc596b30f86dc6f0929499c9e574601679d0341a0108c25b9b358a042f51bca", size = 516094, upload-time = "2025-07-01T15:55:43.603Z" }, + { url = "https://files.pythonhosted.org/packages/75/d6/f61cafbed8ba1499b9af9f1777a2a199cd888f74a96133d8833ce5eaa9c5/rpds_py-0.26.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9dfbe56b299cf5875b68eb6f0ebaadc9cac520a1989cac0db0765abfb3709c19", size = 402889, upload-time = "2025-07-01T15:55:45.275Z" }, + { url = "https://files.pythonhosted.org/packages/92/19/c8ac0a8a8df2dd30cdec27f69298a5c13e9029500d6d76718130f5e5be10/rpds_py-0.26.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac64f4b2bdb4ea622175c9ab7cf09444e412e22c0e02e906978b3b488af5fde8", size = 384301, upload-time = "2025-07-01T15:55:47.098Z" }, + { url = "https://files.pythonhosted.org/packages/41/e1/6b1859898bc292a9ce5776016c7312b672da00e25cec74d7beced1027286/rpds_py-0.26.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:181ef9b6bbf9845a264f9aa45c31836e9f3c1f13be565d0d010e964c661d1e2b", size = 412891, upload-time = "2025-07-01T15:55:48.412Z" }, + { url = "https://files.pythonhosted.org/packages/ef/b9/ceb39af29913c07966a61367b3c08b4f71fad841e32c6b59a129d5974698/rpds_py-0.26.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:49028aa684c144ea502a8e847d23aed5e4c2ef7cadfa7d5eaafcb40864844b7a", size = 557044, upload-time = 
"2025-07-01T15:55:49.816Z" }, + { url = "https://files.pythonhosted.org/packages/2f/27/35637b98380731a521f8ec4f3fd94e477964f04f6b2f8f7af8a2d889a4af/rpds_py-0.26.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:e5d524d68a474a9688336045bbf76cb0def88549c1b2ad9dbfec1fb7cfbe9170", size = 585774, upload-time = "2025-07-01T15:55:51.192Z" }, + { url = "https://files.pythonhosted.org/packages/52/d9/3f0f105420fecd18551b678c9a6ce60bd23986098b252a56d35781b3e7e9/rpds_py-0.26.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c1851f429b822831bd2edcbe0cfd12ee9ea77868f8d3daf267b189371671c80e", size = 554886, upload-time = "2025-07-01T15:55:52.541Z" }, + { url = "https://files.pythonhosted.org/packages/6b/c5/347c056a90dc8dd9bc240a08c527315008e1b5042e7a4cf4ac027be9d38a/rpds_py-0.26.0-cp314-cp314t-win32.whl", hash = "sha256:7bdb17009696214c3b66bb3590c6d62e14ac5935e53e929bcdbc5a495987a84f", size = 219027, upload-time = "2025-07-01T15:55:53.874Z" }, + { url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload-time = "2025-07-01T15:55:55.167Z" }, + { url = "https://files.pythonhosted.org/packages/ef/9a/1f033b0b31253d03d785b0cd905bc127e555ab496ea6b4c7c2e1f951f2fd/rpds_py-0.26.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3c0909c5234543ada2515c05dc08595b08d621ba919629e94427e8e03539c958", size = 373226, upload-time = "2025-07-01T15:56:16.578Z" }, + { url = "https://files.pythonhosted.org/packages/58/29/5f88023fd6aaaa8ca3c4a6357ebb23f6f07da6079093ccf27c99efce87db/rpds_py-0.26.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c1fb0cda2abcc0ac62f64e2ea4b4e64c57dfd6b885e693095460c61bde7bb18e", size = 359230, upload-time = "2025-07-01T15:56:17.978Z" }, + { url = "https://files.pythonhosted.org/packages/6c/6c/13eaebd28b439da6964dde22712b52e53fe2824af0223b8e403249d10405/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d142d2d6cf9b31c12aa4878d82ed3b2324226270b89b676ac62ccd7df52d08", size = 382363, upload-time = "2025-07-01T15:56:19.977Z" }, + { url = "https://files.pythonhosted.org/packages/55/fc/3bb9c486b06da19448646f96147796de23c5811ef77cbfc26f17307b6a9d/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a547e21c5610b7e9093d870be50682a6a6cf180d6da0f42c47c306073bfdbbf6", size = 397146, upload-time = "2025-07-01T15:56:21.39Z" }, + { url = "https://files.pythonhosted.org/packages/15/18/9d1b79eb4d18e64ba8bba9e7dec6f9d6920b639f22f07ee9368ca35d4673/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35e9a70a0f335371275cdcd08bc5b8051ac494dd58bff3bbfb421038220dc871", size = 514804, upload-time = "2025-07-01T15:56:22.78Z" }, + { url = "https://files.pythonhosted.org/packages/4f/5a/175ad7191bdbcd28785204621b225ad70e85cdfd1e09cc414cb554633b21/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0dfa6115c6def37905344d56fb54c03afc49104e2ca473d5dedec0f6606913b4", size = 402820, upload-time = "2025-07-01T15:56:24.584Z" }, + { url = "https://files.pythonhosted.org/packages/11/45/6a67ecf6d61c4d4aff4bc056e864eec4b2447787e11d1c2c9a0242c6e92a/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:313cfcd6af1a55a286a3c9a25f64af6d0e46cf60bc5798f1db152d97a216ff6f", size = 
384567, upload-time = "2025-07-01T15:56:26.064Z" }, + { url = "https://files.pythonhosted.org/packages/a1/ba/16589da828732b46454c61858950a78fe4c931ea4bf95f17432ffe64b241/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f7bf2496fa563c046d05e4d232d7b7fd61346e2402052064b773e5c378bf6f73", size = 416520, upload-time = "2025-07-01T15:56:27.608Z" }, + { url = "https://files.pythonhosted.org/packages/81/4b/00092999fc7c0c266045e984d56b7314734cc400a6c6dc4d61a35f135a9d/rpds_py-0.26.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:aa81873e2c8c5aa616ab8e017a481a96742fdf9313c40f14338ca7dbf50cb55f", size = 559362, upload-time = "2025-07-01T15:56:29.078Z" }, + { url = "https://files.pythonhosted.org/packages/96/0c/43737053cde1f93ac4945157f7be1428724ab943e2132a0d235a7e161d4e/rpds_py-0.26.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:68ffcf982715f5b5b7686bdd349ff75d422e8f22551000c24b30eaa1b7f7ae84", size = 588113, upload-time = "2025-07-01T15:56:30.485Z" }, + { url = "https://files.pythonhosted.org/packages/46/46/8e38f6161466e60a997ed7e9951ae5de131dedc3cf778ad35994b4af823d/rpds_py-0.26.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6188de70e190847bb6db3dc3981cbadff87d27d6fe9b4f0e18726d55795cee9b", size = 555429, upload-time = "2025-07-01T15:56:31.956Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ac/65da605e9f1dd643ebe615d5bbd11b6efa1d69644fc4bf623ea5ae385a82/rpds_py-0.26.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1c962145c7473723df9722ba4c058de12eb5ebedcb4e27e7d902920aa3831ee8", size = 231950, upload-time = "2025-07-01T15:56:33.337Z" }, + { url = "https://files.pythonhosted.org/packages/51/f2/b5c85b758a00c513bb0389f8fc8e61eb5423050c91c958cdd21843faa3e6/rpds_py-0.26.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f61a9326f80ca59214d1cceb0a09bb2ece5b2563d4e0cd37bfd5515c28510674", size = 373505, upload-time = "2025-07-01T15:56:34.716Z" }, + { url = "https://files.pythonhosted.org/packages/23/e0/25db45e391251118e915e541995bb5f5ac5691a3b98fb233020ba53afc9b/rpds_py-0.26.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:183f857a53bcf4b1b42ef0f57ca553ab56bdd170e49d8091e96c51c3d69ca696", size = 359468, upload-time = "2025-07-01T15:56:36.219Z" }, + { url = "https://files.pythonhosted.org/packages/0b/73/dd5ee6075bb6491be3a646b301dfd814f9486d924137a5098e61f0487e16/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:941c1cfdf4799d623cf3aa1d326a6b4fdb7a5799ee2687f3516738216d2262fb", size = 382680, upload-time = "2025-07-01T15:56:37.644Z" }, + { url = "https://files.pythonhosted.org/packages/2f/10/84b522ff58763a5c443f5bcedc1820240e454ce4e620e88520f04589e2ea/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72a8d9564a717ee291f554eeb4bfeafe2309d5ec0aa6c475170bdab0f9ee8e88", size = 397035, upload-time = "2025-07-01T15:56:39.241Z" }, + { url = "https://files.pythonhosted.org/packages/06/ea/8667604229a10a520fcbf78b30ccc278977dcc0627beb7ea2c96b3becef0/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:511d15193cbe013619dd05414c35a7dedf2088fcee93c6bbb7c77859765bd4e8", size = 514922, upload-time = "2025-07-01T15:56:40.645Z" }, + { url = "https://files.pythonhosted.org/packages/24/e6/9ed5b625c0661c4882fc8cdf302bf8e96c73c40de99c31e0b95ed37d508c/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:aea1f9741b603a8d8fedb0ed5502c2bc0accbc51f43e2ad1337fe7259c2b77a5", size = 402822, upload-time = "2025-07-01T15:56:42.137Z" }, + { url = "https://files.pythonhosted.org/packages/8a/58/212c7b6fd51946047fb45d3733da27e2fa8f7384a13457c874186af691b1/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4019a9d473c708cf2f16415688ef0b4639e07abaa569d72f74745bbeffafa2c7", size = 384336, upload-time = "2025-07-01T15:56:44.239Z" }, + { url = "https://files.pythonhosted.org/packages/aa/f5/a40ba78748ae8ebf4934d4b88e77b98497378bc2c24ba55ebe87a4e87057/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:093d63b4b0f52d98ebae33b8c50900d3d67e0666094b1be7a12fffd7f65de74b", size = 416871, upload-time = "2025-07-01T15:56:46.284Z" }, + { url = "https://files.pythonhosted.org/packages/d5/a6/33b1fc0c9f7dcfcfc4a4353daa6308b3ece22496ceece348b3e7a7559a09/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:2abe21d8ba64cded53a2a677e149ceb76dcf44284202d737178afe7ba540c1eb", size = 559439, upload-time = "2025-07-01T15:56:48.549Z" }, + { url = "https://files.pythonhosted.org/packages/71/2d/ceb3f9c12f8cfa56d34995097f6cd99da1325642c60d1b6680dd9df03ed8/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:4feb7511c29f8442cbbc28149a92093d32e815a28aa2c50d333826ad2a20fdf0", size = 588380, upload-time = "2025-07-01T15:56:50.086Z" }, + { url = "https://files.pythonhosted.org/packages/c8/ed/9de62c2150ca8e2e5858acf3f4f4d0d180a38feef9fdab4078bea63d8dba/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e99685fc95d386da368013e7fb4269dd39c30d99f812a8372d62f244f662709c", size = 555334, upload-time = "2025-07-01T15:56:51.703Z" }, +] + [[package]] name = "ruff" version = "0.11.12" From d08de3b16cf9e8a873286201f0f2038585a436f6 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 2 Aug 2025 01:13:29 +0000 Subject: [PATCH 161/224] feat: refined universal converter between arrow and python --- .../semantic_types/universal_converter.py | 737 ++++++++++++++++++ 1 file changed, 737 insertions(+) create mode 100644 src/orcapod/semantic_types/universal_converter.py diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py new file mode 100644 index 0000000..872d267 --- /dev/null +++ b/src/orcapod/semantic_types/universal_converter.py @@ -0,0 +1,737 @@ +""" +Universal Type Conversion Engine for Python ↔ Arrow type bidirectional conversion. + +This provides a comprehensive, self-contained system that: +1. Converts Python type hints to Arrow types +2. Converts Arrow types back to Python type hints +3. Creates and caches conversion functions for optimal performance +4. Manages dynamic TypedDict creation for struct preservation +5. Integrates seamlessly with semantic type registries +""" + +from typing import TypedDict, Dict, Type, Any, Callable, Tuple, Optional, get_type_hints +import pyarrow as pa +from functools import lru_cache +import hashlib +from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry + +# Handle generic types +from typing import get_origin, get_args +import typing + + +class UniversalTypeConverter: + """ + Universal engine for Python ↔ Arrow type conversion with cached conversion functions. 
+ + This is a complete, self-contained system that handles: + - Python type hint → Arrow type conversion + - Arrow type → Python type hint conversion + - Dynamic TypedDict creation for struct field preservation + - Cached conversion function generation + - Integration with semantic type registries + """ + + def __init__(self, semantic_registry: SemanticTypeRegistry | None = None): + self.semantic_registry = semantic_registry + + # Cache for created TypedDict classes + self._struct_signature_to_typeddict: dict[pa.StructType, type] = {} + self._typeddict_to_struct_signature: dict[type, pa.StructType] = {} + self._created_type_names: set[str] = set() + + # Cache for conversion functions + self._python_to_arrow_converters: dict[type, Callable] = {} + self._arrow_to_python_converters: dict[pa.DataType, Callable] = {} + + # Cache for type mappings + self._python_to_arrow_types: dict[type, pa.DataType] = {} + self._arrow_to_python_types: dict[pa.DataType, type] = {} + + def python_type_to_arrow_type(self, python_type: type) -> pa.DataType: + """ + Convert Python type hint to Arrow type with caching. + + This is the main entry point for Python → Arrow type conversion. + Results are cached for performance. + """ + # Check cache first + if python_type in self._python_to_arrow_types: + return self._python_to_arrow_types[python_type] + + # Convert and cache result + arrow_type = self._convert_python_to_arrow(python_type) + self._python_to_arrow_types[python_type] = arrow_type + + return arrow_type + + def arrow_type_to_python_type(self, arrow_type: pa.DataType) -> Type: + """ + Convert Arrow type to Python type hint with caching. + + This is the main entry point for Arrow → Python type conversion. + Results are cached for performance. + """ + # Check cache first + if arrow_type in self._arrow_to_python_types: + return self._arrow_to_python_types[arrow_type] + + # Convert and cache result + python_type = self._convert_arrow_to_python(arrow_type) + self._arrow_to_python_types[arrow_type] = python_type + + return python_type + + def get_python_to_arrow_converter(self, python_type: Type) -> Callable[[Any], Any]: + """ + Get cached conversion function for Python value → Arrow value. + + This creates and caches conversion functions for optimal performance + during data conversion operations. + """ + if python_type in self._python_to_arrow_converters: + return self._python_to_arrow_converters[python_type] + + # Create conversion function + converter = self._create_python_to_arrow_converter(python_type) + self._python_to_arrow_converters[python_type] = converter + + return converter + + def get_arrow_to_python_converter( + self, arrow_type: pa.DataType + ) -> Callable[[Any], Any]: + """ + Get cached conversion function for Arrow value → Python value. + + This creates and caches conversion functions for optimal performance + during data conversion operations. 
+ """ + if arrow_type in self._arrow_to_python_converters: + return self._arrow_to_python_converters[arrow_type] + + # Create conversion function + converter = self._create_arrow_to_python_converter(arrow_type) + self._arrow_to_python_converters[arrow_type] = converter + + return converter + + def _convert_python_to_arrow(self, python_type: Type) -> pa.DataType: + """Core Python → Arrow type conversion logic.""" + + # Handle basic types + basic_type_map = { + int: pa.int64(), + float: pa.float64(), + str: pa.large_string(), + bool: pa.bool_(), + bytes: pa.large_binary(), + } + + if python_type in basic_type_map: + return basic_type_map[python_type] + + # Check semantic registry for registered types + if self.semantic_registry: + converter = self.semantic_registry.get_converter_for_python_type( + python_type + ) + if converter: + return converter.arrow_struct_type + + # Handle typeddict look up + if python_type in self._typeddict_to_struct_signature: + return self._typeddict_to_struct_signature[python_type] + + # Check generic types + origin = get_origin(python_type) + args = get_args(python_type) + + if origin is None: + # Handle string type names + if hasattr(python_type, "__name__"): + type_name = python_type.__name__ + if type_name in basic_type_map: + return basic_type_map[type_name] + raise ValueError(f"Unsupported Python type: {python_type}") + + # Handle list types + if origin is list: + if len(args) != 1: + raise ValueError( + f"list type must have exactly one type argument, got: {args}" + ) + element_type = self.python_type_to_arrow_type(args[0]) + return pa.large_list(element_type) + + # Handle tuple types + elif origin is tuple: + if len(args) == 0: + raise ValueError("Empty tuple type not supported") + + if len(set(args)) == 1: + # Homogeneous tuple → fixed-size list + element_type = self.python_type_to_arrow_type(args[0]) + return pa.list_(element_type, len(args)) + else: + # Heterogeneous tuple → struct with indexed fields + fields = [] + for i, arg_type in enumerate(args): + field_type = self.python_type_to_arrow_type(arg_type) + fields.append((f"f{i}", field_type)) + return pa.struct(fields) + + # Handle dict types + elif origin is dict: + if len(args) != 2: + raise ValueError( + f"dict type must have exactly two type arguments, got: {args}" + ) + key_type = self.python_type_to_arrow_type(args[0]) + value_type = self.python_type_to_arrow_type(args[1]) + key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) + return pa.large_list(key_value_struct) + + # Handle Union/Optional types + elif origin is typing.Union: + if len(args) == 2 and type(None) in args: + # Optional[T] → just T (nullability handled at field level) + non_none_type = args[0] if args[1] is type(None) else args[1] + return self.python_type_to_arrow_type(non_none_type) + else: + # Complex unions → use first type as fallback + return self.python_type_to_arrow_type(args[0]) + + # Handle set types → lists + elif origin is set: + if len(args) != 1: + raise ValueError( + f"set type must have exactly one type argument, got: {args}" + ) + element_type = self.python_type_to_arrow_type(args[0]) + return pa.large_list(element_type) + + else: + raise ValueError(f"Unsupported generic type: {origin}") + + def _convert_arrow_to_python(self, arrow_type: pa.DataType) -> type | Any: + """Core Arrow → Python type conversion logic.""" + + # Handle basic types + if pa.types.is_integer(arrow_type): + return int + elif pa.types.is_floating(arrow_type): + return float + elif pa.types.is_string(arrow_type) or 
pa.types.is_large_string(arrow_type): + return str + elif pa.types.is_boolean(arrow_type): + return bool + elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): + return bytes + + # Handle struct types + elif pa.types.is_struct(arrow_type): + # Check if it's a registered semantic type first + if self.semantic_registry: + python_type = ( + self.semantic_registry.get_python_type_for_struct_signature( + arrow_type + ) + ) + if python_type: + return python_type + + # Create dynamic TypedDict for unregistered struct + # TODO: add check for heterogeneous tuple checking each field starts with f + return self._get_or_create_typeddict_for_struct(arrow_type) + + # Handle list types + elif ( + pa.types.is_list(arrow_type) + or pa.types.is_large_list(arrow_type) + or pa.types.is_fixed_size_list(arrow_type) + ): + element_type = arrow_type.value_type + + # Check if this is a dict representation: list> + if pa.types.is_struct(element_type): + field_names = [field.name for field in element_type] + if set(field_names) == {"key", "value"}: + # This is a dict + key_field = next(f for f in element_type if f.name == "key") + value_field = next(f for f in element_type if f.name == "value") + + key_python_type = self.arrow_type_to_python_type(key_field.type) + value_python_type = self.arrow_type_to_python_type(value_field.type) + + return dict[key_python_type, value_python_type] + + # Regular list + element_python_type = self.arrow_type_to_python_type(element_type) + + if pa.types.is_fixed_size_list(arrow_type): + # Fixed-size list → homogeneous tuple + size = arrow_type.list_size + return tuple[tuple(element_python_type for _ in range(size))] + else: + # Variable-size list → list + return list[element_python_type] + + # Handle map types + elif pa.types.is_map(arrow_type): + key_python_type = self.arrow_type_to_python_type(arrow_type.key_type) + value_python_type = self.arrow_type_to_python_type(arrow_type.item_type) + return dict[key_python_type, value_python_type] + + # Handle union types + elif pa.types.is_union(arrow_type): + import typing + + child_types = [] + for i in range(arrow_type.num_fields): + child_field = arrow_type[i] + child_types.append(self.arrow_type_to_python_type(child_field.type)) + + if len(child_types) == 2 and type(None) in child_types: + # Optional[T] + non_none_type = next(t for t in child_types if t is not type(None)) + return typing.Optional[non_none_type] + else: + return typing.Union[tuple(child_types)] + + else: + # Default case for unsupported types + return Any + + def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> Type: + """Get or create a TypedDict class for an Arrow struct type.""" + + # Check cache first + if struct_type in self._struct_signature_to_typeddict: + return self._struct_signature_to_typeddict[struct_type] + + # Create field specifications for TypedDict + field_specs: dict[str, type] = {} + for field in struct_type: + field_name = field.name + python_type = self.arrow_type_to_python_type(field.type) + field_specs[field_name] = python_type + + # Generate unique name + type_name = self._generate_unique_type_name(field_specs) + + # Create TypedDict dynamically + typeddict_class = TypedDict(type_name, field_specs) + + # Cache the mapping + self._struct_signature_to_typeddict[struct_type] = typeddict_class + self._typeddict_to_struct_signature[typeddict_class] = struct_type + + return typeddict_class + + def _generate_unique_type_name(self, field_specs: Dict[str, Type]) -> str: + """Generate a unique name for 
TypedDict based on field specifications.""" + + # Create deterministic signature that includes both names and types + field_items = sorted(field_specs.items()) + signature_parts = [] + + for field_name, field_type in field_items: + type_name = getattr(field_type, "__name__", str(field_type)) + if type_name.startswith("typing."): + type_name = type_name[7:] + signature_parts.append(f"{field_name}_{type_name}") + + # Create base name from signature + if len(signature_parts) <= 2: + base_name = "Struct_" + "_".join(signature_parts) + else: + # Use hash-based approach for larger structs + signature_str = "_".join(signature_parts) + signature_hash = hashlib.md5(signature_str.encode()).hexdigest()[:8] + field_names = [item[0] for item in field_items] + + if len(field_names) <= 3: + base_name = f"Struct_{'_'.join(field_names)}_{signature_hash}" + else: + base_name = f"Struct_{len(field_names)}fields_{signature_hash}" + + # Clean up the name + base_name = ( + base_name.replace("[", "_") + .replace("]", "_") + .replace(",", "_") + .replace(" ", "") + ) + + self._created_type_names.add(base_name) + return base_name + + def _create_python_to_arrow_converter( + self, python_type: type + ) -> Callable[[Any], Any]: + """Create a cached conversion function for Python → Arrow values.""" + + # Get the Arrow type for this Python type + arrow_type = self.python_type_to_arrow_type(python_type) + + # Check for semantic type first + if self.semantic_registry: + converter = self.semantic_registry.get_converter_for_python_type( + python_type + ) + if converter: + return converter.python_to_struct_dict + + # Create conversion function based on type + + origin = get_origin(python_type) + args = get_args(python_type) + + if python_type in {int, float, str, bool, bytes} or origin is None: + # Basic types - no conversion needed + return lambda value: value + + elif origin is list: + element_converter = self.get_python_to_arrow_converter(args[0]) + return ( + lambda value: [element_converter(item) for item in value] + if value + else [] + ) + + elif origin is dict: + key_converter = self.get_python_to_arrow_converter(args[0]) + value_converter = self.get_python_to_arrow_converter(args[1]) + return ( + lambda value: [ + {"key": key_converter(k), "value": value_converter(v)} + for k, v in value.items() + ] + if value + else [] + ) + + elif origin is tuple: + if len(set(args)) == 1: + # Homogeneous tuple + element_converter = self.get_python_to_arrow_converter(args[0]) + return lambda value: [element_converter(item) for item in value] + else: + # Heterogeneous tuple + converters = [self.get_python_to_arrow_converter(arg) for arg in args] + return lambda value: { + f"f{i}": converters[i](item) for i, item in enumerate(value) + } + + else: + # Default passthrough + return lambda value: value + + def _create_arrow_to_python_converter( + self, arrow_type: pa.DataType + ) -> Callable[[Any], Any]: + """Create a cached conversion function for Arrow → Python values.""" + + # Get the Python type for this Arrow type + python_type = self.arrow_type_to_python_type(arrow_type) + + # Check for semantic type first + if self.semantic_registry and pa.types.is_struct(arrow_type): + registered_python_type = ( + self.semantic_registry.get_python_type_for_struct_signature(arrow_type) + ) + if registered_python_type: + converter = self.semantic_registry.get_converter_for_python_type( + registered_python_type + ) + if converter: + return converter.struct_dict_to_python + + # Handle basic types - no conversion needed + if ( + 
pa.types.is_integer(arrow_type) + or pa.types.is_floating(arrow_type) + or pa.types.is_boolean(arrow_type) + or pa.types.is_string(arrow_type) + or pa.types.is_large_string(arrow_type) + or pa.types.is_binary(arrow_type) + or pa.types.is_large_binary(arrow_type) + ): + return lambda value: value + + # Handle list types + elif ( + pa.types.is_list(arrow_type) + or pa.types.is_large_list(arrow_type) + or pa.types.is_fixed_size_list(arrow_type) + ): + element_type = arrow_type.value_type + + # Check if this is a dict representation + if pa.types.is_struct(element_type): + field_names = [field.name for field in element_type] + if set(field_names) == {"key", "value"}: + # Dict representation + key_field = next(f for f in element_type if f.name == "key") + value_field = next(f for f in element_type if f.name == "value") + + key_converter = self.get_arrow_to_python_converter(key_field.type) + value_converter = self.get_arrow_to_python_converter( + value_field.type + ) + + return ( + lambda value: { + key_converter(item["key"]): value_converter(item["value"]) + for item in value + if item is not None + } + if value + else {} + ) + + # Regular list + element_converter = self.get_arrow_to_python_converter(element_type) + + if pa.types.is_fixed_size_list(arrow_type): + # Fixed-size list → tuple + return ( + lambda value: tuple(element_converter(item) for item in value) + if value + else () + ) + else: + # Variable-size list → list + return ( + lambda value: [element_converter(item) for item in value] + if value + else [] + ) + + # Handle struct types (TypedDict) + elif pa.types.is_struct(arrow_type): + # Create converters for each field + field_converters = {} + for field in arrow_type: + field_converters[field.name] = self.get_arrow_to_python_converter( + field.type + ) + + return ( + lambda value: { + field_name: field_converters[field_name](value.get(field_name)) + for field_name in field_converters + } + if value + else {} + ) + + else: + # Default passthrough + return lambda value: value + + def is_dynamic_typeddict(self, python_type: type) -> bool: + """Check if a type is one of our dynamically created TypedDicts.""" + return python_type in self._typeddict_to_struct_signature + + def get_struct_signature_for_typeddict( + self, python_type: type + ) -> pa.StructType | None: + """Get the Arrow struct signature for a dynamically created TypedDict.""" + return self._typeddict_to_struct_signature.get(python_type) + + def clear_cache(self) -> None: + """Clear all caches (useful for testing or memory management).""" + self._struct_signature_to_typeddict.clear() + self._typeddict_to_struct_signature.clear() + self._created_type_names.clear() + self._python_to_arrow_converters.clear() + self._arrow_to_python_converters.clear() + self._python_to_arrow_types.clear() + self._arrow_to_python_types.clear() + + def get_cache_stats(self) -> dict[str, int]: + """Get statistics about cache usage (useful for debugging/optimization).""" + return { + "typeddict_count": len(self._struct_signature_to_typeddict), + "python_to_arrow_converters": len(self._python_to_arrow_converters), + "arrow_to_python_converters": len(self._arrow_to_python_converters), + "type_mappings": len(self._python_to_arrow_types) + + len(self._arrow_to_python_types), + } + + +# Convenience functions that use a global instance +_global_converter: UniversalTypeConverter | None = None + + +def prepare_arrow_table_to_python_dicts_converter( + schema: pa.Schema, semantic_registry: SemanticTypeRegistry | None = None +) -> Callable[[pa.Table], 
list[dict]]: + """ + Prepare a converter function that converts an Arrow Table to a list of Python dicts. + + This uses the global UniversalTypeConverter instance to handle type conversions. + """ + + # TODO: + converter = get_global_converter(semantic_registry) + + # construct the converter lookup table to be used as closure + converter_lut: dict[str, Callable[[Any], Any]] = {} + for field in schema: + python_type = converter.arrow_type_to_python_type(field.type) + python_to_arrow = converter.get_python_to_arrow_converter(python_type) + converter_lut[field.name] = python_to_arrow + + def schema_specific_converter(table: pa.Table) -> list[dict]: + result = [] + for row in table.to_pylist(): + converted_row = {k: converter_lut[k](v) for k, v in row.items()} + result.append(converted_row) + return result + + return schema_specific_converter + + +def get_global_converter( + semantic_registry: SemanticTypeRegistry | None = None, +) -> UniversalTypeConverter: + """Get or create the global type converter instance.""" + global _global_converter + if ( + _global_converter is None + or _global_converter.semantic_registry != semantic_registry + ): + _global_converter = UniversalTypeConverter(semantic_registry) + return _global_converter + + +# Public API functions +def python_type_to_arrow_type( + python_type: type, semantic_registry: SemanticTypeRegistry | None = None +) -> pa.DataType: + """Convert Python type to Arrow type using the global converter.""" + converter = get_global_converter(semantic_registry) + return converter.python_type_to_arrow_type(python_type) + + +def arrow_type_to_python_type( + arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None = None +) -> type: + """Convert Arrow type to Python type using the global converter.""" + converter = get_global_converter(semantic_registry) + return converter.arrow_type_to_python_type(arrow_type) + + +def get_conversion_functions( + python_type: type, semantic_registry: SemanticTypeRegistry | None = None +) -> tuple[Callable, Callable]: + """Get both conversion functions for a Python type.""" + converter = get_global_converter(semantic_registry) + arrow_type = converter.python_type_to_arrow_type(python_type) + + python_to_arrow = converter.get_python_to_arrow_converter(python_type) + arrow_to_python = converter.get_arrow_to_python_converter(arrow_type) + + return python_to_arrow, arrow_to_python + + +# Example usage and demonstration +if __name__ == "__main__": + print("=== Universal Type Conversion Engine ===\n") + + from pathlib import Path + import uuid + from sample_converters import create_standard_semantic_registry + + # Create converter with semantic registry + registry = create_standard_semantic_registry() + converter = UniversalTypeConverter(registry) + + print("Testing comprehensive type conversion:") + print("=" * 50) + + # Test various type conversions + test_types = [ + int, + str, + list[int], + dict[str, float], + tuple[int, str, bool], + Path, # Semantic type + uuid.UUID, # Semantic type + ] + + print("\nType Conversions:") + for python_type in test_types: + arrow_type = converter.python_type_to_arrow_type(python_type) + recovered_type = converter.arrow_type_to_python_type(arrow_type) + + print(f" {python_type} → {arrow_type} → {recovered_type}") + print(f" Round-trip successful: {recovered_type == python_type}") + + print(f"\n" + "=" * 50) + print("Testing conversion function caching:") + + # Test conversion functions + test_data = { + "id": 123, + "name": "Alice", + "tags": ["python", "arrow"], + "metadata": 
{"active": True, "score": 95.5}, + "file_path": Path("/home/alice/data.csv"), + "user_id": uuid.uuid4(), + } + + schema = { + "id": int, + "name": str, + "tags": list[str], + "metadata": dict[str, Any], + "file_path": Path, + "user_id": uuid.UUID, + } + + # Get conversion functions (these get cached) + converters = {} + for field_name, python_type in schema.items(): + python_to_arrow = converter.get_python_to_arrow_converter(python_type) + arrow_type = converter.python_type_to_arrow_type(python_type) + arrow_to_python = converter.get_arrow_to_python_converter(arrow_type) + converters[field_name] = (python_to_arrow, arrow_to_python) + + print("Conversion functions created and cached for all fields") + + # Test round-trip conversion using cached functions + converted_data = {} + for field_name, value in test_data.items(): + python_to_arrow, arrow_to_python = converters[field_name] + + # Convert to Arrow format + arrow_value = python_to_arrow(value) + # Convert back to Python + recovered_value = arrow_to_python(arrow_value) + + converted_data[field_name] = recovered_value + + print( + f" {field_name}: {type(value).__name__} → Arrow → {type(recovered_value).__name__}" + ) + + print(f"\n" + "=" * 50) + print("Cache Statistics:") + stats = converter.get_cache_stats() + for stat_name, count in stats.items(): + print(f" {stat_name}: {count}") + + print(f"\n" + "=" * 50) + print("✅ Universal Type Conversion Engine Benefits:") + print("✅ Single self-contained system for all conversions") + print("✅ Holds semantic registry internally") + print("✅ Caches all conversion functions for performance") + print("✅ Handles both Python→Arrow and Arrow→Python") + print("✅ Creates TypedDicts preserving struct field info") + print("✅ Dramatic reduction in function creation overhead") + print("✅ Central caching reduces memory usage") From 8b44eb2ab239c1d839a3d478a99aca2fee0a39f3 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 2 Aug 2025 01:33:11 +0000 Subject: [PATCH 162/224] feat: add convenience method for converting between pydict and pylist --- src/orcapod/utils/arrow_utils.py | 72 ++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 1e7865a..74e5c71 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -6,6 +6,78 @@ from typing import Any +def pylist_to_pydict(pylist: list[dict]) -> dict: + """ + Convert a list of dictionaries to a dictionary of lists (columnar format). + + This function transforms row-based data (list of dicts) to column-based data + (dict of lists), similar to converting from records format to columnar format. + Missing keys in individual dictionaries are filled with None values. + + Args: + pylist: List of dictionaries representing rows of data + + Returns: + Dictionary where keys are column names and values are lists of column data + + Example: + >>> data = [{'a': 1, 'b': 2}, {'a': 3, 'c': 4}] + >>> pylist_to_pydict(data) + {'a': [1, 3], 'b': [2, None], 'c': [None, 4]} + """ + result = {} + known_keys = set() + for i, d in enumerate(pylist): + known_keys.update(d.keys()) + for k in known_keys: + result.setdefault(k, [None] * i).append(d.get(k, None)) + return result + + +def pydict_to_pylist(pydict: dict) -> list[dict]: + """ + Convert a dictionary of lists (columnar format) to a list of dictionaries. 
+ + This function transforms column-based data (dict of lists) to row-based data + (list of dicts), similar to converting from columnar format to records format. + All arrays in the input dictionary must have the same length. + + Args: + pydict: Dictionary where keys are column names and values are lists of column data + + Returns: + List of dictionaries representing rows of data + + Raises: + ValueError: If arrays in the dictionary have inconsistent lengths + + Example: + >>> data = {'a': [1, 3], 'b': [2, None], 'c': [None, 4]} + >>> pydict_to_pylist(data) + [{'a': 1, 'b': 2, 'c': None}, {'a': 3, 'b': None, 'c': 4}] + """ + if not pydict: + return [] + + # Check all arrays have same length + lengths = [len(v) for v in pydict.values()] + if not all(length == lengths[0] for length in lengths): + raise ValueError( + f"Inconsistent array lengths: {dict(zip(pydict.keys(), lengths))}" + ) + + num_rows = lengths[0] + if num_rows == 0: + return [] + + result = [] + keys = pydict.keys() + for i in range(num_rows): + row = {k: pydict[k][i] for k in keys} + result.append(row) + return result + + def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, no field names should collide.""" From 9854d3f28fa0cce2bdfa2b5194cc621b12445c0b Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 2 Aug 2025 01:39:15 +0000 Subject: [PATCH 163/224] feat: start implementing submodule system --- src/orcapod/contexts/__init__.py | 208 +++++++++++ src/orcapod/contexts/core.py | 48 +++ .../contexts/data/schemas/context_schema.json | 205 +++++++++++ src/orcapod/contexts/data/v0.1.json | 60 ++++ src/orcapod/contexts/registry.py | 334 ++++++++++++++++++ src/orcapod/data/streams.py | 23 +- src/orcapod/protocols/data_protocols.py | 29 ++ 7 files changed, 906 insertions(+), 1 deletion(-) create mode 100644 src/orcapod/contexts/__init__.py create mode 100644 src/orcapod/contexts/core.py create mode 100644 src/orcapod/contexts/data/schemas/context_schema.json create mode 100644 src/orcapod/contexts/data/v0.1.json create mode 100644 src/orcapod/contexts/registry.py diff --git a/src/orcapod/contexts/__init__.py b/src/orcapod/contexts/__init__.py new file mode 100644 index 0000000..649c480 --- /dev/null +++ b/src/orcapod/contexts/__init__.py @@ -0,0 +1,208 @@ +""" +OrcaPod Data Context System + +This package manages versioned data contexts that define how data +should be interpreted and processed throughout the OrcaPod system. 
+ +A DataContext contains: +- Semantic type registry for handling structured data types +- Arrow hasher for hashing Arrow tables +- Object hasher for general object hashing +- Versioning information for reproducibility + +Example usage: + # Get default context + context = resolve_context() + + # Get specific version + context = resolve_context("v0.1") + + # Use context components + registry = context.semantic_type_registry + hasher = context.arrow_hasher + + # List available contexts + versions = get_available_contexts() +""" + +from .core import DataContext, ContextValidationError, ContextResolutionError +from .registry import JSONDataContextRegistry +from typing import Any + +# Global registry instance (lazily initialized) +_registry: JSONDataContextRegistry | None = None + + +def _get_registry() -> JSONDataContextRegistry: + """Get the global context registry, initializing if needed.""" + global _registry + if _registry is None: + _registry = JSONDataContextRegistry() + return _registry + + +def resolve_context(context_info: str | DataContext | None = None) -> DataContext: + """ + Resolve context information to a DataContext instance. + + Args: + context_info: One of: + - None: Use default context + - str: Version string ("v0.1") or full key ("std:v0.1:default") + - DataContext: Return as-is + + Returns: + DataContext instance + + Raises: + ContextResolutionError: If context cannot be resolved + + Examples: + >>> context = resolve_context() # Default + >>> context = resolve_context("v0.1") # Specific version + >>> context = resolve_context("std:v0.1:default") # Full key + >>> context = resolve_context("latest") # Latest version + """ + # If already a DataContext, return as-is + if isinstance(context_info, DataContext): + return context_info + + # Use registry to resolve string/None to DataContext + registry = _get_registry() + return registry.get_context(context_info) + + +def get_available_contexts() -> list[str]: + """ + Get list of all available context versions. + + Returns: + Sorted list of version strings + + Example: + >>> get_available_contexts() + ['v0.1', 'v0.1-fast', 'v0.2'] + """ + registry = _get_registry() + return registry.get_available_versions() + + +def get_context_info(version: str) -> dict[str, Any]: + """ + Get metadata about a specific context version. + + Args: + version: Context version string + + Returns: + Dictionary with context metadata + + Example: + >>> info = get_context_info("v0.1") + >>> print(info['description']) + 'Initial stable release with basic Path semantic type support' + """ + registry = _get_registry() + return registry.get_context_info(version) + + +def set_default_context_version(version: str) -> None: + """ + Set the default context version globally. + + Args: + version: Version string to set as default + + Raises: + ContextResolutionError: If version doesn't exist + """ + registry = _get_registry() + registry.set_default_version(version) + + +def validate_all_contexts() -> dict[str, str | None]: + """ + Validate that all available contexts can be instantiated. + + Returns: + Dict mapping version -> error message (None if valid) + + Example: + >>> results = validate_all_contexts() + >>> for version, error in results.items(): + ... if error: + ... print(f"{version}: {error}") + ... else: + ... print(f"{version}: OK") + """ + registry = _get_registry() + return registry.validate_all_contexts() + + +def reload_contexts() -> None: + """ + Reload context specifications from disk. 
+ + Useful during development or when context files have been updated. + Clears all cached contexts and reloads from JSON files. + """ + registry = _get_registry() + registry.reload_contexts() + + +def get_default_context() -> DataContext: + """ + Get the default data context. + + Returns: + DataContext instance for the default version + """ + return resolve_context() + + +# Convenience function for creating custom registries +def create_registry( + contexts_dir: str | None = None, + schema_file: str | None = None, + default_version: str = "v0.1", +) -> JSONDataContextRegistry: + """ + Create a custom context registry. + + Useful for testing or when you need to use a different set of contexts. + + Args: + contexts_dir: Directory containing context JSON files + schema_file: JSON schema file for validation + default_version: Default version to use + + Returns: + JSONDataContextRegistry instance + + Example: + >>> # Create registry for testing + >>> test_registry = create_registry("/path/to/test/contexts") + >>> test_context = test_registry.get_context("test") + """ + return JSONDataContextRegistry(contexts_dir, schema_file, default_version) + + +# Public API +__all__ = [ + # Core types + "DataContext", + "ContextValidationError", + "ContextResolutionError", + # Main functions + "resolve_context", + "get_available_contexts", + "get_context_info", + "get_default_context", + # Management functions + "set_default_context_version", + "validate_all_contexts", + "reload_contexts", + # Advanced usage + "create_registry", + "JSONDataContextRegistry", +] diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py new file mode 100644 index 0000000..06e0899 --- /dev/null +++ b/src/orcapod/contexts/core.py @@ -0,0 +1,48 @@ +""" +Core data structures and exceptions for the OrcaPod context system. + +This module defines the basic types and exceptions used throughout +the context management system. +""" + +from dataclasses import dataclass +from orcapod.semantic_types import SemanticTypeRegistry +from orcapod.protocols import hashing_protocols as hp + + +@dataclass +class DataContext: + """ + Data context containing all versioned components needed for data interpretation. + + A DataContext represents a specific version of the OrcaPod system configuration, + including semantic type registries, hashers, and other components that affect + how data is processed and interpreted. 
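+    Instances are normally produced by the context registry from versioned JSON
+    specifications (for example, v0.1.json) rather than constructed by hand.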
+ + Attributes: + context_key: Unique identifier (e.g., "std:v0.1:default") + version: Version string (e.g., "v0.1") + description: Human-readable description of this context + semantic_type_registry: Registry of semantic type converters + arrow_hasher: Arrow table hasher for this context + object_hasher: General object hasher for this context + """ + + context_key: str + version: str + description: str + semantic_type_registry: SemanticTypeRegistry + arrow_hasher: hp.ArrowHasher + object_hasher: hp.ObjectHasher + + +class ContextValidationError(Exception): + """Raised when context validation fails.""" + + pass + + +class ContextResolutionError(Exception): + """Raised when context cannot be resolved.""" + + pass diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json new file mode 100644 index 0000000..462a2db --- /dev/null +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -0,0 +1,205 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://orcapod.dev/schemas/data-context.json", + "$comment": "Located at: src/orcapod/contexts/data/schemas/context_schema.json", + "title": "OrcaPod Data Context Specification", + "description": "Schema for OrcaPod data context JSON files", + "type": "object", + "required": [ + "context_key", + "version", + "semantic_type_registry", + "arrow_hasher", + "object_hasher" + ], + "properties": { + "context_key": { + "type": "string", + "pattern": "^[a-zA-Z0-9_-]+:v[0-9]+\\.[0-9]+:[a-zA-Z0-9_-]+$", + "description": "Unique identifier for this context (e.g., 'std:v0.1:default')", + "examples": [ + "std:v0.1:default", + "std:v0.2:fast", + "custom:v1.0:production" + ] + }, + "version": { + "type": "string", + "pattern": "^v[0-9]+\\.[0-9]+$", + "description": "Version identifier matching the filename", + "examples": [ + "v0.1", + "v1.0", + "v2.3" + ] + }, + "description": { + "type": "string", + "description": "Human-readable description of this context version", + "examples": [ + "Initial stable release with basic Path semantic type support", + "Enhanced version with timestamp support and improved hashing" + ] + }, + "semantic_type_registry": { + "$ref": "#/$defs/objectspec", + "description": "ObjectSpec for the semantic type registry" + }, + "arrow_hasher": { + "$ref": "#/$defs/objectspec", + "description": "ObjectSpec for the Arrow hasher component" + }, + "object_hasher": { + "$ref": "#/$defs/objectspec", + "description": "ObjectSpec for the object hasher component" + }, + "metadata": { + "type": "object", + "description": "Optional metadata about this context", + "properties": { + "created_date": { + "type": "string", + "format": "date", + "description": "Date this context was created" + }, + "author": { + "type": "string", + "description": "Author of this context specification" + }, + "changelog": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of changes in this version" + }, + "compatibility": { + "type": "object", + "properties": { + "backwards_compatible_with": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of versions this context can read data from" + }, + "breaking_changes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of breaking changes from previous versions" + } + } + } + } + } + }, + "additionalProperties": false, + "$defs": { + "objectspec": { + "description": "ObjectSpec pattern used throughout OrcaPod", + "oneOf": [ + { + "type": "object", + 
"required": [ + "_class" + ], + "properties": { + "_class": { + "type": "string", + "pattern": "^[a-zA-Z_][a-zA-Z0-9_.]*\\.[a-zA-Z_][a-zA-Z0-9_]*$", + "description": "Fully qualified class name", + "examples": [ + "orcapod.types.semantic_types.SemanticTypeRegistry", + "orcapod.hashing.arrow_hashers.SemanticArrowHasher" + ] + }, + "_config": { + "type": "object", + "description": "Configuration parameters for the class", + "additionalProperties": true + } + }, + "additionalProperties": false + }, + { + "type": "array", + "description": "Array of object specifications", + "items": { + "$ref": "#/$defs/objectspec" + } + }, + { + "type": [ + "string", + "number", + "boolean", + "null" + ], + "description": "Primitive values" + } + ] + } + }, + "examples": [ + { + "context_key": "std:v0.1:default", + "version": "v0.1", + "description": "Initial stable release with basic Path semantic type support", + "semantic_type_registry": { + "_class": "orcapod.types.semantic_types.SemanticTypeRegistry", + "_config": { + "converters": [ + { + "_class": "orcapod.types.semantic_types.PathStructConverter", + "_config": {} + } + ] + } + }, + "arrow_hasher": { + "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", + "_config": { + "hasher_id": "arrow_v0.1", + "hash_algorithm": "sha256", + "serialization_method": "logical", + "semantic_type_hashers": { + "path": { + "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", + "_config": { + "file_hasher": { + "_class": "orcapod.hashing.file_hashers.BasicFileHasher", + "_config": { + "algorithm": "sha256" + } + } + } + } + } + } + }, + "object_hasher": { + "_class": "orcapod.hashing.object_hashers.BasicObjectHasher", + "_config": { + "hasher_id": "object_v0.1", + "function_info_extractor": { + "_class": "orcapod.hashing.function_info_extractors.FunctionSignatureExtractor", + "_config": { + "include_module": true, + "include_defaults": true + } + } + } + }, + "metadata": { + "created_date": "2025-08-01", + "author": "OrcaPod Team", + "changelog": [ + "Initial release with semantic type registry", + "Basic Arrow and object hashing capabilities" + ] + } + } + ] +} \ No newline at end of file diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json new file mode 100644 index 0000000..6923f31 --- /dev/null +++ b/src/orcapod/contexts/data/v0.1.json @@ -0,0 +1,60 @@ +{ + "context_key": "std:v0.1:default", + "version": "v0.1", + "description": "Initial stable release with basic Path semantic type support", + "semantic_type_registry": { + "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", + "_config": { + "converters": [ + { + "_class": "orcapod.semantic_types.struct_converters.PathStructConverter", + "_config": {} + } + ] + } + }, + "arrow_hasher": { + "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", + "_config": { + "hasher_id": "arrow_v0.1", + "hash_algorithm": "sha256", + "chunk_size": 8192, + "serialization_method": "logical", + "semantic_type_hashers": { + "path": { + "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", + "_config": { + "file_hasher": { + "_class": "orcapod.hashing.file_hashers.BasicFileHasher", + "_config": { + "algorithm": "sha256" + } + } + } + } + } + } + }, + "object_hasher": { + "_class": "orcapod.hashing.object_hashers.BasicObjectHasher", + "_config": { + "hasher_id": "object_v0.1", + "function_info_extractor": { + "_class": "orcapod.hashing.function_info_extractors.FunctionSignatureExtractor", + "_config": { + "include_module": true, + 
"include_defaults": true + } + } + } + }, + "metadata": { + "created_date": "2025-08-01", + "author": "OrcaPod Core Team", + "changelog": [ + "Initial release with Path semantic type support", + "Basic SHA-256 hashing for files and objects", + "Arrow logical serialization method" + ] + } +} \ No newline at end of file diff --git a/src/orcapod/contexts/registry.py b/src/orcapod/contexts/registry.py new file mode 100644 index 0000000..c9d3226 --- /dev/null +++ b/src/orcapod/contexts/registry.py @@ -0,0 +1,334 @@ +""" +JSON-based Context Registry implementation. + +This module contains the core registry that loads and manages +data contexts from JSON files with validation and caching. +""" + +import json + + +from pathlib import Path +from typing import Any +import logging +from orcapod.utils.object_spec import parse_objectspec +from .core import DataContext, ContextValidationError, ContextResolutionError + +logger = logging.getLogger(__name__) + +try: + import jsonschema +except ImportError: + jsonschema = None + logger.info("jsonschema not available, skipping schema validation") + + +class JSONDataContextRegistry: + """ + Registry that loads data contexts from JSON files with validation. + + Features: + - Loads context specs from JSON files in a directory + - Validates JSON structure against schema + - Lazy loading with caching + - Robust error handling and logging + - Context string resolution (e.g., "v0.1", "std:v0.1:default") + """ + + def __init__( + self, + contexts_dir: Path | str | None = None, + schema_file: Path | str | None = None, + default_version: str = "v0.1", + ): + """ + Initialize the context registry. + + Args: + contexts_dir: Directory containing JSON context files + schema_file: JSON schema file for validation (optional) + default_version: Default context version to use + """ + # Set up paths + if contexts_dir is None: + contexts_dir = self._get_default_contexts_dir() + self.contexts_dir = Path(contexts_dir) + + if schema_file is None: + schema_file = self.contexts_dir / "schemas" / "context_schema.json" + self.schema_file = Path(schema_file) if schema_file else None + + # Internal state + self._specs: dict[str, dict[str, Any]] = {} + self._contexts: dict[str, DataContext] = {} + self._schema: dict[str, Any] | None = None + self._default_version = default_version + + # Load everything on initialization + self._load_schema() + self._load_all_specs() + logger.info(f"Loaded {len(self._specs)} context specifications") + + def _get_default_contexts_dir(self) -> Path: + """Get the default contexts directory from package data.""" + try: + # Python 3.9+ preferred method + import importlib.resources as resources + + contexts_path = resources.files("orcapod.contexts") / "data" + return Path(str(contexts_path)) + except (ImportError, AttributeError): + # Fallback for older Python versions + return Path(__file__).parent / "data" + + def _load_schema(self) -> None: + """Load JSON schema for validation if available.""" + if self.schema_file and self.schema_file.exists(): + try: + with open(self.schema_file, "r") as f: + self._schema = json.load(f) + logger.info(f"Loaded validation schema from {self.schema_file}") + except Exception as e: + logger.warning(f"Failed to load schema from {self.schema_file}: {e}") + self._schema = None + else: + logger.info("No validation schema specified or found") + self._schema = None + + def _load_all_specs(self) -> None: + """Load all JSON context specifications from the contexts directory.""" + if not self.contexts_dir.exists(): + raise 
ContextValidationError( + f"Contexts directory not found: {self.contexts_dir}" + ) + + json_files = list(self.contexts_dir.glob("*.json")) + if not json_files: + raise ContextValidationError( + f"No JSON context files found in {self.contexts_dir}" + ) + + for json_file in json_files: + try: + self._load_spec_file(json_file) + except Exception as e: + logger.error(f"Failed to load context spec from {json_file}: {e}") + raise ContextValidationError(f"Invalid context file {json_file}: {e}") + + def _load_spec_file(self, json_file: Path) -> None: + """Load and validate a single context specification file.""" + version = json_file.stem # e.g., "v0.1" from "v0.1.json" + + # Load JSON + with open(json_file, "r") as f: + spec = json.load(f) + + # Validate basic structure + if not isinstance(spec, dict): + raise ContextValidationError("Context spec must be a JSON object") + + # Check version consistency + spec_version = spec.get("version") + if spec_version != version: + raise ContextValidationError( + f"Version mismatch in {json_file}: filename suggests '{version}' " + f"but spec contains '{spec_version}'" + ) + + # Validate required fields + required_fields = [ + "context_key", + "version", + "semantic_type_registry", + "arrow_hasher", + "object_hasher", + ] + missing_fields = [field for field in required_fields if field not in spec] + if missing_fields: + raise ContextValidationError(f"Missing required fields: {missing_fields}") + + # Validate against JSON schema if available + if self._schema and jsonschema is not None: + try: + jsonschema.validate(spec, self._schema) + except jsonschema.ValidationError as e: + raise ContextValidationError(f"Schema validation failed: {e.message}") + + # Store the validated spec + self._specs[version] = spec + logger.debug(f"Loaded context spec: {version} -> {spec.get('context_key')}") + + def get_available_versions(self) -> list[str]: + """Get all available context versions, sorted.""" + return sorted(self._specs.keys()) + + def get_context_info(self, version: str) -> dict[str, Any]: + """Get context metadata without creating the full context.""" + if version not in self._specs: + available = ", ".join(self.get_available_versions()) + raise ContextResolutionError( + f"Unknown context version '{version}'. Available: {available}" + ) + + spec = self._specs[version] + return { + "version": spec["version"], + "context_key": spec["context_key"], + "description": spec.get("description", "No description provided"), + "file_path": self.contexts_dir / f"{version}.json", + } + + def resolve_context_string(self, context_string: str | None) -> str: + """ + Resolve context string to a version identifier. 
+ + Supports various formats: + - None -> default version + - "v0.1" -> "v0.1" + - "std:v0.1:default" -> "v0.1" (extract version from full key) + - "latest" -> highest version number + """ + if context_string is None: + return self._default_version + + # Handle special cases + if context_string == "latest": + versions = self.get_available_versions() + return versions[-1] if versions else self._default_version + + # If it looks like a simple version (v0.1), use directly + if context_string.startswith("v") and ":" not in context_string: + return context_string + + # If it looks like a full context key (std:v0.1:default), extract version + if ":" in context_string: + parts = context_string.split(":") + if len(parts) >= 2 and parts[1].startswith("v"): + return parts[1] # Extract version part + + # Fallback: treat as version string + return context_string + + def get_context(self, context_string: str | None = None) -> DataContext: + """ + Get DataContext instance, creating it lazily if needed. + + Args: + context_string: Version string, full context key, or None for default + + Returns: + DataContext instance + + Raises: + ContextResolutionError: If context cannot be resolved or created + """ + try: + # Resolve to version + version = self.resolve_context_string(context_string) + + # Return cached context if available + if version in self._contexts: + logger.debug(f"Returning cached context for version {version}") + return self._contexts[version] + + # Validate version exists + if version not in self._specs: + available = ", ".join(self.get_available_versions()) + raise ContextResolutionError( + f"Unknown context version '{version}' (resolved from '{context_string}'). " + f"Available: {available}" + ) + + # Create context from spec + logger.info(f"Creating new context for version {version}") + spec = self._specs[version] + context = self._create_context_from_spec(spec) + + # Cache and return + self._contexts[version] = context + return context + + except Exception as e: + if isinstance(e, (ContextResolutionError, ContextValidationError)): + raise + else: + raise ContextResolutionError( + f"Failed to resolve context '{context_string}': {e}" + ) + + def _create_context_from_spec(self, spec: dict[str, Any]) -> DataContext: + """Create DataContext instance from validated specification.""" + try: + # Parse each component using ObjectSpec + context_key = spec["context_key"] + version = spec["version"] + description = spec.get("description", "") + + logger.debug(f"Creating semantic registry for {version}") + semantic_registry = parse_objectspec(spec["semantic_type_registry"]) + + logger.debug(f"Creating arrow hasher for {version}") + arrow_hasher = parse_objectspec(spec["arrow_hasher"]) + + logger.debug(f"Creating object hasher for {version}") + object_hasher = parse_objectspec(spec["object_hasher"]) + + return DataContext( + context_key=context_key, + version=version, + description=description, + semantic_type_registry=semantic_registry, + arrow_hasher=arrow_hasher, + object_hasher=object_hasher, + ) + + except Exception as e: + raise ContextValidationError(f"Failed to create context from spec: {e}") + + def set_default_version(self, version: str) -> None: + """Set the default context version.""" + if version not in self._specs: + available = ", ".join(self.get_available_versions()) + raise ContextResolutionError( + f"Cannot set default to unknown version '{version}'. 
Available: {available}" + ) + + old_default = self._default_version + self._default_version = version + logger.info(f"Changed default context version from {old_default} to {version}") + + def reload_contexts(self) -> None: + """Reload all context specifications from disk.""" + logger.info("Reloading context specifications from disk") + + # Clear caches + self._specs.clear() + self._contexts.clear() + + # Reload + self._load_schema() + self._load_all_specs() + + logger.info(f"Reloaded {len(self._specs)} context specifications") + + def validate_all_contexts(self) -> dict[str, str | None]: + """ + Validate that all context specifications can be instantiated. + + Returns: + Dict mapping version -> error message (None if valid) + """ + results = {} + + for version in self.get_available_versions(): + try: + # Try to create the context (don't cache it) + spec = self._specs[version] + self._create_context_from_spec(spec) + results[version] = None # Success + logger.debug(f"Context {version} validates successfully") + except Exception as e: + results[version] = str(e) + logger.error(f"Context {version} validation failed: {e}") + + return results diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 14cb191..c381ad4 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -16,7 +16,6 @@ from orcapod.data.system_constants import orcapod_constants as constants from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec, schemas -from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -103,6 +102,28 @@ def __init__( data_context = source.data_context_key self._data_context = DataContext.resolve_data_context(data_context) + @property + def substream_identities(self) -> tuple[str, ...]: + """ + Returns the identities of the substreams that this stream is composed of. + This is used to identify the substreams in the computational graph. + """ + return ( + self._data_context.object_hasher.hash_to_hex( + self.identity_structure(), compressed=True, prefix_hasher_id=True + ), + ) + + def get_substream(self, substream_id: str) -> dp.Stream: + """ + Returns the substream with the given substream_id. + This is used to retrieve a specific substream from the stream. + """ + if substream_id == self.substream_identities[0]: + return self + else: + raise ValueError(f"Substream with ID {substream_id} not found.") + @property def data_context(self) -> DataContext: """ diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 8a6f330..e16491d 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -994,6 +994,35 @@ class Stream(ContentIdentifiable, Labelable, Protocol): - Conversion to common formats (tables, dictionaries) """ + @property + def substream_identities(self) -> tuple[str, ...]: + """ + Unique identifiers for sub-streams within this stream. + + This property provides a way to identify and differentiate + sub-streams that may be part of a larger stream. It is useful + for tracking and managing complex data flows. + + Returns: + tuple[str, ...]: Unique identifiers for each sub-stream + """ + ... + + def get_substream(self, substream_id: str) -> "Stream": + """ + Retrieve a specific sub-stream by its identifier. + + This method allows access to individual sub-streams within the + main stream, enabling focused operations on specific data segments. 
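+        Valid identifiers are exactly those reported by substream_identities;
+        for example, stream.get_substream(stream.substream_identities[0])
+        returns the first (and often the only) substream.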
+ + Args: + substream_id: Unique identifier for the desired sub-stream. + + Returns: + Stream: The requested sub-stream if it exists + """ + ... + @property def source(self) -> "Kernel | None": """ From f368db4a5499308aa46bb785628ae4c49b723c4c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 2 Aug 2025 22:05:28 +0000 Subject: [PATCH 164/224] refactor: update context to load universal type converter --- src/orcapod/contexts/core.py | 4 ++-- .../contexts/data/schemas/context_schema.json | 6 +++--- src/orcapod/contexts/data/v0.1.json | 19 ++++++++++++------- src/orcapod/contexts/registry.py | 13 ++++++++----- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py index 06e0899..078f857 100644 --- a/src/orcapod/contexts/core.py +++ b/src/orcapod/contexts/core.py @@ -6,7 +6,7 @@ """ from dataclasses import dataclass -from orcapod.semantic_types import SemanticTypeRegistry +from orcapod.semantic_types import UniversalTypeConverter from orcapod.protocols import hashing_protocols as hp @@ -31,7 +31,7 @@ class DataContext: context_key: str version: str description: str - semantic_type_registry: SemanticTypeRegistry + type_converter: UniversalTypeConverter arrow_hasher: hp.ArrowHasher object_hasher: hp.ObjectHasher diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 462a2db..0d5a305 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -8,7 +8,7 @@ "required": [ "context_key", "version", - "semantic_type_registry", + "type_converter", "arrow_hasher", "object_hasher" ], @@ -41,9 +41,9 @@ "Enhanced version with timestamp support and improved hashing" ] }, - "semantic_type_registry": { + "type_converter": { "$ref": "#/$defs/objectspec", - "description": "ObjectSpec for the semantic type registry" + "description": "ObjectSpec for the python-arrow type converter" }, "arrow_hasher": { "$ref": "#/$defs/objectspec", diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 6923f31..188bd9c 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -2,15 +2,20 @@ "context_key": "std:v0.1:default", "version": "v0.1", "description": "Initial stable release with basic Path semantic type support", - "semantic_type_registry": { - "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", + "type_converter": { + "_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter", "_config": { - "converters": [ - { - "_class": "orcapod.semantic_types.struct_converters.PathStructConverter", - "_config": {} + "semantic_registry": { + "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", + "_config": { + "converters": [ + { + "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", + "_config": {} + } + ] } - ] + } } }, "arrow_hasher": { diff --git a/src/orcapod/contexts/registry.py b/src/orcapod/contexts/registry.py index c9d3226..7bf869a 100644 --- a/src/orcapod/contexts/registry.py +++ b/src/orcapod/contexts/registry.py @@ -135,11 +135,12 @@ def _load_spec_file(self, json_file: Path) -> None: f"but spec contains '{spec_version}'" ) + # TODO: clean this up -- sounds redundant to the validation performed by schema check # Validate required fields required_fields = [ "context_key", "version", - "semantic_type_registry", + "type_converter", 
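+            # "type_converter" replaces the former "semantic_type_registry" field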
"arrow_hasher", "object_hasher", ] @@ -264,8 +265,8 @@ def _create_context_from_spec(self, spec: dict[str, Any]) -> DataContext: version = spec["version"] description = spec.get("description", "") - logger.debug(f"Creating semantic registry for {version}") - semantic_registry = parse_objectspec(spec["semantic_type_registry"]) + logger.debug(f"Creating type converter for {version}") + type_converter = parse_objectspec(spec["type_converter"]) logger.debug(f"Creating arrow hasher for {version}") arrow_hasher = parse_objectspec(spec["arrow_hasher"]) @@ -277,13 +278,15 @@ def _create_context_from_spec(self, spec: dict[str, Any]) -> DataContext: context_key=context_key, version=version, description=description, - semantic_type_registry=semantic_registry, + type_converter=type_converter, arrow_hasher=arrow_hasher, object_hasher=object_hasher, ) except Exception as e: - raise ContextValidationError(f"Failed to create context from spec: {e}") + raise ContextValidationError( + f"Failed to create context from spec: {e}" + ) from e def set_default_version(self, version: str) -> None: """Set the default context version.""" From 66939f8e400c726cfc36dc3d927fd831da182fa4 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 3 Aug 2025 01:26:12 +0000 Subject: [PATCH 165/224] refactor: major change of data context and semantic type system --- src/orcapod/__init__.py | 2 +- src/orcapod/contexts/core.py | 8 +- src/orcapod/data/context.py | 59 -- src/orcapod/data/datagrams/arrow_datagram.py | 88 +- .../data/datagrams/arrow_tag_packet.py | 21 +- src/orcapod/data/datagrams/base.py | 12 +- src/orcapod/data/datagrams/dict_datagram.py | 79 +- src/orcapod/data/datagrams/dict_tag_packet.py | 13 +- src/orcapod/data/kernels.py | 8 +- src/orcapod/data/operators/base.py | 14 +- src/orcapod/data/pods.py | 26 +- src/orcapod/data/sources.py | 25 +- src/orcapod/data/streams.py | 43 +- src/orcapod/data/trackers.py | 6 +- src/orcapod/pipeline/graph.py | 4 +- src/orcapod/pipeline/legacy_nodes.py | 817 ------------------ src/orcapod/pipeline/nodes.py | 2 +- src/orcapod/protocols/data_protocols.py | 63 +- src/orcapod/protocols/hashing_protocols.py | 23 +- src/orcapod/protocols/semantic_protocols.py | 45 +- src/orcapod/protocols/store_protocols.py | 27 + src/orcapod/semantic_types/__init__.py | 7 + .../semantic_types/precomputed_converters.py | 147 ++++ src/orcapod/semantic_types/schemas.py | 357 -------- .../semantic_types/semantic_registry.py | 318 +++++++ .../semantic_struct_converters.py | 79 ++ .../semantic_types/struct_converters.py | 307 ------- src/orcapod/semantic_types/type_inference.py | 250 ++++++ .../semantic_types/universal_converter.py | 506 +++++++---- .../{ => unused}/complete_converter_test.py | 76 +- .../{ => unused}/python_arrow_types.py | 202 ++++- src/orcapod/semantic_types/unused/schemas.py | 357 ++++++++ .../semantic_converters.py} | 546 +++--------- .../semantic_types/unused/struct_types.py | 312 +++++++ .../{ => unused}/table_converters.py | 48 +- src/orcapod/types/__init__.py | 1 - src/orcapod/types/defaults.py | 51 -- src/orcapod/types/schemas.py | 357 -------- src/orcapod/types/semantic_converter.py | 135 --- src/orcapod/types/semantic_types.py | 623 ------------- src/orcapod/utils/dict_utils.py | 0 src/orcapod/utils/function_info.py | 320 +++++++ src/orcapod/utils/object_spec.py | 99 ++- 43 files changed, 2849 insertions(+), 3634 deletions(-) delete mode 100644 src/orcapod/data/context.py delete mode 100644 src/orcapod/pipeline/legacy_nodes.py create mode 100644 
src/orcapod/semantic_types/precomputed_converters.py delete mode 100644 src/orcapod/semantic_types/schemas.py create mode 100644 src/orcapod/semantic_types/semantic_registry.py create mode 100644 src/orcapod/semantic_types/semantic_struct_converters.py delete mode 100644 src/orcapod/semantic_types/struct_converters.py create mode 100644 src/orcapod/semantic_types/type_inference.py rename src/orcapod/semantic_types/{ => unused}/complete_converter_test.py (90%) rename src/orcapod/semantic_types/{ => unused}/python_arrow_types.py (83%) create mode 100644 src/orcapod/semantic_types/unused/schemas.py rename src/orcapod/semantic_types/{complete_converter.py => unused/semantic_converters.py} (67%) create mode 100644 src/orcapod/semantic_types/unused/struct_types.py rename src/orcapod/semantic_types/{ => unused}/table_converters.py (88%) delete mode 100644 src/orcapod/types/defaults.py delete mode 100644 src/orcapod/types/schemas.py delete mode 100644 src/orcapod/types/semantic_converter.py delete mode 100644 src/orcapod/types/semantic_types.py create mode 100644 src/orcapod/utils/dict_utils.py create mode 100644 src/orcapod/utils/function_info.py diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 5d7c423..cd9f09a 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -17,5 +17,5 @@ "streams", "stores", "operators", - "Pipeline" + "Pipeline", ] diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py index 078f857..7e87319 100644 --- a/src/orcapod/contexts/core.py +++ b/src/orcapod/contexts/core.py @@ -6,7 +6,11 @@ """ from dataclasses import dataclass -from orcapod.semantic_types import UniversalTypeConverter +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + # TODO: consider establishing type converter protocol + from orcapod.semantic_types import UniversalTypeConverter from orcapod.protocols import hashing_protocols as hp @@ -31,7 +35,7 @@ class DataContext: context_key: str version: str description: str - type_converter: UniversalTypeConverter + type_converter: "UniversalTypeConverter" arrow_hasher: hp.ArrowHasher object_hasher: hp.ObjectHasher diff --git a/src/orcapod/data/context.py b/src/orcapod/data/context.py deleted file mode 100644 index 9e402ab..0000000 --- a/src/orcapod/data/context.py +++ /dev/null @@ -1,59 +0,0 @@ -from orcapod.types.semantic_types import SemanticTypeRegistry -from orcapod.types import default_registry -from orcapod.protocols import hashing_protocols as hp -from orcapod.hashing.defaults import get_default_arrow_hasher, get_default_object_hasher -from dataclasses import dataclass - - -@dataclass -class DataContext: - context_key: str - semantic_type_registry: SemanticTypeRegistry - arrow_hasher: hp.ArrowHasher - object_hasher: hp.ObjectHasher - - @staticmethod - def resolve_data_context(data_context: "str | DataContext | None") -> "DataContext": - """ - Returns the default data context manager. - This is typically used when no specific context is provided. - """ - return orcapod_system_data_context_manager.resolve_context(data_context) - - -class DataContextManager(dict[str, DataContext]): - def register_context(self, data_context: DataContext): - """ - Register a new DataContext instance. - - Args: - data_context: The DataContext instance to register. - """ - if data_context.context_key in self: - raise ValueError( - f"DataContext with key {data_context.context_key} already exists." 
- ) - self[data_context.context_key] = data_context - - def resolve_context(self, context_info: str | DataContext | None) -> DataContext: - if isinstance(context_info, DataContext): - return context_info - if context_info is None: - return default_data_context - if isinstance(context_info, str): - if context_info in self: - return self[context_info] - else: - raise ValueError(f"DataContext with key {context_info} not found.") - - -default_data_context = DataContext( - "std:v0.1.0:default", - default_registry, - get_default_arrow_hasher(), - get_default_object_hasher(), -) - - -orcapod_system_data_context_manager = DataContextManager() -orcapod_system_data_context_manager.register_context(default_data_context) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index c29cf58..2a21a58 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -4,14 +4,11 @@ import pyarrow as pa -from orcapod.data.context import ( - DataContext, -) +from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.data.system_constants import orcapod_constants as constants -from orcapod.types import schemas, typespec_utils +from orcapod.types import TypeSpec, typespec_utils from orcapod.types.core import DataValue -from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils import arrow_utils logger = logging.getLogger(__name__) @@ -53,8 +50,7 @@ def __init__( self, table: pa.Table, meta_info: Mapping[str, DataValue] | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, ) -> None: """ Initialize ArrowDatagram from PyArrow Table. 
@@ -86,7 +82,7 @@ def __init__( else [] ) - # Extract context table if present + # Extract context table from passed in table if present if constants.CONTEXT_KEY in table.column_names and data_context is None: context_table = table.select([constants.CONTEXT_KEY]) data_context = context_table[constants.CONTEXT_KEY].to_pylist()[0] @@ -104,16 +100,6 @@ def __init__( if len(self._data_table.column_names) == 0: raise ValueError("Data table must contain at least one data column.") - # Create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema( - self._data_table.schema, - self._data_context.semantic_type_registry, - ) - ) - self._semantic_converter = semantic_converter - # process supplemented meta info if provided if meta_info is not None: # make sure it has the expected prefixes @@ -125,11 +111,12 @@ def __init__( ): v for k, v in meta_info.items() } - # Note that meta information cannot contain semantic types - typespec = typespec_utils.get_typespec_from_dict(meta_info) - new_meta_table = self._semantic_converter.from_python_to_arrow( - meta_info, typespec + new_meta_table = ( + self._data_context.type_converter.python_dicts_to_arrow_table( + [meta_info], + ) ) + if self._meta_table is None: self._meta_table = new_meta_table else: @@ -151,9 +138,9 @@ def __init__( ) # Initialize caches - self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_schema: TypeSpec | None = None self._cached_python_dict: dict[str, DataValue] | None = None - self._cached_meta_python_schema: schemas.PythonSchema | None = None + self._cached_meta_python_schema: TypeSpec | None = None self._cached_content_hash: str | None = None # 1. Core Properties (Identity & Structure) @@ -224,7 +211,7 @@ def types( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> schemas.PythonSchema: + ) -> dict[str, type]: """ Return Python schema for the datagram. 
@@ -244,7 +231,7 @@ def types( # Get data schema (cached) if self._cached_python_schema is None: self._cached_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema( + self._data_context.type_converter.arrow_schema_to_python_schema( self._data_table.schema ) ) @@ -259,7 +246,7 @@ def types( if include_meta_columns and self._meta_table is not None: if self._cached_meta_python_schema is None: self._cached_meta_python_schema = ( - self._semantic_converter.from_arrow_to_python_schema( + self._data_context.type_converter.arrow_schema_to_python_schema( self._meta_table.schema ) ) @@ -274,7 +261,7 @@ def types( } schema.update(filtered_meta_schema) - return schemas.PythonSchema(schema) + return schema def arrow_schema( self, @@ -371,9 +358,11 @@ def as_dict( # Get data dict (cached) if self._cached_python_dict is None: - self._cached_python_dict = self._semantic_converter.from_arrow_to_python( - self._data_table - )[0] + self._cached_python_dict = ( + self._data_context.type_converter.arrow_table_to_python_dicts( + self._data_table + )[0] + ) result_dict = dict(self._cached_python_dict) @@ -569,7 +558,7 @@ def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: ) new_datagram = self.copy(include_cache=False) - new_datagram._meta_table = self._meta_table.drop_columns(prefixed_keys) + new_datagram._meta_table = self._meta_table.drop_columns(list(prefixed_keys)) return new_datagram @@ -616,7 +605,7 @@ def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: column_names = tuple(c for c in column_names if self._data_table.columns) new_datagram = self.copy(include_cache=False) - new_datagram._data_table = self._data_table.drop_columns(column_names) + new_datagram._data_table = self._data_table.drop_columns(list(column_names)) # TODO: consider dropping extra semantic columns if they are no longer needed return new_datagram @@ -641,11 +630,6 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: new_datagram = self.copy(include_cache=False) new_datagram._data_table = new_datagram._data_table.rename_columns(new_names) - # apply the same rename to the converters - new_datagram._semantic_converter = self._semantic_converter.rename( - column_mapping - ) - return new_datagram def update(self, **updates: DataValue) -> Self: @@ -681,12 +665,12 @@ def update(self, **updates: DataValue) -> Self: new_datagram = self.copy(include_cache=False) - updates_typespec = schemas.PythonSchema( - {k: v for k, v in self.types().items() if k in updates} - ) - update_table = self._semantic_converter.from_python_to_arrow( - updates, updates_typespec + updates_typespec = {k: v for k, v in self.types().items() if k in updates} + + update_table = self._data_context.type_converter.python_dicts_to_arrow_table( + [updates], python_schema=updates_typespec ) + new_datagram._data_table = arrow_utils.hstack_tables( self._data_table.drop_columns(list(updates.keys())), update_table ).select(self._data_table.column_names) # adjsut the order to match original @@ -731,25 +715,20 @@ def with_columns( # TODO: consider simplifying this conversion logic # prepare update's table - typespec = typespec_utils.get_typespec_from_dict(updates, column_types) + typespec: dict[str, type] = typespec_utils.get_typespec_from_dict( + updates, column_types + ) # type: ignore[assignment] - updates_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_typespec( - typespec, self._data_context.semantic_type_registry - ) - ) # TODO: cleanup the handling of typespec python 
schema and various conversion points - new_data_table = updates_converter.from_python_to_arrow(updates, typespec) + new_data_table = self._data_context.type_converter.python_dicts_to_arrow_table( + [updates], python_schema=typespec + ) # perform in-place update new_datagram._data_table = arrow_utils.hstack_tables( new_datagram._data_table, new_data_table ) - # prepare the joined converter - new_datagram._semantic_converter = self._semantic_converter.join( - updates_converter - ) return new_datagram # 7. Context Operations @@ -784,7 +763,6 @@ def copy(self, include_cache: bool = True) -> Self: new_datagram._data_table = self._data_table new_datagram._meta_table = self._meta_table new_datagram._data_context = self._data_context - new_datagram._semantic_converter = self._semantic_converter if include_cache: new_datagram._cached_python_schema = self._cached_python_schema diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index ec1ec0f..e45fa35 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -6,12 +6,10 @@ import pyarrow as pa from orcapod.data.system_constants import orcapod_constants as constants -from orcapod.data.context import ( - DataContext, -) -from orcapod.types import schemas +from orcapod import contexts +from orcapod.types import TypeSpec + from orcapod.types.core import DataValue -from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils import arrow_utils from orcapod.data.datagrams.arrow_datagram import ArrowDatagram @@ -38,8 +36,7 @@ class ArrowTag(ArrowDatagram): def __init__( self, table: pa.Table, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, ) -> None: if len(table) != 1: raise ValueError( @@ -48,7 +45,6 @@ def __init__( ) super().__init__( table=table, - semantic_converter=semantic_converter, data_context=data_context, ) @@ -83,8 +79,7 @@ def __init__( table: pa.Table | pa.RecordBatch, meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, ) -> None: if len(table) != 1: raise ValueError( @@ -116,13 +111,12 @@ def __init__( super().__init__( data_table, meta_info=meta_info, - semantic_converter=semantic_converter, data_context=data_context, ) self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] self._cached_source_info: dict[str, str | None] | None = None - self._cached_python_schema: schemas.PythonSchema | None = None + self._cached_python_schema: TypeSpec | None = None self._cached_content_hash: str | None = None def keys( @@ -147,7 +141,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> schemas.PythonSchema: + ) -> dict[str, type]: """Return copy of the Python schema.""" schema = super().types( include_all_info=include_all_info, @@ -247,7 +241,6 @@ def as_datagram( ) return ArrowDatagram( table, - semantic_converter=self._semantic_converter, data_context=self._data_context, ) diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index 9f6d4a8..9e9cc1a 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -19,13 +19,11 @@ import logging 
from abc import ABC, abstractmethod from collections.abc import Collection, Iterator, Mapping -from typing import Any, Self, TypeAlias +from typing import Self, TypeAlias +from orcapod import contexts import pyarrow as pa -from orcapod.data.context import ( - DataContext, -) from orcapod.types import TypeSpec from orcapod.types.core import DataValue @@ -116,7 +114,7 @@ class BaseDatagram(ABC): is interpreted and used is left to concrete implementations. """ - def __init__(self, data_context: DataContext | str | None = None) -> None: + def __init__(self, data_context: contexts.DataContext | str | None = None) -> None: """ Initialize base datagram with data context. @@ -124,7 +122,7 @@ def __init__(self, data_context: DataContext | str | None = None) -> None: data_context: Context for semantic interpretation. Can be a string key or a DataContext object, or None for default. """ - self._data_context = DataContext.resolve_data_context(data_context) + self._data_context = contexts.resolve_context(data_context) # 1. Core Properties (Identity & Structure) @property @@ -138,6 +136,8 @@ def meta_columns(self) -> tuple[str, ...]: """Return tuple of meta column names.""" ... + # TODO: add meta info + # 2. Dict-like Interface (Data Access) @abstractmethod def __getitem__(self, key: str) -> DataValue: diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 9d6664a..3da58ac 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -5,14 +5,11 @@ import pyarrow as pa from orcapod.data.system_constants import orcapod_constants as constants -from orcapod.data.context import ( - DataContext, -) +from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram -from orcapod.types import TypeSpec, schemas +from orcapod.types import TypeSpec from orcapod.types import typespec_utils as tsutils from orcapod.types.core import DataValue -from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils import arrow_utils logger = logging.getLogger(__name__) @@ -55,8 +52,7 @@ def __init__( data: Mapping[str, DataValue], typespec: TypeSpec | None = None, meta_info: Mapping[str, DataValue] | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, ) -> None: """ Initialize DictDatagram from dictionary data. @@ -103,29 +99,17 @@ def __init__( # Combine provided typespec info with inferred typespec from content # If the column value is None and no type spec is provided, defaults to str. 
- self._data_python_schema = schemas.PythonSchema( - tsutils.get_typespec_from_dict( - self._data, - typespec, - ) + self._data_python_schema = tsutils.get_typespec_from_dict( + self._data, + typespec, ) - # Create semantic converter - if semantic_converter is None: - semantic_converter = SemanticConverter.from_semantic_schema( - self._data_python_schema.to_semantic_schema( - semantic_type_registry=self._data_context.semantic_type_registry - ), - ) - self._semantic_converter = semantic_converter - # Create schema for meta data - self._meta_python_schema = schemas.PythonSchema( - tsutils.get_typespec_from_dict( - self._meta_data, - typespec=typespec, - ) + self._meta_python_schema = tsutils.get_typespec_from_dict( + self._meta_data, + typespec=typespec, ) + # Initialize caches self._cached_data_table: pa.Table | None = None @@ -196,7 +180,7 @@ def types( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> schemas.PythonSchema: + ) -> TypeSpec: """ Return Python schema for the datagram. @@ -232,7 +216,7 @@ def types( } schema.update(filtered_meta_schema) - return schemas.PythonSchema(schema) + return schema def arrow_schema( self, @@ -258,11 +242,10 @@ def arrow_schema( # Build data schema (cached) if self._cached_data_arrow_schema is None: - self._cached_data_arrow_schema = ( - self._semantic_converter.from_python_to_arrow_schema( - self._data_python_schema - ) + self._cached_data_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema( + self._data_python_schema ) + all_schemas = [self._cached_data_arrow_schema] @@ -273,24 +256,14 @@ def arrow_schema( # Add meta schema if requested if include_meta_columns and self._meta_data: - if self._cached_meta_arrow_schema is None: - self._cached_meta_arrow_schema = ( - self._semantic_converter.from_python_to_arrow_schema( - self._meta_python_schema - ) - ) - - assert self._cached_meta_arrow_schema is not None, ( - "Meta Arrow schema should be initialized by now" - ) if include_meta_columns is True: - meta_schema = self._cached_meta_arrow_schema + meta_schema = self._get_meta_arrow_schema() elif isinstance(include_meta_columns, Collection): # Filter meta schema by prefix matching matched_fields = [ field - for field in self._cached_meta_arrow_schema + for field in self._get_meta_arrow_schema() if any( field.name.startswith(prefix) for prefix in include_meta_columns ) @@ -381,11 +354,10 @@ def _get_meta_arrow_table(self) -> pa.Table: def _get_meta_arrow_schema(self) -> pa.Schema: if self._cached_meta_arrow_schema is None: - self._cached_meta_arrow_schema = ( - self._semantic_converter.from_python_to_arrow_schema( - self._meta_python_schema - ) + self._cached_meta_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema( + self._meta_python_schema ) + assert self._cached_meta_arrow_schema is not None, ( "Meta Arrow schema should be initialized by now" ) @@ -415,8 +387,8 @@ def as_table( # Build data table (cached) if self._cached_data_table is None: - self._cached_data_table = self._semantic_converter.from_python_to_arrow( - self._data, + self._cached_data_table = self._data_context.type_converter.python_dicts_to_arrow_table( + [self._data], self._data_python_schema, ) assert self._cached_data_table is not None, ( @@ -500,7 +472,6 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: return self.__class__( data=full_data, - semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -543,7 +514,6 @@ def 
drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: return self.__class__( data=full_data, - semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -573,7 +543,6 @@ def select(self, *column_names: str) -> Self: return self.__class__( data=full_data, - semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -606,7 +575,6 @@ def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: return self.__class__( data=full_data, - semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -647,7 +615,6 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: return self.__class__( data=full_data, typespec=new_typespec, - semantic_converter=self._semantic_converter, data_context=self._data_context, ) @@ -686,7 +653,6 @@ def update(self, **updates: DataValue) -> Self: return self.__class__( data=full_data, - semantic_converter=self._semantic_converter, # Keep existing converter data_context=self._data_context, ) @@ -786,7 +752,6 @@ def copy(self, include_cache: bool = True) -> Self: new_datagram._data = self._data.copy() new_datagram._meta_data = self._meta_data.copy() new_datagram._data_python_schema = self._data_python_schema.copy() - new_datagram._semantic_converter = self._semantic_converter new_datagram._meta_python_schema = self._meta_python_schema.copy() if include_cache: diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index a45a22c..f61fefe 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -1,16 +1,14 @@ import logging from collections.abc import Collection, Mapping from typing import Self -from xml.etree.ElementInclude import include import pyarrow as pa from orcapod.data.system_constants import orcapod_constants as constants -from orcapod.data.context import DataContext +from orcapod import contexts from orcapod.data.datagrams.dict_datagram import DictDatagram -from orcapod.types import TypeSpec, schemas +from orcapod.types import TypeSpec from orcapod.types.core import DataValue -from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils import arrow_utils logger = logging.getLogger(__name__) @@ -50,8 +48,7 @@ def __init__( meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, typespec: TypeSpec | None = None, - semantic_converter: SemanticConverter | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, ) -> None: # normalize the data content and remove any source info keys data_only = { @@ -67,7 +64,6 @@ def __init__( data_only, typespec=typespec, meta_info=meta_info, - semantic_converter=semantic_converter, data_context=data_context, ) @@ -171,7 +167,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> schemas.PythonSchema: + ) -> dict[str, type]: """Return copy of the Python schema.""" schema = super().types( include_all_info=include_all_info, @@ -238,7 +234,6 @@ def as_datagram( return DictDatagram( data, typespec=typespec, - semantic_converter=self._semantic_converter, data_context=self._data_context, ) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index ad4c900..dfe1680 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -6,7 +6,7 @@ import logging from 
orcapod.data.streams import KernelStream from orcapod.data.base import LabeledContentIdentifiableBase -from orcapod.data.context import DataContext +from orcapod import contexts from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER from orcapod.types import TypeSpec @@ -28,7 +28,7 @@ class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): def __init__( self, label: str | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, skip_tracking: bool = False, tracker_manager: dp.TrackerManager | None = None, **kwargs, @@ -36,7 +36,7 @@ def __init__( super().__init__(**kwargs) self._label = label - self._data_context = DataContext.resolve_data_context(data_context) + self._data_context = contexts.resolve_context(data_context) self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER @@ -59,7 +59,7 @@ def kernel_id(self) -> tuple[str, ...]: return (f"{self.__class__.__name__}", self._kernel_hash) @property - def data_context(self) -> DataContext: + def data_context(self) -> contexts.DataContext: return self._data_context @property diff --git a/src/orcapod/data/operators/base.py b/src/orcapod/data/operators/base.py index 638cc60..a7fac6f 100644 --- a/src/orcapod/data/operators/base.py +++ b/src/orcapod/data/operators/base.py @@ -1,3 +1,4 @@ +from ast import Not from orcapod.data.kernels import TrackedKernelBase from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec @@ -42,7 +43,18 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: It expects exactly one stream as input. """ stream = streams[0] - return self.op_forward(stream) + # visit each substream + output_substreams = [] + for substream_id in stream.substream_identities: + substream = stream.get_substream(substream_id) + output_substreams.append(self.op_forward(substream)) + + # at the moment only single output substream is supported + if len(output_substreams) != 1: + raise NotImplementedError( + "Support for multiple output substreams is not implemented yet." 
+ ) + return output_substreams[0] def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: stream = streams[0] diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 65793cd..84246e4 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -4,8 +4,7 @@ from collections.abc import Callable, Collection, Iterable, Sequence from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Literal - -from orcapod.data.context import DataContext +from orcapod import contexts from orcapod.data.datagrams import ( ArrowPacket, DictPacket, @@ -14,14 +13,11 @@ from orcapod.data.operators import Join from orcapod.data.streams import LazyPodResultStream, EfficientPodResultStream from orcapod.data.system_constants import orcapod_constants as constants -from orcapod.hashing.hash_utils import get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore from orcapod.types import TypeSpec from orcapod.types import typespec_utils as tsutils -from orcapod.types.schemas import PythonSchema -from orcapod.types.semantic_converter import SemanticConverter from orcapod.utils.lazy_module import LazyModule from orcapod.hashing.hash_utils import get_function_signature, get_function_components @@ -252,13 +248,10 @@ def __init__( input_typespec=input_typespec, output_typespec=output_typespec, ) - self._input_packet_schema = PythonSchema(input_packet_types) - self._output_packet_schema = PythonSchema(output_packet_types) - self._output_semantic_converter = SemanticConverter.from_semantic_schema( - self._output_packet_schema.to_semantic_schema( - semantic_type_registry=self.data_context.semantic_type_registry - ) - ) + self._input_packet_schema = input_packet_types + self._output_packet_schema = output_packet_types + # TODO: add output packet converter for speed up + self._function_info_extractor = function_info_extractor object_hasher = self.data_context.object_hasher self._function_signature_hash = object_hasher.hash_to_hex( @@ -285,14 +278,14 @@ def get_record_id(self, packet: dp.Packet) -> str: content, prefix_hasher_id=True ) - def input_packet_types(self) -> PythonSchema: + def input_packet_types(self) -> dict[str, type]: """ Return the input typespec for the function pod. This is used to validate the input streams. """ return self._input_packet_schema.copy() - def output_packet_types(self) -> PythonSchema: + def output_packet_types(self) -> dict[str, type]: """ Return the output typespec for the function pod. This is used to validate the output streams. 
@@ -349,7 +342,6 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non {k: v for k, v in zip(self.output_keys, output_values)}, source_info=source_info, typespec=self.output_packet_types(), - semantic_converter=self._output_semantic_converter, data_context=self._data_context, ) return tag, output_packet @@ -398,7 +390,7 @@ def __init__( self, pod: dp.Pod, label: str | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, **kwargs, ) -> None: if data_context is None: @@ -544,7 +536,7 @@ def record_packet( pa.array([input_packet.content_hash()], type=pa.large_string()), ) - result_flag = self.result_store.add_record( + self.result_store.add_record( self.record_path, self.pod.get_record_id(input_packet), data_table, diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index ec7fa0c..f5754aa 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -8,7 +8,6 @@ from pyarrow.lib import Table from orcapod.data.kernels import TrackedKernelBase -from orcapod.data.pods import PythonSchema from orcapod.data.streams import ( ImmutableTableStream, KernelStream, @@ -16,7 +15,7 @@ ) from orcapod.errors import DuplicateTagError from orcapod.protocols import data_protocols as dp -from orcapod.types import DataValue, TypeSpec, schemas, typespec_utils +from orcapod.types import DataValue, TypeSpec, typespec_utils from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import orcapod_constants as constants @@ -321,10 +320,9 @@ def __init__( raise ValueError( "At least one tag column must be provided when creating a new Delta table." ) - python_schema = schemas.PythonSchema(schema) - arrow_schema = python_schema.to_arrow_schema( - self.data_context.semantic_type_registry - ) + python_schema = schema + arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema(python_schema) + fields = [] for field in arrow_schema: if field.name in tag_columns: @@ -334,9 +332,8 @@ def __init__( else: arrow_schema = pa.schema(self._delta_table.schema().to_arrow()) - python_schema = schemas.PythonSchema.from_arrow_schema( - arrow_schema, self.data_context.semantic_type_registry - ) + python_schema = self._data_context.type_converter.arrow_schema_to_python_schema(arrow_schema) + inferred_tag_columns = [] for field in arrow_schema: if ( @@ -663,13 +660,9 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: This is called by forward() and creates a fresh snapshot each time. 
""" - tag_arrow_schema = schemas.PythonSchema(self.tag_typespec).to_arrow_schema( - self.data_context.semantic_type_registry - ) - packet_arrow_schema = schemas.PythonSchema( - self.packet_typespec - ).to_arrow_schema(self.data_context.semantic_type_registry) - + tag_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema(self.tag_typespec) + packet_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema(self.packet_typespec) + joined_data = [ {**tag, **packet} for tag, packet in zip(self.tags, self.packets) ] diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index c381ad4..55c8bfe 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, cast from orcapod.data.base import LabeledContentIdentifiableBase -from orcapod.data.context import DataContext +from orcapod import contexts from orcapod.data.datagrams import ( ArrowPacket, ArrowTag, @@ -15,7 +15,7 @@ ) from orcapod.data.system_constants import orcapod_constants as constants from orcapod.protocols import data_protocols as dp -from orcapod.types import TypeSpec, schemas +from orcapod.types import TypeSpec from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -87,7 +87,7 @@ def __init__( self, source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -100,7 +100,7 @@ def __init__( if data_context is None and source is not None: # if source is provided, use its data context data_context = source.data_context_key - self._data_context = DataContext.resolve_data_context(data_context) + self._data_context = contexts.resolve_context(data_context) @property def substream_identities(self) -> tuple[str, ...]: @@ -125,7 +125,7 @@ def get_substream(self, substream_id: str) -> dp.Stream: raise ValueError(f"Substream with ID {substream_id} not found.") @property - def data_context(self) -> DataContext: + def data_context(self) -> contexts.DataContext: """ Returns the data context for the stream. This is used to resolve semantic types and other context-specific information. 
@@ -318,16 +318,16 @@ def __init__( self._tag_schema = tag_schema self._packet_schema = packet_schema - self._tag_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema( - tag_schema, self._data_context.semantic_type_registry - ) - ) - self._packet_converter = SemanticConverter.from_semantic_schema( - schemas.SemanticSchema.from_arrow_schema( - packet_schema, self._data_context.semantic_type_registry - ) - ) + # self._tag_converter = SemanticConverter.from_semantic_schema( + # schemas.SemanticSchema.from_arrow_schema( + # tag_schema, self._data_context.semantic_type_registry + # ) + # ) + # self._packet_converter = SemanticConverter.from_semantic_schema( + # schemas.SemanticSchema.from_arrow_schema( + # packet_schema, self._data_context.semantic_type_registry + # ) + # ) self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None self._set_modified_time() # set modified time to now @@ -339,19 +339,16 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ return self._tag_columns, self._packet_columns - def types(self) -> tuple[schemas.PythonSchema, schemas.PythonSchema]: + def types(self) -> tuple[dict[str, type], dict[str, type]]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. """ # TODO: consider using MappingProxyType to avoid copying the dicts + converter = self._data_context.type_converter return ( - schemas.PythonSchema.from_arrow_schema( - self._tag_schema, converters=self._tag_converter.as_dict() - ), - schemas.PythonSchema.from_arrow_schema( - self._packet_schema, converters=self._packet_converter.as_dict() - ), + converter.arrow_schema_to_python_schema(self._tag_schema), + converter.arrow_schema_to_python_schema(self._packet_schema) ) def as_table( @@ -414,7 +411,6 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: if tag_present: tag = ArrowTag( tag_batch.slice(i, 1), # type: ignore - semantic_converter=self._tag_converter, data_context=self._data_context, ) @@ -429,7 +425,6 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: source_info=self._source_info_table.slice( i, 1 ).to_pylist()[0], - semantic_converter=self._packet_converter, data_context=self._data_context, ), ) diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index d7d1597..0b7ce33 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,6 +1,6 @@ +from orcapod import contexts from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.data.context import DataContext from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict from collections.abc import Generator, Collection @@ -240,10 +240,10 @@ class GraphTracker(AutoRegisteringContextBasedTracker): def __init__( self, tracker_manager: dp.TrackerManager | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, ) -> None: super().__init__(tracker_manager=tracker_manager) - self._data_context = DataContext.resolve_data_context(data_context) + self._data_context = contexts.resolve_context(data_context) # Dictionary to map kernels to the streams they have invoked # This is used to track the computational graph and the invocations of kernels diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 29fd1bb..f18e6f6 100644 --- 
a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -1,6 +1,6 @@ from orcapod.data.trackers import GraphTracker, Invocation from orcapod.pipeline.nodes import KernelNode, PodNode -from orcapod.data.context import DataContext +from orcapod import contexts from orcapod.protocols import data_protocols as dp from orcapod.protocols import store_protocols as sp from typing import Any @@ -24,7 +24,7 @@ def __init__( pipeline_store: sp.ArrowDataStore, results_store: sp.ArrowDataStore | None = None, tracker_manager: dp.TrackerManager | None = None, - data_context: str | DataContext | None = None, + data_context: str | contexts.DataContext | None = None, auto_compile: bool = True, ): super().__init__(tracker_manager=tracker_manager, data_context=data_context) diff --git a/src/orcapod/pipeline/legacy_nodes.py b/src/orcapod/pipeline/legacy_nodes.py deleted file mode 100644 index 9470c1e..0000000 --- a/src/orcapod/pipeline/legacy_nodes.py +++ /dev/null @@ -1,817 +0,0 @@ -from orcapod.core.pod import Pod, FunctionPod -from orcapod.core import SyncStream, Source, Kernel -from orcapod.core.streams import PolarsStream -from orcapod.core.streams import EmptyStream -from orcapod.stores import ArrowDataStore -from orcapod.types import Tag, Packet, PacketLike, TypeSpec, default_registry -from orcapod.types.legacy import packets -from orcapod.types.typespec_utils import union_typespecs -from orcapod.types.legacy.semantic_type_registry import SemanticTypeRegistry -from orcapod.types import schemas -from orcapod.hashing import ObjectHasher, ArrowHasher -from orcapod.hashing.defaults import get_default_object_hasher, get_default_arrow_hasher -from typing import Any, Literal -from collections.abc import Collection, Iterator -import polars as pl -from orcapod.core.streams import SyncStreamFromGenerator - -import logging - -logger = logging.getLogger(__name__) - - -def get_tag_typespec(tag: Tag) -> dict[str, type]: - return {k: str for k in tag} - - -class KernelInvocationWrapper(Kernel): - def __init__( - self, kernel: Kernel, input_streams: Collection[SyncStream], **kwargs - ) -> None: - super().__init__(**kwargs) - self.kernel = kernel - self.input_streams = list(input_streams) - - def __repr__(self) -> str: - return f"{self.__class__.__name__}<{self.kernel!r}>" - - def __str__(self) -> str: - return f"{self.__class__.__name__}<{self.kernel}>" - - def computed_label(self) -> str | None: - """ - Return the label of the wrapped kernel. - """ - return self.kernel.label - - def resolve_input_streams(self, *input_streams) -> Collection[SyncStream]: - if input_streams: - raise ValueError( - "Wrapped pod with specified streams cannot be invoked with additional streams" - ) - return self.input_streams - - def identity_structure(self, *streams: SyncStream) -> Any: - """ - Identity structure that includes the wrapped kernel's identity structure. 
- """ - resolved_streams = self.resolve_input_streams(*streams) - return self.kernel.identity_structure(*resolved_streams) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - resolved_streams = self.resolve_input_streams(*streams) - return self.kernel.keys(*resolved_streams, trigger_run=trigger_run) - - def types( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - resolved_streams = self.resolve_input_streams(*streams) - return self.kernel.types(*resolved_streams, trigger_run=trigger_run) - - def claims_unique_tags( - self, *streams: SyncStream, trigger_run: bool = False - ) -> bool | None: - resolved_streams = self.resolve_input_streams(*streams) - return self.kernel.claims_unique_tags( - *resolved_streams, trigger_run=trigger_run - ) - - def post_call(self, tag: Tag, packet: Packet) -> None: ... - - def output_iterator_completion_hook(self) -> None: ... - - -class CachedKernelWrapper(KernelInvocationWrapper, Source): - """ - A Kernel wrapper that wraps a kernel and stores the outputs of the kernel. - If the class is instantiated with input_streams that is not None, then this wrapper - will strictly represent the invocation of the wrapped Kernel on the given input streams. - Passing in an empty list into input_streams would still be registered as a specific invocation. - If input_streams is None, the class instance largely acts as a proxy of the underlying kernel - but will try to save all results. Note that depending on the storage type passed in, the saving - may error out if you invoke the instance on input streams with non-compatible schema (e.g., tags with - different keys). - """ - - def __init__( - self, - kernel: Kernel, - input_streams: Collection[SyncStream], - output_store: ArrowDataStore, - store_path_prefix: tuple[str, ...] 
= (), - kernel_hasher: ObjectHasher | None = None, - arrow_packet_hasher: ArrowHasher | None = None, - packet_type_registry: SemanticTypeRegistry | None = None, - **kwargs, - ) -> None: - super().__init__(kernel, input_streams, **kwargs) - - self.output_store = output_store - self.store_path_prefix = store_path_prefix - - # These are configurable but are not expected to be modified except for special circumstances - if kernel_hasher is None: - kernel_hasher = get_default_object_hasher() - self._kernel_hasher = kernel_hasher - if arrow_packet_hasher is None: - arrow_packet_hasher = get_default_arrow_hasher() - self._arrow_packet_hasher = arrow_packet_hasher - if packet_type_registry is None: - packet_type_registry = default_registry - self._packet_type_registry = packet_type_registry - - self.update_cached_values() - - self._cache_computed = False - - @property - def arrow_hasher(self): - return self._arrow_packet_hasher - - @property - def registry(self): - return self._packet_type_registry - - @property - def kernel_hasher(self) -> ObjectHasher: - if self._kernel_hasher is None: - return get_default_object_hasher() - return self._kernel_hasher - - @kernel_hasher.setter - def kernel_hasher(self, kernel_hasher: ObjectHasher | None = None): - if kernel_hasher is None: - kernel_hasher = get_default_object_hasher() - self._kernel_hasher = kernel_hasher - # hasher changed -- trigger recomputation of properties that depend on kernel hasher - self.update_cached_values() - - @property - def source_info(self) -> tuple[str, ...]: - """ - Returns a tuple of (label, kernel_hash) that uniquely identifies the source of the cached outputs. - This is used to store and retrieve the outputs from the output store. - """ - return self.label, self.kernel_hasher.hash_to_hex( - self.kernel, prefix_hasher_id=True - ) - - @property - def store_path(self) -> tuple[str, ...]: - """ - Returns the path prefix for the output store. - This is used to store and retrieve the outputs from the output store. - """ - return self.store_path_prefix + self.source_info - - def update_cached_values(self): - self.kernel_hash = self.kernel_hasher.hash_to_hex( - self.kernel, prefix_hasher_id=True - ) - tag_keys, packet_keys = self.keys(trigger_run=False) - self.tag_keys = tuple(tag_keys) if tag_keys is not None else None - self.packet_keys = tuple(packet_keys) if packet_keys is not None else None - - self.tag_typespec, self.packet_typespec = self.types(trigger_run=False) - if self.tag_typespec is None or self.packet_typespec is None: - raise ValueError( - "Currently, cached kernel wrapper can only work with kernels that have typespecs defined." - ) - # TODO: clean up and make it unnecessary to convert packet typespec - packet_schema = schemas.PythonSchema(self.packet_typespec) - joined_typespec = union_typespecs( - self.tag_typespec, packet_schema.with_source_info - ) - if joined_typespec is None: - raise ValueError( - "Joined typespec should not be None. " - "This may happen if the tag typespec and packet typespec are incompatible." 
- ) - # Add any additional fields to the output converter here - self.output_converter = packets.PacketConverter( - joined_typespec, registry=self.registry, include_source_info=False - ) - - def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: - if self._cache_computed: - logger.info(f"Returning cached outputs for {self}") - if (lazy_df := self.get_all_records_as_polars(flush=False)) is not None: - if self.tag_keys is None: - raise ValueError( - "CachedKernelWrapper has no tag keys defined, cannot return PolarsStream" - ) - return PolarsStream( - lazy_df.collect(), - tag_keys=self.tag_keys, - packet_keys=self.packet_keys, - ) - else: - return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.packet_keys) - - resolved_streams = self.resolve_input_streams(*streams) - output_stream = self.kernel.forward(*resolved_streams, **kwargs) - - # Cache the output stream of the underlying kernel - # If an entry with same tag and packet already exists in the output store, - # it will not be added again, thus avoiding duplicates. - def generator() -> Iterator[tuple[Tag, Packet]]: - logger.info(f"Computing and caching outputs for {self}") - for tag, packet in output_stream: - self.post_call(tag, packet) - yield tag, packet - self.output_iterator_completion_hook() - - logger.info(f"Results cached for {self}") - self._cache_computed = True - - return SyncStreamFromGenerator(generator) - - def post_call(self, tag: Tag, packet: Packet) -> None: - # Cache the output stream of the underlying kernel - # If an entry with same tag and packet already exists in the output store, - # it will not be added again, thus avoiding duplicates. - merged_info = {**tag, **packet.get_composite()} - output_table = self.output_converter.from_python_packet_to_arrow_table( - merged_info - ) - # TODO: revisit this logic - output_id = self.arrow_hasher.hash_table(output_table, prefix_hasher_id=True) - if not self.output_store.get_record(self.store_path, output_id, flush=False): - self.output_store.add_record( - self.store_path, - output_id, - output_table, - ) - - def output_iterator_completion_hook(self) -> None: - """ - Hook to be called when the generator is completed. 
- """ - logger.info(f"Results cached for {self}") - self._cache_computed = True - - def get_all_records_as_polars(self, flush: bool = True) -> pl.LazyFrame | None: - return self.output_store.get_all_records_as_polars(self.store_path, flush=flush) - - @property - def lazy_df(self) -> pl.LazyFrame | None: - lazydf = self.output_store.get_all_records_as_polars(self.store_path) - if lazydf is None: - return None - if self.tag_keys is None or self.packet_keys is None: - raise ValueError( - "CachedKernelWrapper has no tag keys or packet keys defined, and currently this is not supported" - ) - return lazydf.select(self.tag_keys + self.packet_keys) - - @property - def df(self) -> pl.DataFrame | None: - lazy_df = self.lazy_df - if lazy_df is None: - return None - return lazy_df.collect() - - def reset_cache(self): - self._cache_computed = False - - -class FunctionPodInvocationWrapper(KernelInvocationWrapper, Pod): - """ - Convenience class to wrap a function pod, providing default pass-through - implementations - """ - - def __init__( - self, function_pod: FunctionPod, input_streams: Collection[SyncStream], **kwargs - ): - # note that this would be an alias to the self.kernel but here explicitly taken as function_pod - # for better type hints - # MRO will be KernelInvocationWrapper -> Pod -> Kernel - super().__init__(function_pod, input_streams, **kwargs) - self.function_pod = function_pod - - def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: - resolved_streams = self.resolve_input_streams(*streams) - return super().forward(*resolved_streams, **kwargs) - - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: - return self.function_pod.call(tag, packet) - - # =============pass through methods/properties to the underlying function pod============= - - def set_active(self, active=True): - """ - Set the active state of the function pod. - """ - self.function_pod.set_active(active) - - def is_active(self) -> bool: - """ - Check if the function pod is active. - """ - return self.function_pod.is_active() - - -class CachedFunctionPodWrapper(FunctionPodInvocationWrapper, Source): - def __init__( - self, - function_pod: FunctionPod, - input_streams: Collection[SyncStream], - output_store: ArrowDataStore, - tag_store: ArrowDataStore | None = None, - label: str | None = None, - store_path_prefix: tuple[str, ...] = (), - output_store_path_prefix: tuple[str, ...] = (), - tag_store_path_prefix: tuple[str, ...] 
= (), - skip_memoization_lookup: bool = False, - skip_memoization: bool = False, - skip_tag_record: bool = False, - error_handling: Literal["raise", "ignore", "warn"] = "raise", - object_hasher: ObjectHasher | None = None, - arrow_hasher: ArrowHasher | None = None, - registry: SemanticTypeRegistry | None = None, - **kwargs, - ) -> None: - super().__init__( - function_pod, - input_streams, - label=label, - error_handling=error_handling, - **kwargs, - ) - self.output_store_path_prefix = store_path_prefix + output_store_path_prefix - self.tag_store_path_prefix = store_path_prefix + tag_store_path_prefix - - self.output_store = output_store - self.tag_store = tag_store - - self.skip_memoization_lookup = skip_memoization_lookup - self.skip_memoization = skip_memoization - self.skip_tag_record = skip_tag_record - - # These are configurable but are not expected to be modified except for special circumstances - # Here I'm assigning to the hidden properties directly to avoid triggering setters - if object_hasher is None: - object_hasher = get_default_object_hasher() - self._object_hasher = object_hasher - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self._arrow_hasher = arrow_hasher - if registry is None: - registry = default_registry - self._registry = registry - - # compute and cache properties and converters for efficiency - self.update_cached_values() - self._cache_computed = False - - @property - def tag_keys(self) -> tuple[str, ...]: - if self._tag_keys is None: - raise ValueError("Tag keys are not set, cannot return tag keys") - return self._tag_keys - - @property - def output_keys(self) -> tuple[str, ...]: - if self._output_keys is None: - raise ValueError("Output keys are not set, cannot return output keys") - return self._output_keys - - @property - def object_hasher(self) -> ObjectHasher: - return self._object_hasher - - @object_hasher.setter - def object_hasher(self, object_hasher: ObjectHasher | None = None): - if object_hasher is None: - object_hasher = get_default_object_hasher() - self._object_hasher = object_hasher - # hasher changed -- trigger recomputation of properties that depend on object hasher - self.update_cached_values() - - @property - def arrow_hasher(self) -> ArrowHasher: - return self._arrow_hasher - - @arrow_hasher.setter - def arrow_hasher(self, arrow_hasher: ArrowHasher | None = None): - if arrow_hasher is None: - arrow_hasher = get_default_arrow_hasher() - self._arrow_hasher = arrow_hasher - # hasher changed -- trigger recomputation of properties that depend on arrow hasher - self.update_cached_values() - - @property - def registry(self) -> SemanticTypeRegistry: - return self._registry - - @registry.setter - def registry(self, registry: SemanticTypeRegistry | None = None): - if registry is None: - registry = default_registry - self._registry = registry - # registry changed -- trigger recomputation of properties that depend on registry - self.update_cached_values() - - def update_cached_values(self) -> None: - self.function_pod_hash = self.object_hasher.hash_to_hex( - self.function_pod, prefix_hasher_id=True - ) - self.node_hash = self.object_hasher.hash_to_hex(self, prefix_hasher_id=True) - self.input_typespec, self.output_typespec = ( - self.function_pod.get_function_typespecs() - ) - tag_keys, output_keys = self.keys(trigger_run=False) - - if tag_keys is None or output_keys is None: - raise ValueError( - "Currently, cached function pod wrapper can only work with function pods that have keys defined." 
- ) - self._tag_keys = tuple(tag_keys) - self._output_keys = tuple(output_keys) - - self.tag_typespec, self.output_typespec = self.types(trigger_run=False) - if self.tag_typespec is None or self.output_typespec is None: - raise ValueError( - "Currently, cached function pod wrapper can only work with function pods that have typespecs defined." - ) - self.input_typespec, self.output_typespec = ( - self.function_pod.get_function_typespecs() - ) - - self.input_converter = packets.PacketConverter( - self.input_typespec, self.registry, include_source_info=False - ) - self.output_converter = packets.PacketConverter( - self.output_typespec, self.registry, include_source_info=True - ) - - input_packet_source_typespec = { - f"_source_info_{k}": str for k in self.input_typespec - } - - # prepare typespec for tag record: __packet_key, tag, input packet source_info, - tag_record_typespec = { - "__packet_key": str, - **self.tag_typespec, - **input_packet_source_typespec, - } - self.tag_record_converter = packets.PacketConverter( - tag_record_typespec, self.registry, include_source_info=False - ) - - def reset_cache(self): - self._cache_computed = False - - def generator_completion_hook(self, n_computed: int) -> None: - """ - Hook to be called when the generator is completed. - """ - logger.info(f"Results cached for {self}") - self._cache_computed = True - - def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: - if self._cache_computed: - logger.info(f"Returning cached outputs for {self}") - lazy_df = self.get_all_entries_with_tags(keep_hidden_fields=True) - if lazy_df is not None: - if self.tag_keys is None: - raise ValueError("Tag keys are not set, cannot return PolarsStream") - return PolarsStream( - lazy_df.collect(), self.tag_keys, packet_keys=self.output_keys - ) - else: - return EmptyStream(tag_keys=self.tag_keys, packet_keys=self.output_keys) - logger.info(f"Computing and caching outputs for {self}") - return super().forward(*streams, **kwargs) - - def get_packet_key(self, packet: Packet) -> str: - return self.arrow_hasher.hash_table( - self.input_converter.from_python_packet_to_arrow_table(packet), - prefix_hasher_id=True, - ) - - @property - def pod_source_info(self): - return self.function_pod.function_name, self.function_pod_hash - - @property - def node_source_info(self): - return self.label, self.node_hash - - @property - def output_store_path(self) -> tuple[str, ...]: - """ - Returns the path prefix for the output store. - This is used to store and retrieve the outputs from the output store. - """ - return self.output_store_path_prefix + self.pod_source_info - - @property - def tag_store_path(self) -> tuple[str, ...]: - """ - Returns the path prefix for the tag store. - This is used to store and retrieve the tags associated with memoized packets. - """ - return self.tag_store_path_prefix + self.node_source_info - - def is_memoized(self, packet: Packet) -> bool: - return self.retrieve_memoized(packet) is not None - - def add_pipeline_record(self, tag: Tag, packet: Packet) -> Tag: - """ - Record the tag for the packet in the record store. - This is used to keep track of the tags associated with memoized packets. 
- """ - return self._add_pipeline_record_with_packet_key( - tag, self.get_packet_key(packet), packet.source_info - ) - - def _add_pipeline_record_with_packet_key( - self, tag: Tag, packet_key: str, packet_source_info: dict[str, str | None] - ) -> Tag: - if self.tag_store is None: - raise ValueError("Recording of tag requires tag_store but none provided") - - combined_info = dict(tag) # ensure we don't modify the original tag - combined_info["__packet_key"] = packet_key - for k, v in packet_source_info.items(): - combined_info[f"_source_info_{k}"] = v - - table = self.tag_record_converter.from_python_packet_to_arrow_table( - combined_info - ) - - entry_hash = self.arrow_hasher.hash_table(table, prefix_hasher_id=True) - - # TODO: add error handling - # check if record already exists: - retrieved_table = self.tag_store.get_record( - self.tag_store_path, entry_hash, flush=False - ) - if retrieved_table is None: - self.tag_store.add_record(self.tag_store_path, entry_hash, table) - - return tag - - def retrieve_memoized(self, packet: Packet) -> Packet | None: - """ - Retrieve a memoized packet from the data store. - Returns None if no memoized packet is found. - """ - logger.debug("Retrieving memoized packet") - return self._retrieve_memoized_with_packet_key(self.get_packet_key(packet)) - - def _retrieve_memoized_with_packet_key(self, packet_key: str) -> Packet | None: - """ - Retrieve a memoized result packet from the data store, looking up by the packet key - Returns None if no memoized packet is found. - """ - logger.debug(f"Retrieving memoized packet with key {packet_key}") - arrow_table = self.output_store.get_record( - self.output_store_path, - packet_key, - flush=False, - ) - if arrow_table is None: - return None - packets = self.output_converter.from_arrow_table_to_python_packets(arrow_table) - # since memoizing single packet, it should only contain one packet - assert len(packets) == 1, ( - f"Memoizing single packet return {len(packets)} packets!" - ) - return packets[0] - - def memoize( - self, - packet: Packet, - output_packet: Packet, - ) -> Packet: - """ - Memoize the output packet in the data store. - Returns the memoized packet. - """ - logger.debug("Memoizing packet") - return self._memoize_with_packet_key( - self.get_packet_key(packet), output_packet.get_composite() - ) - - def _memoize_with_packet_key( - self, packet_key: str, output_packet: PacketLike - ) -> Packet: - """ - Memoize the output packet in the data store, looking up by packet key. - Returns the memoized packet. - """ - logger.debug(f"Memoizing packet with key {packet_key}") - # TODO: this logic goes through the entire store and retrieve cycle with two conversions - # consider simpler alternative - packets = self.output_converter.from_arrow_table_to_python_packets( - self.output_store.add_record( - self.output_store_path, - packet_key, - self.output_converter.from_python_packet_to_arrow_table(output_packet), - ) - ) - # since passed in a single packet, it should only return a single packet - assert len(packets) == 1, ( - f"Memoizing single packet returned {len(packets)} packets!" 
- ) - packet = packets[0] - # TODO: reconsider the right place to attach this information - # attach provenance information - return Packet(packet) - - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: - packet_key = "" - if ( - not self.skip_tag_record - or not self.skip_memoization_lookup - or not self.skip_memoization - ): - packet_key = self.get_packet_key(packet) - - output_packet = None - if not self.skip_memoization_lookup: - output_packet = self._retrieve_memoized_with_packet_key( - packet_key, - ) - if output_packet is not None: - logger.debug( - f"Memoized output for {packet} with {packet_key} found, skipping computation" - ) - else: - logger.debug( - f"Memoized output for packet {packet} with {packet_key} not found" - ) - - if output_packet is None: - # TODO: revisit the logic around active state and how to use it - tag, output_packet = self.function_pod.call(tag, packet) - if output_packet is not None and not self.skip_memoization: - # output packet may be modified by the memoization process - # e.g. if the output is a file, the path may be changed - # add source info to the output packet - source_info = { - k: "-".join(self.pod_source_info) + "-" + packet_key + ":" + str(k) - for k in output_packet.source_info - } - # TODO: fix and make this not access protected field directly - output_packet.source_info = source_info - output_packet = self._memoize_with_packet_key(packet_key, output_packet) # type: ignore - - if output_packet is None: - if self.is_active(): - logger.warning( - f"Function pod {self.function_pod.function_name} returned None for packet {packet} despite being active" - ) - return tag, None - - # result was successfully computed/retrieved -- save the tag - if not self.skip_tag_record and self.tag_store is not None: - self._add_pipeline_record_with_packet_key( - tag, packet_key, packet.source_info - ) - - return tag, output_packet - - def get_all_outputs(self) -> pl.LazyFrame | None: - return self.output_store.get_all_records_as_polars(self.output_store_path) - - def get_all_tags(self, with_packet_id: bool = False) -> pl.LazyFrame | None: - if self.tag_store is None: - raise ValueError("Tag store is not set, no tag record can be retrieved") - data = self.tag_store.get_all_records_as_polars(self.tag_store_path) - if not with_packet_id: - return data.drop("__packet_key") if data is not None else None - return data - - def get_all_entries_with_tags( - self, keep_hidden_fields: bool = False - ) -> pl.LazyFrame | None: - """ - Retrieve all entries from the tag store with their associated tags. - Returns a DataFrame with columns for tag and packet key. 
- """ - if self.tag_store is None: - raise ValueError("Tag store is not set, no tag record can be retrieved") - - tag_records = self.tag_store.get_all_records_as_polars(self.tag_store_path) - if tag_records is None: - return None - result_packets = self.output_store.get_records_by_ids_as_polars( - self.output_store_path, - tag_records.collect()["__packet_key"], - preserve_input_order=True, - ) - if result_packets is None: - return None - - pl_df = pl.concat([tag_records, result_packets], how="horizontal").drop( - ["__packet_key"] - ) - if not keep_hidden_fields: - pl_df = pl_df.select(self.tag_keys + self.output_keys) - return pl_df.lazy() - - @property - def df(self) -> pl.DataFrame | None: - lazy_df = self.lazy_df - if lazy_df is None: - return None - return lazy_df.collect() - - @property - def lazy_df(self) -> pl.LazyFrame | None: - return self.get_all_entries_with_tags() - - @property - def tags(self) -> pl.DataFrame | None: - data = self.get_all_tags() - if data is None: - return None - - return data.collect() - - @property - def outputs(self) -> pl.DataFrame | None: - """ - Retrieve all outputs from the result store as a DataFrame. - Returns None if no outputs are available. - """ - data = self.get_all_outputs() - if data is None: - return None - - return data.collect() - - -class DummyFunctionPod(Pod): - def __init__(self, function_name="dummy", **kwargs): - super().__init__(**kwargs) - self.function_name = function_name - - def set_active(self, active: bool = True): - # no-op - pass - - def is_active(self) -> bool: - return False - - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: - raise NotImplementedError( - "DummyFunctionPod cannot be called, it is only used to access previously stored tags and outputs." - ) - - -# TODO: Create this instead using compositional pattern -class DummyCachedFunctionPod(CachedFunctionPodWrapper): - """ - Dummy for a cached function pod. This is convenient to just allow the user to access - previously stored function pod tags and outputs without requiring instantiating the identical - function used for computation. - - Consequently, this function pod CANNOT be used to compute and insert new entries into the storage. - """ - - def __init__(self, source_pod: CachedFunctionPodWrapper): - self._pod_source_info = source_pod.pod_source_info - self._node_source_info = source_pod.node_source_info - self.output_store = source_pod.output_store - self.tag_store = source_pod.tag_store - self.function_pod = DummyFunctionPod(source_pod.function_pod.function_name) - - @property - def pod_source_info(self) -> tuple[str, str]: - return self._pod_source_info - - @property - def node_source_info(self) -> tuple[str, str]: - return self._node_source_info - - -class Node(KernelInvocationWrapper, Source): - def __init__(self, kernel: Kernel, input_nodes: Collection["Node"], **kwargs): - """ - Create a node that wraps a kernel and provides a Node interface. - This is useful for creating nodes in a pipeline that can be executed. - """ - return super().__init__(kernel, input_nodes, **kwargs) - - def reset_cache(self) -> None: ... - - -class KernelNode(CachedKernelWrapper, Node): - """ - A node that wraps a Kernel and provides a Node interface. - This is useful for creating nodes in a pipeline that can be executed. - """ - - -class FunctionPodNode(CachedFunctionPodWrapper, Node): - """ - A node that wraps a FunctionPod and provides a Node interface. - This is useful for creating nodes in a pipeline that can be executed. 
- """ diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 35c6875..e38248f 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,4 +1,4 @@ -from orcapod.data.kernels import KernelStream, WrappedKernel, TrackedKernelBase +from orcapod.data.kernels import KernelStream, WrappedKernel from orcapod.data.sources import SourceBase from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index e16491d..b8a4e0d 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -322,45 +322,44 @@ def as_table( """ ... - # TODO: add this back - # def as_arrow_compatible_dict( - # self, - # include_all_info: bool = False, - # include_meta_columns: bool | Collection[str] = False, - # include_context: bool = False, - # ) -> dict[str, Any]: - # """ - # Return dictionary with values optimized for Arrow table conversion. + def as_arrow_compatible_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, Any]: + """ + Return dictionary with values optimized for Arrow table conversion. - # This method returns a dictionary where values are in a form that can be - # efficiently converted to Arrow format using pa.Table.from_pylist(). + This method returns a dictionary where values are in a form that can be + efficiently converted to Arrow format using pa.Table.from_pylist(). - # The key insight is that this avoids the expensive as_table() → concat pattern - # by providing values that are "Arrow-ready" while remaining in dict format - # for efficient batching. + The key insight is that this avoids the expensive as_table() → concat pattern + by providing values that are "Arrow-ready" while remaining in dict format + for efficient batching. - # Implementation note: This may involve format conversions (e.g., Path objects - # to strings, datetime objects to ISO strings, etc.) to ensure compatibility - # with Arrow's expected input formats. + Implementation note: This may involve format conversions (e.g., Path objects + to strings, datetime objects to ISO strings, etc.) to ensure compatibility + with Arrow's expected input formats. - # Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied - # with arrow_schema(...) with the same argument options to ensure that the schema matches the table. + Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied + with arrow_schema(...) with the same argument options to ensure that the schema matches the table. - # Args: - # include_all_info: Include all available information - # include_meta_columns: Controls meta column inclusion - # include_context: Whether to include context key + Args: + include_all_info: Include all available information + include_meta_columns: Controls meta column inclusion + include_context: Whether to include context key - # Returns: - # Dictionary with values optimized for Arrow conversion + Returns: + Dictionary with values optimized for Arrow conversion - # Example: - # # Efficient batch conversion pattern - # arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams] - # schema = datagrams[0].arrow_schema() - # table = pa.Table.from_pylist(arrow_dicts, schema=schema) - # """ - # ... 
+        Example:
+            # Efficient batch conversion pattern
+            arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams]
+            schema = datagrams[0].arrow_schema()
+            table = pa.Table.from_pylist(arrow_dicts, schema=schema)
+        """
+        ...

     # 5. Meta Column Operations
     def get_meta_value(self, key: str, default: DataValue = None) -> DataValue:
diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py
index 677fa5c..27c1e78 100644
--- a/src/orcapod/protocols/hashing_protocols.py
+++ b/src/orcapod/protocols/hashing_protocols.py
@@ -52,9 +52,13 @@ class ObjectHasher(Protocol):
     """Protocol for general object hashing."""

     # TODO: consider more explicitly stating types of objects accepted
-    def hash(self, obj: Any) -> bytes:
+    def hash(self, obj: Any, compressed: bool = False) -> bytes:
         """
-        Hash an object to a byte representation.
+        Hash an object to a byte representation. An object hasher must be
+        able to handle ContentIdentifiable objects and hash them based on their
+        identity structure. If compressed=True, the content-identifiable object
+        is immediately replaced with its compressed string identity and used in the
+        computation of the containing identity structure.

         Args:
             obj (Any): The object to hash.
@@ -71,10 +75,16 @@ def get_hasher_id(self) -> str: ...

     def hash_to_hex(
-        self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False
+        self,
+        obj: Any,
+        char_count: int | None = None,
+        compressed: bool = False,
+        prefix_hasher_id: bool = True,
     ) -> str: ...

-    def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int:
+    def hash_to_int(
+        self, obj: Any, hexdigits: int = 16, compressed: bool = False
+    ) -> int:
         """
         Hash an object to an integer.

@@ -88,7 +98,10 @@ def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int:
         ...

     def hash_to_uuid(
-        self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID
+        self,
+        obj: Any,
+        namespace: uuid.UUID = uuid.NAMESPACE_OID,
+        compressed: bool = False,
     ) -> uuid.UUID: ...


diff --git a/src/orcapod/protocols/semantic_protocols.py b/src/orcapod/protocols/semantic_protocols.py
index 5458cad..9a19b8d 100644
--- a/src/orcapod/protocols/semantic_protocols.py
+++ b/src/orcapod/protocols/semantic_protocols.py
@@ -1,38 +1,35 @@
-from typing import Protocol, Any
+from typing import Protocol, Any, TYPE_CHECKING

+if TYPE_CHECKING:
+    import pyarrow as pa

-class TypeHandler(Protocol):
-    """Protocol for handling conversion between Python type and Arrow
-    data types used for storage.

-    The handler itself IS the definition of a semantic type. The semantic type
-    name/identifier is provided by the registerer when registering the handler.
-
-    TypeHandlers should clearly communicate what Python types they can handle,
-    and focus purely on conversion logic.
-    """
+# Core protocols
+class SemanticStructConverter(Protocol):
+    """Protocol for converting between Python objects and semantic structs."""

+    @property
     def python_type(self) -> type:
-        """Return the Python type(s) this handler can process.
+        """The Python type this converter can handle."""
+        ...

-        Returns:
-            Python type the handler supports
+    @property
+    def arrow_struct_type(self) -> "pa.StructType":
+        """The Arrow struct type this converter produces."""
+        ...

-        Examples:
-            - PathHandler: return Path
-            - NumericHandler: return (int, float)
-            - CollectionHandler: return (list, tuple, set)
-        """
+    def python_to_struct_dict(self, value: Any) -> dict[str, Any]:
+        """Convert Python value to struct dictionary."""
         ...
- def storage_type(self) -> type: - """Return the Arrow DataType instance for schema definition.""" + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + """Convert struct dictionary back to Python value.""" ... - def python_to_storage(self, value: Any) -> Any: - """Convert Python value to Arrow-compatible storage representation.""" + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type.""" ... - def storage_to_python(self, value: Any) -> Any: - """Convert storage representation back to Python object.""" + def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: + """Check if this converter can handle the given struct type.""" ... diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/store_protocols.py index 2f11b53..3933bf7 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/store_protocols.py @@ -59,3 +59,30 @@ def get_records_with_column_value( def flush(self) -> None: """Flush any buffered writes to the underlying storage.""" ... + + +class MetadataCapable(Protocol): + def set_metadata( + self, + record_path: tuple[str, ...], + metadata: Mapping[str, Any], + merge: bool = True, + ) -> None: ... + + def get_metadata( + self, + record_path: tuple[str, ...], + ) -> Mapping[str, Any]: ... + + def get_supported_metadata_schema(self) -> Mapping[str, type]: ... + + def validate_metadata( + self, + metadata: Mapping[str, Any], + ) -> Collection[str]: ... + + +class ArrowDataStoreWithMetadata(ArrowDataStore, MetadataCapable, Protocol): + """A protocol that combines ArrowDataStore with metadata capabilities.""" + + pass diff --git a/src/orcapod/semantic_types/__init__.py b/src/orcapod/semantic_types/__init__.py index e69de29..31c664e 100644 --- a/src/orcapod/semantic_types/__init__.py +++ b/src/orcapod/semantic_types/__init__.py @@ -0,0 +1,7 @@ +from .semantic_registry import SemanticTypeRegistry +from .universal_converter import UniversalTypeConverter + +__all__ = [ + "SemanticTypeRegistry", + "UniversalTypeConverter", +] diff --git a/src/orcapod/semantic_types/precomputed_converters.py b/src/orcapod/semantic_types/precomputed_converters.py new file mode 100644 index 0000000..0fdedb8 --- /dev/null +++ b/src/orcapod/semantic_types/precomputed_converters.py @@ -0,0 +1,147 @@ +""" +Pre-computed column converters for efficient semantic type conversion. + +This module provides a way to pre-analyze schemas and create optimized +conversion functions for each column, eliminating runtime schema parsing +and type detection overhead. 
+""" + +from typing import Any, TYPE_CHECKING +from collections.abc import Callable +import pyarrow as pa +from orcapod.utils.lazy_module import LazyModule + + +from orcapod.semantic_types.universal_converter import UniversalTypeConverter + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class SemanticSchemaConverter: + def __init__( + self, + universal_converter: UniversalTypeConverter, + python_schema: dict[str, type] | None = None, + arrow_schema: pa.Schema | None = None, + ): + """ + self.universal_converter = universal_converter + """ + self.universal_converter = universal_converter + + if python_schema is not None: + self.python_schema = python_schema + self.arrow_schema = self.universal_converter.python_schema_to_arrow_schema( + python_schema + ) + elif arrow_schema is not None: + self.arrow_schema = arrow_schema + self.python_schema = self.universal_converter.arrow_schema_to_python_schema( + arrow_schema + ) + else: + raise ValueError( + "Either python_schema or arrow_schema must be provided to initialize SemanticSchemaConverter." + ) + + # Pre-compute converters for each field + self._python_to_arrow_converters: dict[str, Callable[[Any], Any]] = {} + self._arrow_to_python_converters: dict[str, Callable[[Any], Any]] = {} + + for field_name, python_type in self.python_schema.items(): + self._python_to_arrow_converters[field_name] = ( + self.universal_converter.get_python_to_arrow_converter(python_type) + ) + + for field in self.arrow_schema: + self._arrow_to_python_converters[field.name] = ( + self.universal_converter.get_arrow_to_python_converter(field.type) + ) + + def python_dict_to_struct_dict(self, record: dict[str, Any]) -> dict[str, Any]: + """ + Convert a single Python dictionary to an Arrow-compatible struct dictionary. + """ + if not record: + raise ValueError("Cannot convert empty record") + + return self.python_dicts_to_struct_dicts([record])[0] + + def python_dicts_to_struct_dicts( + self, data: list[dict[str, Any]] + ) -> list[dict[str, Any]]: + """ + Convert a list of Python dictionaries to a list of Arrow-compatible struct dictionaries. + """ + if not data: + raise ValueError("Cannot convert empty data list") + + converted_data = [] + for record in data: + converted_record = {} + for field_name, converter in self._python_to_arrow_converters.items(): + # TODO: test the case of None/missing value + value = record.get(field_name) + if value is None: + converted_record[field_name] = None + else: + # Convert using the pre-computed converter + converted_record[field_name] = converter(value) + converted_data.append(converted_record) + + return converted_data + + def struct_dicts_to_python_dicts( + self, struct_dicts: list[dict[str, Any]] + ) -> list[dict[str, Any]]: + """ + Convert a list of Arrow-compatible struct dictionaries back to Python dictionaries. 
+ """ + if not struct_dicts: + raise ValueError("Cannot convert empty struct dicts list") + + # TODO: benchmark which approach of conversion would be faster + # 1) turn pylist to pydict and then convert each column at once + # 2) convert each row separately + converted_data = [] + for struct_dict in struct_dicts: + converted_record = {} + for field_name, converter in self._arrow_to_python_converters.items(): + value = struct_dict.get(field_name) + if value is None: + converted_record[field_name] = None + else: + # Convert using the pre-computed converter + converted_record[field_name] = converter(value) + converted_data.append(converted_record) + + return converted_data + + def struct_dict_to_python_dict(self, struct_dict: dict[str, Any]) -> dict[str, Any]: + """ + Convert a single Arrow-compatible struct dictionary back to a Python dictionary. + """ + if not struct_dict: + raise ValueError("Cannot convert empty struct dict") + + return self.struct_dicts_to_python_dicts([struct_dict])[0] + + def python_dicts_to_arrow_table(self, data: list[dict[str, Any]]) -> pa.Table: + """ + Convert a list of Python dictionaries to an Arrow table using pre-computed converters. + """ + struct_dicts = self.python_dicts_to_struct_dicts(data) + return pa.Table.from_pylist(struct_dicts, schema=self.arrow_schema) + + def arrow_table_to_python_dicts(self, table: pa.Table) -> list[dict[str, Any]]: + """ + Convert an Arrow table to a list of Python dictionaries using pre-computed converters. + """ + if not table: + raise ValueError("Cannot convert empty table") + + struct_dicts = table.to_pylist() + return self.struct_dicts_to_python_dicts(struct_dicts) diff --git a/src/orcapod/semantic_types/schemas.py b/src/orcapod/semantic_types/schemas.py deleted file mode 100644 index 57f0551..0000000 --- a/src/orcapod/semantic_types/schemas.py +++ /dev/null @@ -1,357 +0,0 @@ -from typing import Self -from orcapod.types.core import DataType, TypeSpec -from orcapod.types.semantic_types import ( - SemanticType, - SemanticTypeRegistry, - PythonArrowConverter, -) -import pyarrow as pa -import datetime - -# This mapping is expected to be stable -# Be sure to test this assumption holds true -DEFAULT_ARROW_TYPE_LUT = { - int: pa.int64(), - float: pa.float64(), - str: pa.large_string(), - bool: pa.bool_(), -} - - -def python_to_arrow_type(python_type: type) -> pa.DataType: - if python_type in DEFAULT_ARROW_TYPE_LUT: - return DEFAULT_ARROW_TYPE_LUT[python_type] - raise TypeError(f"Converstion of python type {python_type} is not supported yet") - - -def arrow_to_python_type(arrow_type: pa.DataType) -> type: - if pa.types.is_integer(arrow_type): - return int - elif pa.types.is_floating(arrow_type): - return float - elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): - return str - elif pa.types.is_boolean(arrow_type): - return bool - elif pa.types.is_date(arrow_type): - return datetime.date - elif pa.types.is_timestamp(arrow_type): - return datetime.datetime - elif pa.types.is_binary(arrow_type): - return bytes - else: - raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") - - -class PythonSchema(dict[str, DataType]): - """ - A schema for Python data types, mapping string keys to Python types. - - This is used to define the expected structure of data packets in OrcaPod. - - Attributes - ---------- - keys : str - The keys of the schema. - values : type - The types corresponding to each key. 
- - Examples - -------- - >>> schema = PythonSchema(name=str, age=int) - >>> print(schema) - {'name': , 'age': } - """ - - def copy(self) -> "PythonSchema": - return PythonSchema(self) - - def to_semantic_schema( - self, semantic_type_registry: SemanticTypeRegistry - ) -> "SemanticSchema": - """ - Convert the Python schema to a semantic schema using the provided semantic type registry. - - Parameters - ---------- - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - - Examples - -------- - >>> python_schema = PythonSchema(name=str, age=int) - >>> semantic_schema = python_schema.to_semantic_schema(registry) - >>> print(semantic_schema) - {'name': (str, None), 'age': (int, None)} - """ - return SemanticSchema.from_typespec(self, semantic_type_registry) - - def to_arrow_schema( - self, - semantic_type_registry: SemanticTypeRegistry | None = None, - converters: dict[str, PythonArrowConverter] | None = None, - ) -> pa.Schema: - """ - Convert the Python schema to an Arrow schema. - If converters are provided, they are used to convert the schema. Note that - no validation is performed on the converters, so they must be compatible with the schema. - """ - if converters is not None: - # If converters are provided, use them to convert the schema - fields = [] - for field_name, python_type in self.items(): - if field_name in converters: - converter = converters[field_name] - arrow_type = converter.arrow_type - metadata = None - if converter.semantic_type_name is not None: - metadata = { - b"semantic_type": converter.semantic_type_name.encode( - "utf-8" - ) - } - else: - arrow_type = python_to_arrow_type(python_type) - metadata = None - fields.append(pa.field(field_name, arrow_type, metadata=metadata)) - return pa.schema(fields) - - if semantic_type_registry is None: - raise ValueError( - "semantic_type_registry must be provided if converters are not" - ) - # Otherwise, convert using the semantic type registry - return self.to_semantic_schema(semantic_type_registry).to_arrow_schema() - - @classmethod - def from_semantic_schema(cls, semantic_schema: "SemanticSchema") -> Self: - """ - Create a PythonSchema from a SemanticSchema. - - Parameters - ---------- - semantic_schema : SemanticSchema - The semantic schema to convert. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. - """ - return cls(semantic_schema.get_python_types()) - - @classmethod - def from_arrow_schema( - cls, - arrow_schema: pa.Schema, - semantic_type_registry: SemanticTypeRegistry | None = None, - converters: dict[str, PythonArrowConverter] | None = None, - ) -> Self: - """ - Create a PythonSchema from an Arrow schema. - - Parameters - ---------- - arrow_schema : pa.Schema - The Arrow schema to convert. - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - skip_system_columns : bool, optional - Whether to skip system columns (default is True). - converters : dict[str, PythonArrowConverter], optional - A dictionary of converters to use for converting the schema. If provided, the schema will be - converted using the converters. If not provided, the schema will be converted using the semantic type - registry. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. 
- """ - if converters is not None: - # If converters are provided, use them to convert the schema - python_types = {} - for field in arrow_schema: - # TODO: consider performing validation of semantic type - if field.name in converters: - converter = converters[field.name] - python_types[field.name] = converter.python_type - else: - python_types[field.name] = arrow_to_python_type(field.type) - return cls(python_types) - - if semantic_type_registry is None: - raise ValueError( - "semantic_type_registry must be provided if converters are not" - ) - semantic_schema = SemanticSchema.from_arrow_schema( - arrow_schema, - semantic_type_registry, - ) - return cls(semantic_schema.get_python_types()) - - -class SemanticSchema(dict[str, type | SemanticType]): - """ - A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. - - This is used to define the expected structure of data packets with semantic types in OrcaPod. - - Attributes - ---------- - keys : str - The keys of the schema. - values : type | SemanticType - Either type for simple fields or SemanticType for semantic fields. - - Examples - -------- - >>> schema = SemanticSchema(image=SemanticType('path'), age=int) - >>> print(schema) - {"image": SemanticType(name='path'), "age": })} - """ - - def get_semantic_fields(self) -> dict[str, SemanticType]: - """ - Get a dictionary of semantic fields in the schema. - - Returns - ------- - dict[str, SemanticType] - A dictionary mapping keys to their corresponding SemanticType. - """ - return {k: v for k, v in self.items() if isinstance(v, SemanticType)} - - def get_python_types(self) -> dict[str, type]: - """ - Get the Python types for all keys in the schema. - - Returns - ------- - dict[str, type] - A dictionary mapping keys to their corresponding Python types. - """ - return { - k: v.get_default_python_type() if isinstance(v, SemanticType) else v - for k, v in self.items() - } - - def get_arrow_types(self) -> dict[str, tuple[pa.DataType, str | None]]: - """ - Get the Arrow types for all keys in the schema. - - Returns - ------- - dict[str, tuple[pa.DataType, str|None]] - A dictionary mapping keys to tuples of Arrow types. If the field has a semantic type, - the second element of the tuple is the semantic type name; otherwise, it is None. - """ - return { - k: (v.get_default_arrow_type(), v.name) - if isinstance(v, SemanticType) - else (python_to_arrow_type(v), None) - for k, v in self.items() - } - - def to_arrow_schema(self) -> pa.Schema: - """ - Get the Arrow schema, which is a PythonSchema representation of the semantic schema. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. - """ - fields = [] - for k, (arrow_type, semantic_type_name) in self.get_arrow_types().items(): - if semantic_type_name is not None: - field = pa.field( - k, - arrow_type, - metadata={b"semantic_type": semantic_type_name.encode("utf-8")}, - ) - else: - field = pa.field(k, arrow_type) - fields.append(field) - - return pa.schema(fields) - - def to_python_schema(self) -> PythonSchema: - """ - Get the Python schema, which is a PythonSchema representation of the semantic schema. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. - """ - return PythonSchema.from_semantic_schema(self) - - @classmethod - def from_arrow_schema( - cls, - arrow_schema: pa.Schema, - semantic_type_registry: SemanticTypeRegistry, - ) -> Self: - """ - Create a SemanticSchema from an Arrow schema. 
- - Parameters - ---------- - arrow_schema : pa.Schema - The Arrow schema to convert. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - """ - - semantic_schema = {} - for field in arrow_schema: - field_type = None - if field.metadata is not None: - semantic_type_name = field.metadata.get(b"semantic_type", b"").decode() - if semantic_type_name: - semantic_type = semantic_type_registry.get_semantic_type( - semantic_type_name - ) - if semantic_type is None: - raise ValueError( - f"Semantic type '{semantic_type_name}' not found in registry" - ) - if not semantic_type.supports_arrow_type(field.type): - raise ValueError( - f"Semantic type '{semantic_type.name}' does not support Arrow field of type '{field.type}'" - ) - field_type = semantic_type - - if ( - field_type is None - ): # was not set to semantic type, so fallback to simple conversion - field_type = arrow_to_python_type(field.type) - - semantic_schema[field.name] = field_type - return cls(semantic_schema) - - @classmethod - def from_typespec( - cls, - typespec: TypeSpec, - semantic_type_registry: SemanticTypeRegistry, - ) -> Self: - semantic_schema = {} - for key, python_type in typespec.items(): - semantic_type = semantic_type_registry.get_semantic_type_for_python_type( - python_type - ) - if semantic_type is not None: - semantic_schema[key] = semantic_type - else: - semantic_schema[key] = python_type - return cls(semantic_schema) diff --git a/src/orcapod/semantic_types/semantic_registry.py b/src/orcapod/semantic_types/semantic_registry.py new file mode 100644 index 0000000..28f16e2 --- /dev/null +++ b/src/orcapod/semantic_types/semantic_registry.py @@ -0,0 +1,318 @@ +from typing import Any, TYPE_CHECKING +from collections.abc import Collection +from orcapod.protocols.semantic_protocols import SemanticStructConverter +from orcapod.utils.lazy_module import LazyModule + + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class SemanticTypeRegistry: + """ + Registry that manages semantic type converters using struct signature recognition. + + This registry maps Python types to PyArrow struct signatures, enabling + automatic detection and conversion of semantic types based on their + struct schema alone. + """ + + def __init__(self, converters: Collection[SemanticStructConverter] | None = None): + # Bidirectional mappings between Python types and struct signatures + self._python_to_struct: dict[type, "pa.StructType"] = {} + self._struct_to_python: dict["pa.StructType", type] = {} + self._struct_to_converter: dict["pa.StructType", SemanticStructConverter] = {} + + # Name mapping for convenience + self._name_to_converter: dict[str, SemanticStructConverter] = {} + self._struct_to_name: dict["pa.StructType", str] = {} + + if converters: + for converter in converters: + self.register_converter(converter) + + def register_converter( + self, converter: SemanticStructConverter, semantic_name: str | None = None + ) -> None: + """ + Register a semantic type converter. + + This creates bidirectional mappings between: + - Python type ↔ Arrow struct signature + - Arrow struct signature ↔ converter instance + + Optionally, a semantic type name can be provided. 
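A hedged usage sketch of register_converter, pairing it with the PathStructConverter added later in this patch; the semantic name "path" and the sample schema are assumptions, and find_semantic_fields_in_schema is defined further below in this class:

from pathlib import Path
import pyarrow as pa

from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
from orcapod.semantic_types.semantic_struct_converters import PathStructConverter

registry = SemanticTypeRegistry()
registry.register_converter(PathStructConverter(), semantic_name="path")

assert registry.has_python_type(Path)
assert registry.has_semantic_type("path")

# Struct-typed fields are recognized purely by their struct signature.
schema = pa.schema(
    [
        pa.field("run", pa.large_string()),
        pa.field("data_file", PathStructConverter().arrow_struct_type),
    ]
)
assert registry.find_semantic_fields_in_schema(schema) == {"data_file": "path"}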
+ """ + python_type = converter.python_type + struct_signature = converter.arrow_struct_type + + # Check for conflicts + if python_type in self._python_to_struct: + existing_struct = self._python_to_struct[python_type] + if existing_struct != struct_signature: + raise ValueError( + f"Python type {python_type} already registered with different struct signature. " + f"Existing: {existing_struct}, New: {struct_signature}" + ) + + if struct_signature in self._struct_to_python: + existing_python = self._struct_to_python[struct_signature] + if existing_python != python_type: + raise ValueError( + f"Struct signature {struct_signature} already registered with different Python type. " + f"Existing: {existing_python}, New: {python_type}" + ) + + if semantic_name in self._name_to_converter: + existing = self._name_to_converter[semantic_name] + if existing != converter: + raise ValueError( + f"Semantic type name '{semantic_name}' already registered" + ) + + # Register bidirectional mappings + self._python_to_struct[python_type] = struct_signature + self._struct_to_python[struct_signature] = python_type + self._struct_to_converter[struct_signature] = converter + if semantic_name is not None: + self._name_to_converter[semantic_name] = converter + self._struct_to_name[struct_signature] = semantic_name + + def get_converter_for_python_type( + self, python_type: type + ) -> SemanticStructConverter | None: + """Get converter for a Python type.""" + # Direct lookup first + struct_signature = self._python_to_struct.get(python_type) + if struct_signature: + return self._struct_to_converter[struct_signature] + + # Handle subclass relationships - add safety check + for registered_type, struct_signature in self._python_to_struct.items(): + try: + if ( + isinstance(registered_type, type) + and isinstance(python_type, type) + and issubclass(python_type, registered_type) + ): + return self._struct_to_converter[struct_signature] + except TypeError: + # Handle cases where issubclass fails (e.g., with generic types) + continue + + return None + + def get_converter_for_semantic_type( + self, semantic_type_name: str + ) -> SemanticStructConverter | None: + """Get converter by semantic type name.""" + return self._name_to_converter.get(semantic_type_name) + + def get_converter_for_struct_signature( + self, struct_signature: "pa.StructType" + ) -> SemanticStructConverter | None: + """ + Get converter for an Arrow struct signature. + + This is the core method for struct signature recognition. + """ + return self._struct_to_converter.get(struct_signature) + + def get_python_type_for_semantic_struct_signature( + self, struct_signature: "pa.StructType" + ) -> type | None: + """ + Get Python type for an Arrow struct signature. + + This enables automatic type inference from struct schemas. 
+ """ + return self._struct_to_python.get(struct_signature) + + def get_semantic_struct_signature_for_python_type( + self, python_type: type + ) -> "pa.StructType | None": + """Get Arrow struct signature for a Python type.""" + return self._python_to_struct.get(python_type) + + def is_semantic_struct_signature(self, struct_signature: "pa.StructType") -> bool: + """Check if a struct signature represents a semantic type.""" + return struct_signature in self._struct_to_python + + def has_python_type(self, python_type: type) -> bool: + """Check if a Python type is registered.""" + return python_type in self._python_to_struct + + def has_semantic_type(self, semantic_type_name: str) -> bool: + """Check if a semantic type name is registered.""" + return semantic_type_name in self._name_to_converter + + def list_semantic_types(self) -> list[str]: + """Get all registered semantic type names.""" + return list(self._name_to_converter.keys()) + + def list_python_types(self) -> list[type]: + """Get all registered Python types.""" + return list(self._python_to_struct.keys()) + + def list_struct_signatures(self) -> list["pa.StructType"]: + """Get all registered struct signatures.""" + return list(self._struct_to_python.keys()) + + def find_semantic_fields_in_schema(self, schema: "pa.Schema") -> dict[str, str]: + """ + Find all semantic type fields in a schema by struct signature recognition. + + Args: + schema: PyArrow schema to examine + + Returns: + Dictionary mapping field names to semantic type names + + Example: + schema with fields: + - name: string + - file_path: struct + - location: struct + + Returns: {"file_path": "path", "location": "geolocation"} + """ + semantic_fields = {} + for field in schema: + if pa.types.is_struct(field.type) and field.type in self._struct_to_name: + semantic_fields[field.name] = self._struct_to_name[field.type] + return semantic_fields + + def get_semantic_field_info(self, schema: "pa.Schema") -> dict[str, dict[str, Any]]: + """ + Get detailed information about semantic fields in a schema. + + Returns: + Dictionary with field names as keys and info dictionaries as values. + Each info dict contains: semantic_type, python_type, struct_signature + """ + semantic_info = {} + for field in schema: + if pa.types.is_struct(field.type): + converter = self.get_converter_for_struct_signature(field.type) + if converter: + semantic_info[field.name] = { + "python_type": converter.python_type, + "struct_signature": field.type, + "converter": converter, + } + return semantic_info + + def validate_struct_signature( + self, struct_signature: "pa.StructType", expected_python_type: type + ) -> bool: + """ + Validate that a struct signature matches the expected Python type. 
+ + Args: + struct_signature: Arrow struct type to validate + expected_python_type: Expected Python type + + Returns: + True if the struct signature is registered for the Python type + """ + registered_type = self.get_python_type_for_semantic_struct_signature( + struct_signature + ) + return registered_type == expected_python_type + + +# # Conversion utilities using struct signature recognition +# class SemanticStructConverter: +# """Main converter class for working with semantic structs using signature recognition.""" + +# def __init__(self, registry: SemanticTypeRegistry): +# self.registry = registry + +# def python_to_struct_dict(self, value: Any) -> dict[str, Any] | None: +# """Convert Python value to struct dict if it's a semantic type.""" +# converter = self.registry.get_converter_for_python_type(type(value)) +# if converter: +# return converter.python_to_struct_dict(value) +# return None + +# def struct_dict_to_python( +# self, struct_dict: dict[str, Any], struct_signature: "pa.StructType" +# ) -> Any: +# """ +# Convert struct dict back to Python value using struct signature recognition. + +# Args: +# struct_dict: Dictionary representation of the struct +# struct_signature: PyArrow struct type signature + +# Returns: +# Python object corresponding to the semantic type +# """ +# converter = self.registry.get_converter_for_struct_signature(struct_signature) +# if not converter: +# raise ValueError( +# f"No converter found for struct signature: {struct_signature}" +# ) + +# return converter.struct_dict_to_python(struct_dict) + +# def is_semantic_struct_dict( +# self, struct_dict: dict[str, Any], struct_signature: "pa.StructType" +# ) -> bool: +# # FIXME: inconsistent implementation -- should check the passed in struct_dict +# """Check if a dict represents a semantic struct based on signature.""" +# return self.registry.is_semantic_struct_signature(struct_signature) + +# def get_semantic_type_from_struct_signature( +# self, struct_signature: "pa.StructType" +# ) -> str | None: +# """Extract semantic type name from struct signature.""" +# converter = self.registry.get_converter_for_struct_signature(struct_signature) +# return converter.semantic_type_name if converter else None + +# def python_to_arrow_array(self, values: list[Any]) -> "pa.Array": +# """Convert list of Python values to Arrow array of structs.""" +# if not values: +# raise ValueError("Cannot convert empty list") + +# # Check if first value is a semantic type +# first_converter = self.registry.get_converter_for_python_type(type(values[0])) +# if not first_converter: +# raise ValueError(f"No semantic type converter for {type(values[0])}") + +# # Convert all values to struct dicts +# struct_dicts = [] +# for value in values: +# converter = self.registry.get_converter_for_python_type(type(value)) +# if converter is None or converter != first_converter: +# raise ValueError("All values must be the same semantic type") +# struct_dicts.append(converter.python_to_struct_dict(value)) + +# # Create Arrow array with the registered struct signature +# return pa.array(struct_dicts, type=first_converter.arrow_struct_type) + +# # Create Arrow array with the registered struct signature +# return pa.array(struct_dicts, type=first_converter.arrow_struct_type) + +# def arrow_array_to_python(self, array: "pa.Array") -> list[Any]: +# """Convert Arrow struct array back to list of Python values.""" +# if not pa.types.is_struct(array.type): +# raise ValueError(f"Expected struct array, got {array.type}") + +# converter = 
self.registry.get_converter_for_struct_signature(array.type) +# if not converter: +# raise ValueError(f"No converter found for struct signature: {array.type}") + +# # Convert each struct to Python value +# python_values = [] +# for i in range(len(array)): +# struct_scalar = array[i] +# if struct_scalar.is_valid: +# struct_dict = struct_scalar.as_py() +# python_values.append(converter.struct_dict_to_python(struct_dict)) +# else: +# python_values.append(None) + +# return python_values diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py new file mode 100644 index 0000000..e675b61 --- /dev/null +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -0,0 +1,79 @@ +""" +Struct-based semantic type system for OrcaPod. + +This replaces the metadata-based approach with explicit struct fields, +making semantic types visible in schemas and preserved through operations. +""" + +from typing import Any, TYPE_CHECKING +from pathlib import Path +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +# Path-specific implementation +class PathStructConverter: + """Converter for pathlib.Path objects to/from semantic structs.""" + + def __init__(self): + self._python_type = Path + + # Define the Arrow struct type for paths + self._arrow_struct_type = pa.struct( + [ + pa.field("path", pa.large_string()), + ] + ) + + @property + def python_type(self) -> type: + return self._python_type + + @property + def arrow_struct_type(self) -> pa.StructType: + return self._arrow_struct_type + + def python_to_struct_dict(self, value: Path) -> dict[str, Any]: + """Convert Path to struct dictionary.""" + if not isinstance(value, Path): + raise TypeError(f"Expected Path, got {type(value)}") + + return { + "path": str(value), + } + + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Path: + """Convert struct dictionary back to Path.""" + path_str = struct_dict.get("path") + if path_str is None: + raise ValueError("Missing 'path' field in struct") + + return Path(path_str) + + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type.""" + return issubclass(python_type, Path) + + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + """Check if this converter can handle the given struct type.""" + # Check if struct has the expected fields + field_names = [field.name for field in struct_type] + expected_fields = {"path"} + + if set(field_names) != expected_fields: + return False + + # Check field types + field_types = {field.name: field.type for field in struct_type} + + return field_types["path"] == pa.large_string() + + def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: + """Check if a struct dictionary represents this semantic type.""" + return set(struct_dict.keys()) == {"path"} and isinstance( + struct_dict["path"], str + ) diff --git a/src/orcapod/semantic_types/struct_converters.py b/src/orcapod/semantic_types/struct_converters.py deleted file mode 100644 index b5ab182..0000000 --- a/src/orcapod/semantic_types/struct_converters.py +++ /dev/null @@ -1,307 +0,0 @@ -""" -Struct-based semantic type system for OrcaPod. - -This replaces the metadata-based approach with explicit struct fields, -making semantic types visible in schemas and preserved through operations. 
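A minimal round trip through the PathStructConverter introduced above (paths shown as POSIX strings; the exact string form is platform dependent):

from pathlib import Path

from orcapod.semantic_types.semantic_struct_converters import PathStructConverter

conv = PathStructConverter()
struct_dict = conv.python_to_struct_dict(Path("/data/input.csv"))
# struct_dict == {"path": "/data/input.csv"} on POSIX systems
assert conv.struct_dict_to_python(struct_dict) == Path("/data/input.csv")
assert conv.can_handle_struct_type(conv.arrow_struct_type)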
-""" - -from typing import Any, Protocol -from pathlib import Path -import pyarrow as pa -from collections.abc import Collection - - -# Core protocols -class StructConverter(Protocol): - """Protocol for converting between Python objects and semantic structs.""" - - @property - def semantic_type_name(self) -> str: - """The semantic type name this converter handles.""" - ... - - @property - def python_type(self) -> type: - """The Python type this converter can handle.""" - ... - - @property - def arrow_struct_type(self) -> pa.StructType: - """The Arrow struct type this converter produces.""" - ... - - def python_to_struct_dict(self, value: Any) -> dict[str, Any]: - """Convert Python value to struct dictionary.""" - ... - - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: - """Convert struct dictionary back to Python value.""" - ... - - def can_handle_python_type(self, python_type: type) -> bool: - """Check if this converter can handle the given Python type.""" - ... - - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: - """Check if this converter can handle the given struct type.""" - ... - - -# Path-specific implementation -class PathStructConverter: - """Converter for pathlib.Path objects to/from semantic structs.""" - - def __init__(self): - self._semantic_type_name = "path" - self._python_type = Path - - # Define the Arrow struct type for paths - self._arrow_struct_type = pa.struct( - [ - pa.field("semantic_type", pa.string()), - pa.field("path", pa.large_string()), - ] - ) - - @property - def semantic_type_name(self) -> str: - return self._semantic_type_name - - @property - def python_type(self) -> type: - return self._python_type - - @property - def arrow_struct_type(self) -> pa.StructType: - return self._arrow_struct_type - - def python_to_struct_dict(self, value: Path) -> dict[str, Any]: - """Convert Path to struct dictionary.""" - if not isinstance(value, Path): - raise TypeError(f"Expected Path, got {type(value)}") - - return { - "semantic_type": self._semantic_type_name, - "path": str(value), - } - - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Path: - """Convert struct dictionary back to Path.""" - if struct_dict.get("semantic_type") != self._semantic_type_name: - raise ValueError( - f"Expected semantic_type '{self._semantic_type_name}', " - f"got '{struct_dict.get('semantic_type')}'" - ) - - path_str = struct_dict.get("path") - if path_str is None: - raise ValueError("Missing 'path' field in struct") - - return Path(path_str) - - def can_handle_python_type(self, python_type: type) -> bool: - """Check if this converter can handle the given Python type.""" - return issubclass(python_type, Path) - - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: - """Check if this converter can handle the given struct type.""" - # Check if struct has the expected fields - field_names = [field.name for field in struct_type] - expected_fields = {"semantic_type", "path"} - - if set(field_names) != expected_fields: - return False - - # Check field types - field_types = {field.name: field.type for field in struct_type} - - return ( - field_types["semantic_type"] == pa.string() - and field_types["path"] == pa.large_string() - ) - - def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: - """Check if a struct dictionary represents this semantic type.""" - return struct_dict.get("semantic_type") == self._semantic_type_name - - -# Registry for managing semantic type converters -class SemanticTypeRegistry: - 
"""Registry that manages struct-based semantic type converters.""" - - def __init__(self, converters: Collection[StructConverter] | None = None): - self._python_to_converter: dict[type, StructConverter] = {} - self._name_to_converter: dict[str, StructConverter] = {} - self._struct_type_to_converter: dict[pa.StructType, StructConverter] = {} - - if converters: - for converter in converters: - self.register_converter(converter) - - def register_converter(self, converter: StructConverter) -> None: - """Register a semantic type converter.""" - # Register by Python type - python_type = converter.python_type - if python_type in self._python_to_converter: - existing = self._python_to_converter[python_type] - raise ValueError( - f"Python type {python_type} already registered with converter " - f"for semantic type '{existing.semantic_type_name}'" - ) - self._python_to_converter[python_type] = converter - - # Register by semantic type name - name = converter.semantic_type_name - if name in self._name_to_converter: - raise ValueError(f"Semantic type '{name}' already registered") - self._name_to_converter[name] = converter - - # Register by struct type - struct_type = converter.arrow_struct_type - self._struct_type_to_converter[struct_type] = converter - - def get_converter_for_python_type( - self, python_type: type - ) -> StructConverter | None: - """Get converter for a Python type.""" - # Direct lookup first - converter = self._python_to_converter.get(python_type) - if converter: - return converter - - # Check for subclass relationships - for registered_type, converter in self._python_to_converter.items(): - if issubclass(python_type, registered_type): - return converter - - return None - - def get_converter_for_semantic_type( - self, semantic_type_name: str - ) -> StructConverter | None: - """Get converter by semantic type name.""" - return self._name_to_converter.get(semantic_type_name) - - def get_converter_for_struct_type( - self, struct_type: pa.StructType - ) -> StructConverter | None: - """Get converter for an Arrow struct type.""" - # Direct lookup first - converter = self._struct_type_to_converter.get(struct_type) - if converter: - return converter - - # Check if any converter can handle this struct type - for converter in self._name_to_converter.values(): - if converter.can_handle_struct_type(struct_type): - return converter - - return None - - def is_semantic_struct_type(self, struct_type: pa.StructType) -> bool: - """Check if a struct type represents a semantic type.""" - return self.get_converter_for_struct_type(struct_type) is not None - - def has_python_type(self, python_type: type) -> bool: - """Check if a Python type is registered.""" - return self.get_converter_for_python_type(python_type) is not None - - def has_semantic_type(self, semantic_type_name: str) -> bool: - """Check if a semantic type is registered.""" - return semantic_type_name in self._name_to_converter - - def list_semantic_types(self) -> list[str]: - """Get all registered semantic type names.""" - return list(self._name_to_converter.keys()) - - def list_python_types(self) -> list[type]: - """Get all registered Python types.""" - return list(self._python_to_converter.keys()) - - -# Conversion utilities -class SemanticStructConverter: - """Main converter class for working with semantic structs.""" - - def __init__(self, registry: SemanticTypeRegistry): - self.registry = registry - - def python_to_struct_dict(self, value: Any) -> dict[str, Any] | None: - """Convert Python value to struct dict if it's a semantic 
type.""" - converter = self.registry.get_converter_for_python_type(type(value)) - if converter: - return converter.python_to_struct_dict(value) - return None - - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: - """Convert struct dict back to Python value.""" - semantic_type = struct_dict.get("semantic_type") - if not semantic_type: - raise ValueError("Struct dict missing 'semantic_type' field") - - converter = self.registry.get_converter_for_semantic_type(semantic_type) - if not converter: - raise ValueError(f"No converter found for semantic type '{semantic_type}'") - - return converter.struct_dict_to_python(struct_dict) - - def is_semantic_struct_dict(self, struct_dict: dict[str, Any]) -> bool: - """Check if a dict represents a semantic struct.""" - semantic_type = struct_dict.get("semantic_type") - if not semantic_type: - return False - return self.registry.has_semantic_type(semantic_type) - - def python_to_arrow_array(self, values: list[Any]) -> pa.Array: - """Convert list of Python values to Arrow array of structs.""" - if not values: - raise ValueError("Cannot convert empty list") - - # Check if first value is a semantic type - first_converter = self.registry.get_converter_for_python_type(type(values[0])) - if not first_converter: - raise ValueError(f"No semantic type converter for {type(values[0])}") - - # Convert all values to struct dicts - struct_dicts = [] - for value in values: - converter = self.registry.get_converter_for_python_type(type(value)) - if converter is None or converter != first_converter: - raise ValueError("All values must be the same semantic type") - struct_dicts.append(converter.python_to_struct_dict(value)) - - # Create Arrow array - return pa.array(struct_dicts, type=first_converter.arrow_struct_type) - - def arrow_array_to_python(self, array: pa.Array) -> list[Any]: - """Convert Arrow struct array back to list of Python values.""" - if not pa.types.is_struct(array.type): - raise ValueError(f"Expected struct array, got {array.type}") - - converter = self.registry.get_converter_for_struct_type(array.type) - if not converter: - raise ValueError(f"No converter found for struct type {array.type}") - - # Convert each struct to Python value - python_values = [] - for i in range(len(array)): - struct_scalar = array[i] - if struct_scalar.is_valid: - struct_dict = struct_scalar.as_py() - python_values.append(converter.struct_dict_to_python(struct_dict)) - else: - python_values.append(None) - - return python_values - - -# Default registry with Path support -def create_default_registry() -> SemanticTypeRegistry: - """Create default registry with built-in semantic types.""" - registry = SemanticTypeRegistry() - registry.register_converter(PathStructConverter()) - return registry - - -# Global default registry -DEFAULT_REGISTRY = create_default_registry() diff --git a/src/orcapod/semantic_types/type_inference.py b/src/orcapod/semantic_types/type_inference.py new file mode 100644 index 0000000..e8519f4 --- /dev/null +++ b/src/orcapod/semantic_types/type_inference.py @@ -0,0 +1,250 @@ +from typing import Any, Union, Optional, get_origin, get_args + + +def infer_schema_from_pylist_data( + data: list[dict], default_type=str +) -> dict[str, type]: + """ + Infer schema from sample data (best effort). + + Args: + data: List of sample dictionaries + default_type: Default type to use for fields with no values + + Returns: + Dictionary mapping field names to inferred Python types + + Note: This is best-effort inference and may not handle all edge cases. 
+ For production use, explicit schemas are recommended. + """ + if not data: + return {} + + schema = {} + + # Get all possible field names + all_fields = [] + for record in data: + all_fields.extend(record.keys()) + + all_fields = list(dict.fromkeys(all_fields)) # Remove duplicates + + # Infer type for each field + for field_name in all_fields: + # Get all values for this field (including None) + all_field_values = [ + record.get(field_name) for record in data if field_name in record + ] + + # Separate None and non-None values + non_none_values = [v for v in all_field_values if v is not None] + # check if there is at least one None value + has_none = len(non_none_values) < len(all_field_values) + + if not non_none_values: + # Handle case where all values are None + schema[field_name] = default_type | None + continue + + # Infer type from non-None values + inferred_type = _infer_type_from_values(non_none_values) + + if inferred_type is None: + schema[field_name] = default_type | None + elif has_none: + # Wrap with Optional if None values present + schema[field_name] = inferred_type | None if inferred_type != Any else Any + else: + schema[field_name] = inferred_type + + return schema + + +def _infer_type_from_values(values: list) -> type | None: + """Infer type from a list of non-None values.""" + if not values: + return None + + # Get types of all values + value_types = {type(v) for v in values} + + if len(value_types) == 1: + # All values have same type + value_type = next(iter(value_types)) + return _infer_container_type(value_type, values) + else: + # Mixed types - handle common cases + return _handle_mixed_types(value_types, values) + + +def _infer_container_type(value_type: type, values: list) -> type: + """Infer container type with element types.""" + if value_type is list: + return _infer_list_type(values) + elif value_type is tuple: + return _infer_tuple_type(values) + elif value_type in {set, frozenset}: + return _infer_set_type(values, value_type) + elif value_type is dict: + return _infer_dict_type(values) + else: + return value_type + + +def _infer_list_type(lists: list[list]) -> type: + """Infer list element type.""" + all_elements = [] + for lst in lists: + all_elements.extend(lst) + + if not all_elements: + return list[Any] + + element_type = _infer_type_from_values(all_elements) + return list[element_type] + + +def _infer_tuple_type(tuples: list[tuple]) -> type: + """Infer tuple element types.""" + if not tuples: + return tuple[Any, ...] + + # Check if all tuples have same length + lengths = {len(t) for t in tuples} + + if len(lengths) == 1: + # Fixed-length tuples - infer type for each position + tuple_length = next(iter(lengths)) + if tuple_length == 0: + return tuple[()] + + position_types = [] + for i in range(tuple_length): + position_values = [t[i] for t in tuples if len(t) > i] + position_type = _infer_type_from_values(position_values) + position_types.append(position_type) + + # Always use fixed-length notation for same-length tuples + return tuple[tuple(position_types)] + else: + # Variable-length tuples - infer common element type + all_elements = [] + for t in tuples: + all_elements.extend(t) + + if not all_elements: + return tuple[Any, ...] + + element_type = _infer_type_from_values(all_elements) + return tuple[element_type, ...] 
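Given the helpers above, a small sample should infer roughly the following, a best-effort sketch that mirrors the function's own fixed-length tuple and Optional handling:

from orcapod.semantic_types.type_inference import infer_schema_from_pylist_data

samples = [
    {"id": 1, "coords": (10.5, 20.3), "note": "ok"},
    {"id": 2, "coords": (15.2, 25.7), "note": None},
]
schema = infer_schema_from_pylist_data(samples)
# Expected (best effort):
#   {"id": int, "coords": tuple[float, float], "note": str | None}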
+ + +def _infer_set_type(sets: list, set_type: type) -> type: + """Infer set element type.""" + all_elements = [] + for s in sets: + all_elements.extend(s) + + if not all_elements: + return set_type[Any] + + element_type = _infer_type_from_values(all_elements) + return set_type[element_type] + + +def _infer_dict_type(dicts: list[dict]) -> type: + """Infer dictionary key and value types.""" + all_keys = [] + all_values = [] + + for d in dicts: + all_keys.extend(d.keys()) + all_values.extend(d.values()) + + if not all_keys or not all_values: + return dict[Any, Any] + + key_type = _infer_type_from_values(all_keys) + value_type = _infer_type_from_values(all_values) + + return dict[key_type, value_type] + + +def _handle_mixed_types(value_types: set, values: list) -> type: + """Handle mixed types by creating appropriate Union types.""" + + # Handle common int/float mixing + if value_types == {int, float}: + return Union[int, float] + + # Handle numeric types with broader compatibility + numeric_types = {int, float, complex} + if value_types.issubset(numeric_types): + if complex in value_types: + return Union[int, float, complex] + else: + return Union[int, float] + + # For small number of types, create Union + if len(value_types) <= 4: # Arbitrary limit to avoid huge unions + sorted_types = sorted(value_types, key=lambda t: t.__name__) + return Union[tuple(sorted_types)] + + # Too many types, fall back to Any + return Any + + +# Example usage and test function +def test_schema_inference(): + """Test the improved schema inference function.""" + + test_data = [ + { + "id": 1, + "name": "Alice", + "scores": [85, 92, 78], + "coordinates": (10.5, 20.3), + "tags": {"python", "data"}, + "metadata": {"created": "2023-01-01", "version": 1}, + "optional_field": "present", + }, + { + "id": 2, + "name": "Bob", + "scores": [88, 91], + "coordinates": (15.2, 25.7), + "tags": {"java", "backend"}, + "metadata": {"created": "2023-01-02", "version": 2}, + "optional_field": None, + }, + { + "id": 3, + "name": "Charlie", + "scores": [95, 87, 89, 92], + "coordinates": (5.1, 30.9), + "tags": {"javascript", "frontend"}, + "metadata": {"created": "2023-01-03", "version": 1}, + "mixed_field": 42, + }, + { + "id": 4, + "name": "Diana", + "scores": [], + "coordinates": (0.0, 0.0), + "tags": set(), + "metadata": {}, + "mixed_field": "text", + }, + ] + + schema = infer_schema_from_pylist_data(test_data) + + print("Inferred Schema:") + for field, field_type in sorted(schema.items()): + print(f" {field}: {field_type}") + + return schema + + +if __name__ == "__main__": + test_schema_inference() diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index 872d267..8362e8e 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -9,16 +9,77 @@ 5. 
Integrates seamlessly with semantic type registries """ -from typing import TypedDict, Dict, Type, Any, Callable, Tuple, Optional, get_type_hints +import types +from typing import TypedDict, Any +from collections.abc import Callable import pyarrow as pa -from functools import lru_cache import hashlib +import logging +from orcapod.contexts import DataContext, resolve_context from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry +from orcapod.semantic_types.type_inference import infer_schema_from_pylist_data # Handle generic types from typing import get_origin, get_args import typing +logger = logging.getLogger(__name__) + + +# Basic type mapping for Python -> Arrow conversion +_PYTHON_TO_ARROW_MAP = { + # Python built-ins + int: pa.int64(), + float: pa.float64(), + str: pa.large_string(), # Use large_string by default for Polars compatibility + bool: pa.bool_(), + bytes: pa.large_binary(), # Use large_binary by default for Polars compatibility + # String representations (for when we get type names as strings) + "int": pa.int64(), + "float": pa.float64(), + "str": pa.large_string(), + "bool": pa.bool_(), + "bytes": pa.large_binary(), + # Specific integer types + "int8": pa.int8(), + "int16": pa.int16(), + "int32": pa.int32(), + "int64": pa.int64(), + "uint8": pa.uint8(), + "uint16": pa.uint16(), + "uint32": pa.uint32(), + "uint64": pa.uint64(), + # Specific float types + "float32": pa.float32(), + "float64": pa.float64(), + # Date/time types + "date": pa.date32(), + "datetime": pa.timestamp("us"), + "timestamp": pa.timestamp("us"), +} + +# Add numpy types if available +try: + import numpy as np + + _PYTHON_TO_ARROW_MAP.update( + { + np.int8: pa.int8(), + np.int16: pa.int16(), + np.int32: pa.int32(), + np.int64: pa.int64(), + np.uint8: pa.uint8(), + np.uint16: pa.uint16(), + np.uint32: pa.uint32(), + np.uint64: pa.uint64(), + np.float32: pa.float32(), + np.float64: pa.float64(), + np.bool_: pa.bool_(), + } + ) +except ImportError: + pass + class UniversalTypeConverter: """ @@ -65,7 +126,37 @@ def python_type_to_arrow_type(self, python_type: type) -> pa.DataType: return arrow_type - def arrow_type_to_python_type(self, arrow_type: pa.DataType) -> Type: + def python_schema_to_arrow_schema( + self, python_schema: dict[str, type] + ) -> pa.Schema: + """ + Convert a Python schema (dict of field names to types) to an Arrow schema. + + This uses the main conversion logic and caches results for performance. + """ + fields = [] + for field_name, python_type in python_schema.items(): + arrow_type = self.python_type_to_arrow_type(python_type) + fields.append(pa.field(field_name, arrow_type)) + + return pa.schema(fields) + + def infer_python_schema_from_data(self, python_data: Any) -> type: + """ + Infer Python schema from data, returning a TypedDict type. + + This is useful for dynamic data structures where the schema is not known in advance. + """ + if not isinstance(python_data, dict): + raise ValueError("Expected a dictionary to infer schema") + + field_specs = {} + for key, value in python_data.items(): + field_specs[key] = type(value) + + return TypedDict("DynamicSchema", field_specs) # type: ignore[call-arg] + + def arrow_type_to_python_type(self, arrow_type: pa.DataType) -> type: """ Convert Arrow type to Python type hint with caching. 
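A short sketch of the default primitive mapping in _PYTHON_TO_ARROW_MAP as it flows through python_schema_to_arrow_schema, built here with an empty registry since the constructor defaults are not shown in this hunk:

from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
from orcapod.semantic_types.universal_converter import UniversalTypeConverter

converter = UniversalTypeConverter(SemanticTypeRegistry())
arrow_schema = converter.python_schema_to_arrow_schema(
    {"id": int, "name": str, "raw": bytes}
)
# Expected field types: id -> int64, name -> large_string, raw -> large_binary
# (large variants are chosen above for Polars compatibility)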
@@ -82,7 +173,79 @@ def arrow_type_to_python_type(self, arrow_type: pa.DataType) -> Type: return python_type - def get_python_to_arrow_converter(self, python_type: Type) -> Callable[[Any], Any]: + def arrow_schema_to_python_schema(self, arrow_schema: pa.Schema) -> dict[str, type]: + """ + Convert an Arrow schema to a Python schema (dict of field names to types). + + This uses the main conversion logic and caches results for performance. + """ + python_schema = {} + for field in arrow_schema: + python_type = self.arrow_type_to_python_type(field.type) + python_schema[field.name] = python_type + + return python_schema + + def python_dicts_to_arrow_table( + self, + python_dicts: list[dict[str, Any]], + python_schema: dict[str, type] | None = None, + ) -> pa.Table: + """ + Convert a list of Python dictionaries to an Arrow table. + + This uses the main conversion logic and caches results for performance. + """ + if python_schema is None: + python_schema = infer_schema_from_pylist_data(python_dicts) + + converters = { + field_name: self.get_python_to_arrow_converter(python_type) + for field_name, python_type in python_schema.items() + } + + converted_data = [] + for record in python_dicts: + converted_record = {} + for field_name, converter in converters.items(): + if field_name in record: + converted_record[field_name] = converter(record[field_name]) + else: + converted_record[field_name] = None + converted_data.append(converted_record) + + # Convert to Arrow schema + arrow_schema = self.python_schema_to_arrow_schema(python_schema) + + return pa.Table.from_pylist(converted_data, schema=arrow_schema) + + def arrow_table_to_python_dicts( + self, arrow_table: pa.Table + ) -> list[dict[str, Any]]: + """ + Convert an Arrow table to a list of Python dictionaries. + + This uses the main conversion logic and caches results for performance. + """ + # Prepare converters for each field + converters = { + field.name: self.get_arrow_to_python_converter(field.type) + for field in arrow_table.schema + } + + python_dicts = [] + for row in arrow_table.to_pylist(): + python_dict = {} + for field_name, value in row.items(): + if value is not None: + python_dict[field_name] = converters[field_name](value) + else: + python_dict[field_name] = None + python_dicts.append(python_dict) + + return python_dicts + + def get_python_to_arrow_converter(self, python_type: type) -> Callable[[Any], Any]: """ Get cached conversion function for Python value → Arrow value. 
@@ -116,20 +279,11 @@ def get_arrow_to_python_converter( return converter - def _convert_python_to_arrow(self, python_type: Type) -> pa.DataType: + def _convert_python_to_arrow(self, python_type: type) -> pa.DataType: """Core Python → Arrow type conversion logic.""" - # Handle basic types - basic_type_map = { - int: pa.int64(), - float: pa.float64(), - str: pa.large_string(), - bool: pa.bool_(), - bytes: pa.large_binary(), - } - - if python_type in basic_type_map: - return basic_type_map[python_type] + if python_type in _PYTHON_TO_ARROW_MAP: + return _PYTHON_TO_ARROW_MAP[python_type] # Check semantic registry for registered types if self.semantic_registry: @@ -151,8 +305,8 @@ def _convert_python_to_arrow(self, python_type: Type) -> pa.DataType: # Handle string type names if hasattr(python_type, "__name__"): type_name = python_type.__name__ - if type_name in basic_type_map: - return basic_type_map[type_name] + if type_name in _PYTHON_TO_ARROW_MAP: + return _PYTHON_TO_ARROW_MAP[type_name] raise ValueError(f"Unsupported Python type: {python_type}") # Handle list types @@ -193,7 +347,7 @@ def _convert_python_to_arrow(self, python_type: Type) -> pa.DataType: return pa.large_list(key_value_struct) # Handle Union/Optional types - elif origin is typing.Union: + elif origin is typing.Union or origin is types.UnionType: if len(args) == 2 and type(None) in args: # Optional[T] → just T (nullability handled at field level) non_none_type = args[0] if args[1] is type(None) else args[1] @@ -233,14 +387,36 @@ def _convert_arrow_to_python(self, arrow_type: pa.DataType) -> type | Any: elif pa.types.is_struct(arrow_type): # Check if it's a registered semantic type first if self.semantic_registry: - python_type = ( - self.semantic_registry.get_python_type_for_struct_signature( - arrow_type - ) + python_type = self.semantic_registry.get_python_type_for_semantic_struct_signature( + arrow_type ) if python_type: return python_type + # Check if it is heterogeneous tuple + if len(arrow_type) > 0 and all( + field.name.startswith("f") and field.name[1:].isdigit() + for field in arrow_type + ): + # This is likely a heterogeneous tuple, extract digits and ensure it + # is continuous + field_digits = [int(field.name[1:]) for field in arrow_type] + if field_digits == list(range(len(field_digits))): + return tuple[ + tuple( + self.arrow_type_to_python_type( + arrow_type.field(f"f{pos}").type + ) + for pos in range(len(arrow_type)) + ) + ] + else: + # Non-continuous field names, treat as dynamic TypedDict + logger.info( + "Detected heterogeneous tuple with non-continuous field names, " + "treating as dynamic TypedDict" + ) + # Create dynamic TypedDict for unregistered struct # TODO: add check for heterogeneous tuple checking each field starts with f return self._get_or_create_typeddict_for_struct(arrow_type) @@ -303,7 +479,7 @@ def _convert_arrow_to_python(self, arrow_type: pa.DataType) -> type | Any: # Default case for unsupported types return Any - def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> Type: + def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> type: """Get or create a TypedDict class for an Arrow struct type.""" # Check cache first @@ -321,7 +497,7 @@ def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> Typ type_name = self._generate_unique_type_name(field_specs) # Create TypedDict dynamically - typeddict_class = TypedDict(type_name, field_specs) + typeddict_class = TypedDict(type_name, field_specs) # type: ignore[call-arg] # Cache 
the mapping self._struct_signature_to_typeddict[struct_type] = typeddict_class @@ -329,7 +505,7 @@ def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> Typ return typeddict_class - def _generate_unique_type_name(self, field_specs: Dict[str, Type]) -> str: + def _generate_unique_type_name(self, field_specs: dict[str, type]) -> str: """Generate a unique name for TypedDict based on field specifications.""" # Create deterministic signature that includes both names and types @@ -439,7 +615,9 @@ def _create_arrow_to_python_converter( # Check for semantic type first if self.semantic_registry and pa.types.is_struct(arrow_type): registered_python_type = ( - self.semantic_registry.get_python_type_for_struct_signature(arrow_type) + self.semantic_registry.get_python_type_for_semantic_struct_signature( + arrow_type + ) ) if registered_python_type: converter = self.semantic_registry.get_converter_for_python_type( @@ -509,8 +687,21 @@ def _create_arrow_to_python_converter( else [] ) - # Handle struct types (TypedDict) + # Handle struct types - heterogeneous tuple or dynamic TypedDict elif pa.types.is_struct(arrow_type): + # if python_type + if python_type is tuple or get_origin(python_type) is tuple: + n = len(get_args(python_type)) + # prepare list of converters + converters = [ + self.get_arrow_to_python_converter(arrow_type.field(f"f{i}").type) + for i in range(n) + ] + # this is a heterogeneous tuple + return lambda value: tuple( + converter(value[f"f{i}"]) for i, converter in enumerate(converters) + ) + # Create converters for each field field_converters = {} for field in arrow_type: @@ -562,176 +753,137 @@ def get_cache_stats(self) -> dict[str, int]: } -# Convenience functions that use a global instance -_global_converter: UniversalTypeConverter | None = None - - -def prepare_arrow_table_to_python_dicts_converter( - schema: pa.Schema, semantic_registry: SemanticTypeRegistry | None = None -) -> Callable[[pa.Table], list[dict]]: - """ - Prepare a converter function that converts an Arrow Table to a list of Python dicts. - - This uses the global UniversalTypeConverter instance to handle type conversions. - """ - - # TODO: - converter = get_global_converter(semantic_registry) - - # construct the converter lookup table to be used as closure - converter_lut: dict[str, Callable[[Any], Any]] = {} - for field in schema: - python_type = converter.arrow_type_to_python_type(field.type) - python_to_arrow = converter.get_python_to_arrow_converter(python_type) - converter_lut[field.name] = python_to_arrow - - def schema_specific_converter(table: pa.Table) -> list[dict]: - result = [] - for row in table.to_pylist(): - converted_row = {k: converter_lut[k](v) for k, v in row.items()} - result.append(converted_row) - return result - - return schema_specific_converter - - -def get_global_converter( - semantic_registry: SemanticTypeRegistry | None = None, -) -> UniversalTypeConverter: - """Get or create the global type converter instance.""" - global _global_converter - if ( - _global_converter is None - or _global_converter.semantic_registry != semantic_registry - ): - _global_converter = UniversalTypeConverter(semantic_registry) - return _global_converter +# def infer_schema_from_pylist_data(data: list[dict]) -> dict[str, type]: +# """ +# Infer schema from sample data (best effort). + +# Args: +# data: List of sample dictionaries + +# Returns: +# Dictionary mapping field names to inferred Python types + +# Note: This is best-effort inference and may not handle all edge cases. 
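A sketch of the heterogeneous-tuple recovery shown above: a struct whose fields are named f0, f1, ... with contiguous indices is read back as a fixed-length tuple. This assumes primitive leaf values pass through their converters unchanged:

import pyarrow as pa

from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
from orcapod.semantic_types.universal_converter import UniversalTypeConverter

converter = UniversalTypeConverter(SemanticTypeRegistry())
t = pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])

# Struct signature f0, f1 maps back to a fixed-length tuple type...
assert converter.arrow_type_to_python_type(t) == tuple[int, str]

# ...and the cached converter rebuilds tuple values from struct dicts.
to_python = converter.get_arrow_to_python_converter(t)
assert to_python({"f0": 3, "f1": "a"}) == (3, "a")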
+# For production use, explicit schemas are recommended. +# """ +# if not data: +# return {} + +# schema = {} + +# # Get all possible field names +# # use list to preserve order of appearance as much as possible +# all_fields = [] +# for record in data: +# all_fields.extend(record.keys()) + +# all_fields = list( +# dict.fromkeys(all_fields) +# ) # Remove duplicates while preserving order + +# # Infer type for each field +# for field_name in all_fields: +# field_values = [ +# record.get(field_name) +# for record in data +# if field_name in record and record[field_name] is not None +# ] + +# if not field_values: +# schema[field_name] = Any # No non-null values found +# continue + +# # Get types of all values +# value_types = {type(v) for v in field_values} + +# if len(value_types) == 1: +# # All values have same type +# value_type = next(iter(value_types)) + +# # For containers, try to infer element types +# if value_type is list and field_values: +# # Infer list element type from first non-empty list +# for lst in field_values: +# if lst: # non-empty list +# element_types = {type(elem) for elem in lst} +# if len(element_types) == 1: +# element_type = next(iter(element_types)) +# schema[field_name] = list[element_type] +# else: +# schema[field_name] = list[Any] # Mixed types +# break +# else: +# schema[field_name] = list[Any] # All lists empty + +# elif value_type in {set, frozenset} and field_values: +# # Infer set element type from first non-empty set +# for s in field_values: +# if s: # non-empty set +# element_types = {type(elem) for elem in s} +# if len(element_types) == 1: +# element_type = next(iter(element_types)) +# schema[field_name] = set[element_type] +# else: +# schema[field_name] = set[Any] # Mixed types +# break +# else: +# schema[field_name] = set[Any] # All sets empty + +# elif value_type is dict and field_values: +# # Infer dict types from first non-empty dict +# for d in field_values: +# if d: # non-empty dict +# key_types = {type(k) for k in d.keys()} +# value_types = {type(v) for v in d.values()} + +# if len(key_types) == 1 and len(value_types) == 1: +# key_type = next(iter(key_types)) +# val_type = next(iter(value_types)) +# schema[field_name] = dict[key_type, val_type] +# else: +# schema[field_name] = dict[Any, Any] # Mixed types +# break +# else: +# schema[field_name] = dict[Any, Any] # All dicts empty + +# else: +# schema[field_name] = value_type + +# else: +# # Mixed types - use Union or Any +# schema[field_name] = Any + +# return schema # Public API functions def python_type_to_arrow_type( - python_type: type, semantic_registry: SemanticTypeRegistry | None = None + python_type: type, data_context: DataContext | str | None = None ) -> pa.DataType: """Convert Python type to Arrow type using the global converter.""" - converter = get_global_converter(semantic_registry) + data_context = resolve_context(data_context) + converter = data_context.type_converter return converter.python_type_to_arrow_type(python_type) def arrow_type_to_python_type( - arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None = None + arrow_type: pa.DataType, data_context: DataContext | str | None = None ) -> type: """Convert Arrow type to Python type using the global converter.""" - converter = get_global_converter(semantic_registry) + data_context = resolve_context(data_context) + converter = data_context.type_converter return converter.arrow_type_to_python_type(arrow_type) def get_conversion_functions( - python_type: type, semantic_registry: SemanticTypeRegistry | None = None 
+ python_type: type, data_context: DataContext | str | None = None ) -> tuple[Callable, Callable]: """Get both conversion functions for a Python type.""" - converter = get_global_converter(semantic_registry) + data_context = resolve_context(data_context) + converter = data_context.type_converter arrow_type = converter.python_type_to_arrow_type(python_type) python_to_arrow = converter.get_python_to_arrow_converter(python_type) arrow_to_python = converter.get_arrow_to_python_converter(arrow_type) return python_to_arrow, arrow_to_python - - -# Example usage and demonstration -if __name__ == "__main__": - print("=== Universal Type Conversion Engine ===\n") - - from pathlib import Path - import uuid - from sample_converters import create_standard_semantic_registry - - # Create converter with semantic registry - registry = create_standard_semantic_registry() - converter = UniversalTypeConverter(registry) - - print("Testing comprehensive type conversion:") - print("=" * 50) - - # Test various type conversions - test_types = [ - int, - str, - list[int], - dict[str, float], - tuple[int, str, bool], - Path, # Semantic type - uuid.UUID, # Semantic type - ] - - print("\nType Conversions:") - for python_type in test_types: - arrow_type = converter.python_type_to_arrow_type(python_type) - recovered_type = converter.arrow_type_to_python_type(arrow_type) - - print(f" {python_type} → {arrow_type} → {recovered_type}") - print(f" Round-trip successful: {recovered_type == python_type}") - - print(f"\n" + "=" * 50) - print("Testing conversion function caching:") - - # Test conversion functions - test_data = { - "id": 123, - "name": "Alice", - "tags": ["python", "arrow"], - "metadata": {"active": True, "score": 95.5}, - "file_path": Path("/home/alice/data.csv"), - "user_id": uuid.uuid4(), - } - - schema = { - "id": int, - "name": str, - "tags": list[str], - "metadata": dict[str, Any], - "file_path": Path, - "user_id": uuid.UUID, - } - - # Get conversion functions (these get cached) - converters = {} - for field_name, python_type in schema.items(): - python_to_arrow = converter.get_python_to_arrow_converter(python_type) - arrow_type = converter.python_type_to_arrow_type(python_type) - arrow_to_python = converter.get_arrow_to_python_converter(arrow_type) - converters[field_name] = (python_to_arrow, arrow_to_python) - - print("Conversion functions created and cached for all fields") - - # Test round-trip conversion using cached functions - converted_data = {} - for field_name, value in test_data.items(): - python_to_arrow, arrow_to_python = converters[field_name] - - # Convert to Arrow format - arrow_value = python_to_arrow(value) - # Convert back to Python - recovered_value = arrow_to_python(arrow_value) - - converted_data[field_name] = recovered_value - - print( - f" {field_name}: {type(value).__name__} → Arrow → {type(recovered_value).__name__}" - ) - - print(f"\n" + "=" * 50) - print("Cache Statistics:") - stats = converter.get_cache_stats() - for stat_name, count in stats.items(): - print(f" {stat_name}: {count}") - - print(f"\n" + "=" * 50) - print("✅ Universal Type Conversion Engine Benefits:") - print("✅ Single self-contained system for all conversions") - print("✅ Holds semantic registry internally") - print("✅ Caches all conversion functions for performance") - print("✅ Handles both Python→Arrow and Arrow→Python") - print("✅ Creates TypedDicts preserving struct field info") - print("✅ Dramatic reduction in function creation overhead") - print("✅ Central caching reduces memory usage") diff --git 
a/src/orcapod/semantic_types/complete_converter_test.py b/src/orcapod/semantic_types/unused/complete_converter_test.py similarity index 90% rename from src/orcapod/semantic_types/complete_converter_test.py rename to src/orcapod/semantic_types/unused/complete_converter_test.py index b1939cd..52ff4d1 100644 --- a/src/orcapod/semantic_types/complete_converter_test.py +++ b/src/orcapod/semantic_types/unused/complete_converter_test.py @@ -13,6 +13,7 @@ - Error handling and edge cases """ +from unittest.mock import Mock import pyarrow as pa import typing from typing import Any, Optional, Union @@ -25,7 +26,8 @@ # Import the converter functions # (In real usage, these would be imported from your module) -from orcapod.semantic_types.complete_converter import ( +from orcapod.semantic_types import SemanticTypeRegistry +from orcapod.semantic_types.semantic_converters import ( python_type_to_arrow, arrow_type_to_python, python_dicts_to_arrow_table, @@ -97,6 +99,14 @@ def get_converter_for_struct_type(self, struct_type): class MockPathConverter: """Mock converter for pathlib.Path objects.""" + @property + def semantic_type_name(self) -> str: + return "path" + + @property + def python_type(self): + return Path + @property def arrow_struct_type(self): return pa.struct([("semantic_type", pa.string()), ("path", pa.large_string())]) @@ -104,17 +114,34 @@ def arrow_struct_type(self): def python_to_struct_dict(self, value): if not isinstance(value, Path): raise TypeError(f"Expected Path, got {type(value)}") - return {"semantic_type": "path", "path": str(value)} + return {"semantic_type": self.semantic_type_name, "path": str(value)} def struct_dict_to_python(self, struct_dict): - if struct_dict.get("semantic_type") != "path": + if struct_dict.get("semantic_type") != self.semantic_type_name: raise ValueError("Not a path semantic type") return Path(struct_dict["path"]) + def can_handle_python_type(self, python_type: type) -> bool: + return python_type is Path + + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + return pa.types.is_struct(struct_type) and set(f.name for f in struct_type) == { + "semantic_type", + "path", + } + class MockUUIDConverter: """Mock converter for UUID objects.""" + @property + def semantic_type_name(self) -> str: + return "uuid" + + @property + def python_type(self) -> type: + return uuid.UUID + @property def arrow_struct_type(self): return pa.struct([("semantic_type", pa.string()), ("uuid_str", pa.string())]) @@ -122,18 +149,27 @@ def arrow_struct_type(self): def python_to_struct_dict(self, value): if not isinstance(value, uuid.UUID): raise TypeError(f"Expected UUID, got {type(value)}") - return {"semantic_type": "uuid", "uuid_str": str(value)} + return {"semantic_type": self.semantic_type_name, "uuid_str": str(value)} def struct_dict_to_python(self, struct_dict): - if struct_dict.get("semantic_type") != "uuid": + if struct_dict.get("semantic_type") != self.semantic_type_name: raise ValueError("Not a uuid semantic type") return uuid.UUID(struct_dict["uuid_str"]) + def can_handle_python_type(self, python_type: type) -> bool: + return python_type is uuid.UUID + + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + return pa.types.is_struct(struct_type) and set(f.name for f in struct_type) == { + "semantic_type", + "uuid_str", + } + class CustomData: """Custom data class for testing complex semantic types.""" - def __init__(self, data: dict, metadata: dict = None): + def __init__(self, data: dict, metadata: dict | None = None): self.data = data 
self.metadata = metadata or {} @@ -149,6 +185,14 @@ def __repr__(self): class MockCustomDataConverter: """Mock converter for CustomData objects.""" + @property + def semantic_type_name(self) -> str: + return "custom_data" + + @property + def python_type(self) -> type: + return CustomData + @property def arrow_struct_type(self): return pa.struct( @@ -163,19 +207,29 @@ def python_to_struct_dict(self, value): if not isinstance(value, CustomData): raise TypeError(f"Expected CustomData, got {type(value)}") return { - "semantic_type": "custom_data", + "semantic_type": self.semantic_type_name, "data": json.dumps(value.data), "metadata": json.dumps(value.metadata), } def struct_dict_to_python(self, struct_dict): - if struct_dict.get("semantic_type") != "custom_data": + if struct_dict.get("semantic_type") != self.semantic_type_name: raise ValueError("Not a custom_data semantic type") data = json.loads(struct_dict["data"]) metadata = json.loads(struct_dict["metadata"]) return CustomData(data, metadata) + def can_handle_python_type(self, python_type: type) -> bool: + return python_type is CustomData + + def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + return pa.types.is_struct(struct_type) and set(f.name for f in struct_type) == { + "semantic_type", + "data", + "metadata", + } + def run_comprehensive_tests(): """Run comprehensive test suite for the type converter.""" @@ -184,7 +238,11 @@ def run_comprehensive_tests(): print("=" * 80) # Initialize mock semantic registry - semantic_registry = MockSemanticRegistry() + # semantic_registry = MockSemanticRegistry() + semantic_registry = SemanticTypeRegistry() + semantic_registry.register_converter(MockPathConverter()) + semantic_registry.register_converter(MockUUIDConverter()) + semantic_registry.register_converter(MockCustomDataConverter()) # Test counters total_tests = 0 diff --git a/src/orcapod/semantic_types/python_arrow_types.py b/src/orcapod/semantic_types/unused/python_arrow_types.py similarity index 83% rename from src/orcapod/semantic_types/python_arrow_types.py rename to src/orcapod/semantic_types/unused/python_arrow_types.py index 71ed7e7..e85e98f 100644 --- a/src/orcapod/semantic_types/python_arrow_types.py +++ b/src/orcapod/semantic_types/unused/python_arrow_types.py @@ -1,6 +1,7 @@ import pyarrow as pa +from typing import get_origin, get_args, Any import typing -from typing import get_origin, get_args +from collections.abc import Collection, Sequence, Mapping, Iterable, Set import sys # Basic type mapping for Python -> Arrow conversion @@ -164,7 +165,35 @@ def python_type_to_arrow(type_hint, semantic_registry=None) -> pa.DataType: key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) return pa.large_list(key_value_struct) - # Handle Union types (including Optional) + # Handle abstract base classes and collections + elif origin in {Collection, Sequence, Iterable}: + # Treat as list - most common concrete implementation + if len(args) != 1: + raise ValueError( + f"{origin.__name__} type must have exactly one type argument, got: {args}" + ) + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.large_list(element_type) + + elif origin is Set or origin is set: + # Sets -> lists (Arrow doesn't have native set type) + if len(args) != 1: + raise ValueError( + f"set type must have exactly one type argument, got: {args}" + ) + element_type = python_type_to_arrow(args[0], semantic_registry) + return pa.large_list(element_type) + + elif origin is Mapping: + # Mapping -> dict representation 
+ if len(args) != 2: + raise ValueError( + f"Mapping type must have exactly two type arguments, got: {args}" + ) + key_type = python_type_to_arrow(args[0], semantic_registry) + value_type = python_type_to_arrow(args[1], semantic_registry) + key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) + return pa.large_list(key_value_struct) elif origin is typing.Union: # Handle Optional[T] which is Union[T, NoneType] if len(args) == 2 and type(None) in args: @@ -184,7 +213,8 @@ def python_type_to_arrow(type_hint, semantic_registry=None) -> pa.DataType: raise ValueError(f"Unsupported generic type: {origin}") -def arrow_type_to_python(arrow_type: pa.DataType) -> type: +# TODO: change back the return type to `type` +def arrow_type_to_python(arrow_type: pa.DataType) -> Any: """ Convert PyArrow data types back to Python type hints. @@ -247,17 +277,12 @@ def arrow_type_to_python(arrow_type: pa.DataType) -> type: element_python_type = arrow_type_to_python(element_type) # Check if this is a fixed-size list (homogeneous tuple representation) - if ( - hasattr(arrow_type, "list_size") and arrow_type.list_size > 0 - ) or pa.types.is_fixed_size_list(arrow_type): - # Fixed-size list represents homogeneous tuple - if pa.types.is_fixed_size_list(arrow_type): - size = arrow_type.list_size - else: - size = arrow_type.list_size + if pa.types.is_fixed_size_list(arrow_type): + # Fixed-size list -> homogeneous tuple + size = arrow_type.list_size return tuple[tuple(element_python_type for _ in range(size))] else: - # Variable-size list + # Variable-size list -> list return list[element_python_type] elif pa.types.is_struct(arrow_type): @@ -342,7 +367,109 @@ def parse_type_string(type_string: str): raise ValueError(f"Could not parse type string '{type_string}': {e}") -# Helper functions for dict conversion +def infer_schema_from_data(data: list[dict]) -> dict[str, type]: + """ + Infer schema from sample data (best effort). + + Args: + data: List of sample dictionaries + + Returns: + Dictionary mapping field names to inferred Python types + + Note: This is best-effort inference and may not handle all edge cases. + For production use, explicit schemas are recommended. 
+ """ + if not data: + return {} + + schema = {} + + # Get all possible field names + all_fields = set() + for record in data: + all_fields.update(record.keys()) + + # Infer type for each field + for field_name in all_fields: + field_values = [ + record.get(field_name) + for record in data + if field_name in record and record[field_name] is not None + ] + + if not field_values: + schema[field_name] = Any # No non-null values found + continue + + # Get types of all values + value_types = {type(v) for v in field_values} + + if len(value_types) == 1: + # All values have same type + value_type = next(iter(value_types)) + + # For containers, try to infer element types + if value_type is list and field_values: + # Infer list element type from first non-empty list + for lst in field_values: + if lst: # non-empty list + element_types = {type(elem) for elem in lst} + if len(element_types) == 1: + element_type = next(iter(element_types)) + schema[field_name] = list[element_type] + else: + schema[field_name] = list[Any] # Mixed types + break + else: + schema[field_name] = list[Any] # All lists empty + + elif value_type in {set, frozenset} and field_values: + # Infer set element type from first non-empty set + for s in field_values: + if s: # non-empty set + element_types = {type(elem) for elem in s} + if len(element_types) == 1: + element_type = next(iter(element_types)) + schema[field_name] = set[element_type] + else: + schema[field_name] = set[Any] # Mixed types + break + else: + schema[field_name] = set[Any] # All sets empty + + elif value_type is dict and field_values: + # Infer dict types from first non-empty dict + for d in field_values: + if d: # non-empty dict + key_types = {type(k) for k in d.keys()} + value_types = {type(v) for v in d.values()} + + if len(key_types) == 1 and len(value_types) == 1: + key_type = next(iter(key_types)) + val_type = next(iter(value_types)) + schema[field_name] = dict[key_type, val_type] + else: + schema[field_name] = dict[Any, Any] # Mixed types + break + else: + schema[field_name] = dict[Any, Any] # All dicts empty + + else: + schema[field_name] = value_type + + else: + # Mixed types - use Union or Any + schema[field_name] = Any + + return schema + + +def arrow_list_to_set(lst: list) -> set: + """Convert Arrow list back to Python set (removes duplicates).""" + return set(lst) if lst is not None else set() + + def dict_to_arrow_list(d: dict) -> list[dict]: """Convert Python dict to Arrow-compatible list of key-value structs.""" return [{"key": k, "value": v} for k, v in d.items()] @@ -353,7 +480,9 @@ def arrow_list_to_dict(lst: list[dict]) -> dict: return {item["key"]: item["value"] for item in lst if item is not None} -def python_dicts_to_arrow_table(data: list[dict], schema: dict[str, type]) -> pa.Table: +def python_dicts_to_arrow_table( + data: list[dict], schema: dict[str, type] | None = None +) -> pa.Table: """ Convert list of Python dictionaries to PyArrow table with proper type conversion. 
@@ -446,13 +575,52 @@ def _convert_python_value_for_arrow(value, python_type): non_none_type = args[0] if args[1] is type(None) else args[1] return _convert_python_value_for_arrow(value, non_none_type) - # Handle list types - elif origin is list: + # Handle abstract collections + elif origin is list or origin in {Collection, Sequence, Iterable}: if not isinstance(value, (list, tuple)): raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") - element_type = args[0] + element_type = args[0] if args else Any return [_convert_python_value_for_arrow(item, element_type) for item in value] + # Handle set types + elif origin is set or origin is Set: + if not isinstance(value, (set, frozenset, list, tuple)): + raise TypeError( + f"Expected set/list/tuple for {python_type}, got {type(value)}" + ) + element_type = args[0] if args else Any + + # Convert set to sorted list for deterministic ordering + if isinstance(value, (set, frozenset)): + try: + # Sort if elements are comparable + value_list = sorted(value) + except TypeError: + # If elements aren't comparable (e.g., mixed types), convert to list as-is + # This maintains some order but isn't guaranteed to be deterministic + value_list = list(value) + else: + # Already a list/tuple, keep as-is + value_list = list(value) + + return [ + _convert_python_value_for_arrow(item, element_type) for item in value_list + ] + + # Handle mapping types + elif origin is dict or origin is Mapping: + if not isinstance(value, dict): + raise TypeError(f"Expected dict for {python_type}, got {type(value)}") + + key_type, value_type = (args[0], args[1]) if len(args) >= 2 else (Any, Any) + # Convert dict to list of key-value structs + key_value_list = [] + for k, v in value.items(): + converted_key = _convert_python_value_for_arrow(k, key_type) + converted_value = _convert_python_value_for_arrow(v, value_type) + key_value_list.append({"key": converted_key, "value": converted_value}) + return key_value_list + # Handle tuple types elif origin is tuple: if not isinstance(value, (list, tuple)): diff --git a/src/orcapod/semantic_types/unused/schemas.py b/src/orcapod/semantic_types/unused/schemas.py new file mode 100644 index 0000000..a028608 --- /dev/null +++ b/src/orcapod/semantic_types/unused/schemas.py @@ -0,0 +1,357 @@ +# from typing import Self +# from orcapod.types.core import DataType, TypeSpec +# from orcapod.types.semantic_types import ( +# SemanticType, +# SemanticTypeRegistry, +# PythonArrowConverter, +# ) +# import pyarrow as pa +# import datetime + +# # This mapping is expected to be stable +# # Be sure to test this assumption holds true +# DEFAULT_ARROW_TYPE_LUT = { +# int: pa.int64(), +# float: pa.float64(), +# str: pa.large_string(), +# bool: pa.bool_(), +# } + + +# def python_to_arrow_type(python_type: type) -> pa.DataType: +# if python_type in DEFAULT_ARROW_TYPE_LUT: +# return DEFAULT_ARROW_TYPE_LUT[python_type] +# raise TypeError(f"Converstion of python type {python_type} is not supported yet") + + +# def arrow_to_python_type(arrow_type: pa.DataType) -> type: +# if pa.types.is_integer(arrow_type): +# return int +# elif pa.types.is_floating(arrow_type): +# return float +# elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): +# return str +# elif pa.types.is_boolean(arrow_type): +# return bool +# elif pa.types.is_date(arrow_type): +# return datetime.date +# elif pa.types.is_timestamp(arrow_type): +# return datetime.datetime +# elif pa.types.is_binary(arrow_type): +# return bytes +# else: +# raise 
TypeError(f"Conversion of arrow type {arrow_type} is not supported") + + +# class PythonSchema(dict[str, DataType]): +# """ +# A schema for Python data types, mapping string keys to Python types. + +# This is used to define the expected structure of data packets in OrcaPod. + +# Attributes +# ---------- +# keys : str +# The keys of the schema. +# values : type +# The types corresponding to each key. + +# Examples +# -------- +# >>> schema = PythonSchema(name=str, age=int) +# >>> print(schema) +# {'name': , 'age': } +# """ + +# def copy(self) -> "PythonSchema": +# return PythonSchema(self) + +# def to_semantic_schema( +# self, semantic_type_registry: SemanticTypeRegistry +# ) -> "SemanticSchema": +# """ +# Convert the Python schema to a semantic schema using the provided semantic type registry. + +# Parameters +# ---------- +# semantic_type_registry : SemanticTypeRegistry +# The registry containing semantic type information. + +# Returns +# ------- +# SemanticSchema +# A new schema mapping keys to tuples of Python types and optional semantic type identifiers. + +# Examples +# -------- +# >>> python_schema = PythonSchema(name=str, age=int) +# >>> semantic_schema = python_schema.to_semantic_schema(registry) +# >>> print(semantic_schema) +# {'name': (str, None), 'age': (int, None)} +# """ +# return SemanticSchema.from_typespec(self, semantic_type_registry) + +# def to_arrow_schema( +# self, +# semantic_type_registry: SemanticTypeRegistry | None = None, +# converters: dict[str, PythonArrowConverter] | None = None, +# ) -> pa.Schema: +# """ +# Convert the Python schema to an Arrow schema. +# If converters are provided, they are used to convert the schema. Note that +# no validation is performed on the converters, so they must be compatible with the schema. +# """ +# if converters is not None: +# # If converters are provided, use them to convert the schema +# fields = [] +# for field_name, python_type in self.items(): +# if field_name in converters: +# converter = converters[field_name] +# arrow_type = converter.arrow_type +# metadata = None +# if converter.semantic_type_name is not None: +# metadata = { +# b"semantic_type": converter.semantic_type_name.encode( +# "utf-8" +# ) +# } +# else: +# arrow_type = python_to_arrow_type(python_type) +# metadata = None +# fields.append(pa.field(field_name, arrow_type, metadata=metadata)) +# return pa.schema(fields) + +# if semantic_type_registry is None: +# raise ValueError( +# "semantic_type_registry must be provided if converters are not" +# ) +# # Otherwise, convert using the semantic type registry +# return self.to_semantic_schema(semantic_type_registry).to_arrow_schema() + +# @classmethod +# def from_semantic_schema(cls, semantic_schema: "SemanticSchema") -> Self: +# """ +# Create a PythonSchema from a SemanticSchema. + +# Parameters +# ---------- +# semantic_schema : SemanticSchema +# The semantic schema to convert. + +# Returns +# ------- +# PythonSchema +# A new schema mapping keys to Python types. +# """ +# return cls(semantic_schema.get_python_types()) + +# @classmethod +# def from_arrow_schema( +# cls, +# arrow_schema: pa.Schema, +# semantic_type_registry: SemanticTypeRegistry | None = None, +# converters: dict[str, PythonArrowConverter] | None = None, +# ) -> Self: +# """ +# Create a PythonSchema from an Arrow schema. + +# Parameters +# ---------- +# arrow_schema : pa.Schema +# The Arrow schema to convert. +# semantic_type_registry : SemanticTypeRegistry +# The registry containing semantic type information. 
+# skip_system_columns : bool, optional +# Whether to skip system columns (default is True). +# converters : dict[str, PythonArrowConverter], optional +# A dictionary of converters to use for converting the schema. If provided, the schema will be +# converted using the converters. If not provided, the schema will be converted using the semantic type +# registry. + +# Returns +# ------- +# PythonSchema +# A new schema mapping keys to Python types. +# """ +# if converters is not None: +# # If converters are provided, use them to convert the schema +# python_types = {} +# for field in arrow_schema: +# # TODO: consider performing validation of semantic type +# if field.name in converters: +# converter = converters[field.name] +# python_types[field.name] = converter.python_type +# else: +# python_types[field.name] = arrow_to_python_type(field.type) +# return cls(python_types) + +# if semantic_type_registry is None: +# raise ValueError( +# "semantic_type_registry must be provided if converters are not" +# ) +# semantic_schema = SemanticSchema.from_arrow_schema( +# arrow_schema, +# semantic_type_registry, +# ) +# return cls(semantic_schema.get_python_types()) + + +# class SemanticSchema(dict[str, type | SemanticType]): +# """ +# A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. + +# This is used to define the expected structure of data packets with semantic types in OrcaPod. + +# Attributes +# ---------- +# keys : str +# The keys of the schema. +# values : type | SemanticType +# Either type for simple fields or SemanticType for semantic fields. + +# Examples +# -------- +# >>> schema = SemanticSchema(image=SemanticType('path'), age=int) +# >>> print(schema) +# {"image": SemanticType(name='path'), "age": })} +# """ + +# def get_semantic_fields(self) -> dict[str, SemanticType]: +# """ +# Get a dictionary of semantic fields in the schema. + +# Returns +# ------- +# dict[str, SemanticType] +# A dictionary mapping keys to their corresponding SemanticType. +# """ +# return {k: v for k, v in self.items() if isinstance(v, SemanticType)} + +# def get_python_types(self) -> dict[str, type]: +# """ +# Get the Python types for all keys in the schema. + +# Returns +# ------- +# dict[str, type] +# A dictionary mapping keys to their corresponding Python types. +# """ +# return { +# k: v.get_default_python_type() if isinstance(v, SemanticType) else v +# for k, v in self.items() +# } + +# def get_arrow_types(self) -> dict[str, tuple[pa.DataType, str | None]]: +# """ +# Get the Arrow types for all keys in the schema. + +# Returns +# ------- +# dict[str, tuple[pa.DataType, str|None]] +# A dictionary mapping keys to tuples of Arrow types. If the field has a semantic type, +# the second element of the tuple is the semantic type name; otherwise, it is None. +# """ +# return { +# k: (v.get_default_arrow_type(), v.name) +# if isinstance(v, SemanticType) +# else (python_to_arrow_type(v), None) +# for k, v in self.items() +# } + +# def to_arrow_schema(self) -> pa.Schema: +# """ +# Get the Arrow schema, which is a PythonSchema representation of the semantic schema. + +# Returns +# ------- +# PythonSchema +# A new schema mapping keys to Python types. 
+# """ +# fields = [] +# for k, (arrow_type, semantic_type_name) in self.get_arrow_types().items(): +# if semantic_type_name is not None: +# field = pa.field( +# k, +# arrow_type, +# metadata={b"semantic_type": semantic_type_name.encode("utf-8")}, +# ) +# else: +# field = pa.field(k, arrow_type) +# fields.append(field) + +# return pa.schema(fields) + +# def to_python_schema(self) -> PythonSchema: +# """ +# Get the Python schema, which is a PythonSchema representation of the semantic schema. + +# Returns +# ------- +# PythonSchema +# A new schema mapping keys to Python types. +# """ +# return PythonSchema.from_semantic_schema(self) + +# @classmethod +# def from_arrow_schema( +# cls, +# arrow_schema: pa.Schema, +# semantic_type_registry: SemanticTypeRegistry, +# ) -> Self: +# """ +# Create a SemanticSchema from an Arrow schema. + +# Parameters +# ---------- +# arrow_schema : pa.Schema +# The Arrow schema to convert. + +# Returns +# ------- +# SemanticSchema +# A new schema mapping keys to tuples of Python types and optional semantic type identifiers. +# """ + +# semantic_schema = {} +# for field in arrow_schema: +# field_type = None +# if field.metadata is not None: +# semantic_type_name = field.metadata.get(b"semantic_type", b"").decode() +# if semantic_type_name: +# semantic_type = semantic_type_registry.get_semantic_type( +# semantic_type_name +# ) +# if semantic_type is None: +# raise ValueError( +# f"Semantic type '{semantic_type_name}' not found in registry" +# ) +# if not semantic_type.supports_arrow_type(field.type): +# raise ValueError( +# f"Semantic type '{semantic_type.name}' does not support Arrow field of type '{field.type}'" +# ) +# field_type = semantic_type + +# if ( +# field_type is None +# ): # was not set to semantic type, so fallback to simple conversion +# field_type = arrow_to_python_type(field.type) + +# semantic_schema[field.name] = field_type +# return cls(semantic_schema) + +# @classmethod +# def from_typespec( +# cls, +# typespec: TypeSpec, +# semantic_type_registry: SemanticTypeRegistry, +# ) -> Self: +# semantic_schema = {} +# for key, python_type in typespec.items(): +# semantic_type = semantic_type_registry.get_semantic_type_for_python_type( +# python_type +# ) +# if semantic_type is not None: +# semantic_schema[key] = semantic_type +# else: +# semantic_schema[key] = python_type +# return cls(semantic_schema) diff --git a/src/orcapod/semantic_types/complete_converter.py b/src/orcapod/semantic_types/unused/semantic_converters.py similarity index 67% rename from src/orcapod/semantic_types/complete_converter.py rename to src/orcapod/semantic_types/unused/semantic_converters.py index f4177d9..6abb564 100644 --- a/src/orcapod/semantic_types/complete_converter.py +++ b/src/orcapod/semantic_types/unused/semantic_converters.py @@ -2,8 +2,9 @@ from typing import get_origin, get_args, Any import typing from collections.abc import Collection, Sequence, Mapping, Iterable, Set -import sys -from orcapod.semantic_types.struct_converters import SemanticTypeRegistry +from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry +from orcapod.types import TypeSpec + # Basic type mapping for Python -> Arrow conversion _PYTHON_TO_ARROW_MAP = { @@ -99,6 +100,7 @@ def python_type_to_arrow( tuple[int, int] -> pa.list_(pa.int64(), 2) tuple[int, str] -> pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) dict[str, int] -> pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) + Path -> pa.struct([('path', pa.large_string())]) # if registered in 
semantic registry """ # Handle basic types first @@ -214,12 +216,41 @@ def python_type_to_arrow( raise ValueError(f"Unsupported generic type: {origin}") -def arrow_type_to_python(arrow_type: pa.DataType) -> type: +def python_schema_to_arrow( + python_schema: TypeSpec, semantic_registry: SemanticTypeRegistry | None = None +) -> pa.Schema: + """ + Convert a Python schema (TypeSpec) to a PyArrow schema. + + Args: + python_schema: TypeSpec representing the Python schema + semantic_registry: Optional semantic type registry to check for semantic types + + Returns: + PyArrow Schema object + + Raises: + ValueError: If the Python schema cannot be converted to Arrow schema + """ + + arrow_fields = [] + for field_name, field_type in python_schema.items(): + arrow_type = python_type_to_arrow( + field_type, semantic_registry=semantic_registry + ) + arrow_fields.append(pa.field(field_name, arrow_type)) + return pa.schema(arrow_fields) + + +def arrow_type_to_python( + arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None = None +) -> type: """ Convert PyArrow data types back to Python type hints. Args: arrow_type: PyArrow data type to convert + semantic_registry: Optional semantic type registry for semantic types Returns: Python type annotation @@ -229,6 +260,7 @@ def arrow_type_to_python(arrow_type: pa.DataType) -> type: pa.large_list(pa.large_string()) -> list[str] pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) -> dict[str, int] pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) -> tuple[int, str] + pa.struct([('path', pa.large_string())]) -> Path # if registered in semantic registry Raises: TypeError: If the Arrow type cannot be converted to a Python type @@ -268,13 +300,17 @@ def arrow_type_to_python(arrow_type: pa.DataType) -> type: key_field = next(f for f in element_type if f.name == "key") value_field = next(f for f in element_type if f.name == "value") - key_python_type = arrow_type_to_python(key_field.type) - value_python_type = arrow_type_to_python(value_field.type) + key_python_type = arrow_type_to_python( + key_field.type, semantic_registry + ) + value_python_type = arrow_type_to_python( + value_field.type, semantic_registry + ) return dict[key_python_type, value_python_type] # Regular list - element_python_type = arrow_type_to_python(element_type) + element_python_type = arrow_type_to_python(element_type, semantic_registry) # Check if this is a fixed-size list (homogeneous tuple representation) if pa.types.is_fixed_size_list(arrow_type): @@ -286,6 +322,14 @@ def arrow_type_to_python(arrow_type: pa.DataType) -> type: return list[element_python_type] elif pa.types.is_struct(arrow_type): + # First check if this is a semantic type using struct signature recognition + if semantic_registry: + python_type = semantic_registry.get_python_type_for_struct_signature( + arrow_type + ) + if python_type: + return python_type + # Check if this is a heterogeneous tuple representation field_names = [field.name for field in arrow_type] @@ -293,20 +337,25 @@ def arrow_type_to_python(arrow_type: pa.DataType) -> type: if all(name.startswith("f") and name[1:].isdigit() for name in field_names): # Sort by field index to maintain order sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) - field_types = [arrow_type_to_python(field.type) for field in sorted_fields] + field_types = [ + arrow_type_to_python(field.type, semantic_registry) + for field in sorted_fields + ] return tuple[tuple(field_types)] else: - # TODO: Could support NamedTuple or 
dataclass conversion here + # Unknown struct type - cannot convert raise TypeError( f"Cannot convert struct type to Python type hint. " f"Struct has fields: {field_names}. " - f"Only tuple-like structs (f0, f1, ...) are supported." + f"Only tuple-like structs (f0, f1, ...) or registered semantic structs are supported." ) elif pa.types.is_map(arrow_type): # Handle pa.map_ types (though we prefer list representation) - key_python_type = arrow_type_to_python(arrow_type.key_type) - value_python_type = arrow_type_to_python(arrow_type.item_type) + key_python_type = arrow_type_to_python(arrow_type.key_type, semantic_registry) + value_python_type = arrow_type_to_python( + arrow_type.item_type, semantic_registry + ) return dict[key_python_type, value_python_type] elif pa.types.is_union(arrow_type): @@ -317,14 +366,16 @@ def arrow_type_to_python(arrow_type: pa.DataType) -> type: child_types = [] for i in range(arrow_type.num_fields): child_field = arrow_type[i] - child_types.append(arrow_type_to_python(child_field.type)) + child_types.append( + arrow_type_to_python(child_field.type, semantic_registry) + ) if len(child_types) == 2 and type(None) in child_types: # This is Optional[T] non_none_type = next(t for t in child_types if t is not type(None)) - return typing.Optional[non_none_type] + return typing.Optional[non_none_type] # type: ignore else: - return typing.Union[tuple(child_types)] + return typing.Union[tuple(child_types)] # type: ignore else: raise TypeError( @@ -334,7 +385,31 @@ def arrow_type_to_python(arrow_type: pa.DataType) -> type: ) -def parse_type_string(type_string: str): +def arrow_schema_to_python( + arrow_schema: pa.Schema, semantic_registry: SemanticTypeRegistry | None = None +) -> TypeSpec: + """ + Convert a PyArrow schema to a Python schema (TypeSpec). + + Args: + arrow_schema: PyArrow Schema object + semantic_registry: Optional semantic type registry for semantic types + + Returns: + TypeSpec representing the Python schema + + Raises: + TypeError: If the Arrow schema cannot be converted to Python schema + """ + return { + field.name: arrow_type_to_python(field.type, semantic_registry) + for field in arrow_schema + } + + +def parse_type_string( + type_string: str, semantic_registry: SemanticTypeRegistry | None = None +): """ Parse a type hint from a string representation. Useful when you have type hints as strings. 
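# Illustrative sketch (hypothetical field names; no semantic registry passed,
# so only plain built-in and container types are exercised): the two schema
# helpers added above are expected to round-trip a TypeSpec.
spec = {"name": str, "counts": list[int], "lookup": dict[str, float]}
arrow_schema = python_schema_to_arrow(spec)
# name -> large_string, counts -> large_list<int64>,
# lookup -> large_list<struct<key: large_string, value: float64>>
assert arrow_schema_to_python(arrow_schema) == spec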
@@ -362,7 +437,7 @@ def parse_type_string(type_string: str): "Union": typing.Union, } type_hint = eval(type_string, {"__builtins__": {}}, namespace) - return python_type_to_arrow(type_hint) + return python_type_to_arrow(type_hint, semantic_registry) except Exception as e: raise ValueError(f"Could not parse type string '{type_string}': {e}") @@ -567,7 +642,7 @@ def python_dicts_to_arrow_table( try: table = pa.table(converted_data, schema=arrow_schema) return table - except Exception as e: + except Exception: # Fallback: create each column separately arrays = [] for field in arrow_schema: @@ -599,9 +674,7 @@ def _convert_python_value_for_arrow( Value in Arrow-compatible format """ # First, check if this is a semantic type - if semantic_registry and hasattr( - semantic_registry, "get_converter_for_python_type" - ): + if semantic_registry: converter = semantic_registry.get_converter_for_python_type(python_type) if converter: # Convert using semantic type converter @@ -783,12 +856,14 @@ def _convert_arrow_value_to_python( Returns: Value converted to proper Python type """ - # First, check if this is a semantic struct type + # First, check if this is a semantic struct type using signature recognition if semantic_registry and pa.types.is_struct(arrow_type): - converter = semantic_registry.get_converter_for_struct_type(arrow_type) - if converter and isinstance(value, dict): - # Convert using semantic type converter - return converter.struct_dict_to_python(value) + python_type = semantic_registry.get_python_type_for_struct_signature(arrow_type) + if python_type: + converter = semantic_registry.get_converter_for_python_type(python_type) + if converter and isinstance(value, dict): + # Convert using semantic type converter + return converter.struct_dict_to_python(value) # Fall back to standard type conversion # Handle basic types - no conversion needed @@ -906,412 +981,25 @@ def _convert_arrow_value_to_python( if __name__ == "__main__": - print("=== Complete Python Type Hint ↔ PyArrow Type Converter ===\n") + print("=== Semantic Type System with Struct Signature Recognition ===\n") - # Test basic functionality first - print("Testing basic round-trip:") - try: - # Simple test - python_type = dict[str, int] - arrow_type = python_type_to_arrow(python_type) - recovered_type = arrow_type_to_python(arrow_type) - print(f"✓ {python_type} -> {arrow_type} -> {recovered_type}") - print(f" Match: {recovered_type == python_type}") - except Exception as e: - print(f"✗ Basic test failed: {e}") - - print("\n" + "=" * 60) - print("Testing complex nested structures:") - - complex_nested_tests = [ - # Nested dictionaries - ( - dict[str, dict[str, int]], - pa.large_list( - pa.struct( - [ - ("key", pa.large_string()), - ( - "value", - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.int64())] - ) - ), - ), - ] - ) - ), - ), - # Mixed complex types in tuples - ( - tuple[dict[str, int], list[str]], - pa.struct( - [ - ( - "f0", - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.int64())] - ) - ), - ), - ("f1", pa.large_list(pa.large_string())), - ] - ), - ), - # Complex value types in dicts - ( - dict[str, list[int]], - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.large_list(pa.int64()))] - ) - ), - ), - # Triple nesting - ( - list[dict[str, list[int]]], - pa.large_list( - pa.large_list( - pa.struct( - [ - ("key", pa.large_string()), - ("value", pa.large_list(pa.int64())), - ] - ) - ) - ), - ), - # Complex tuple with nested structures - ( - 
tuple[list[int], dict[str, float], str], - pa.struct( - [ - ("f0", pa.large_list(pa.int64())), - ( - "f1", - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.float64())] - ) - ), - ), - ("f2", pa.large_string()), - ] - ), - ), - ] - - for python_type, expected_arrow_type in complex_nested_tests: - try: - result = python_type_to_arrow(python_type) - success = result == expected_arrow_type - status = "✓" if success else "✗" - print(f"{status} {python_type}") - print(f" -> {result}") - if not success: - print(f" Expected: {expected_arrow_type}") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing complex nested round-trips:") - - complex_round_trip_tests = [ - dict[str, dict[str, int]], - tuple[dict[str, int], list[str]], - dict[str, list[int]], - list[dict[str, list[int]]], - tuple[list[int], dict[str, float], str], - dict[str, tuple[int, str]], - list[tuple[dict[str, int], list[str]]], - ] - - for python_type in complex_round_trip_tests: - try: - # Python -> Arrow -> Python - arrow_type = python_type_to_arrow(python_type) - recovered_python_type = arrow_type_to_python(arrow_type) - success = recovered_python_type == python_type - status = "✓" if success else "✗" - print(f"{status} {python_type}") - print(f" -> {arrow_type}") - print(f" -> {recovered_python_type}") - if not success: - print(f" Round-trip failed!") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing Python -> Arrow conversion:") - - # Test cases for Python -> Arrow - python_to_arrow_tests = [ - # Basic types - (int, pa.int64()), - (str, pa.large_string()), - (float, pa.float64()), - (bool, pa.bool_()), - # Lists (both regular and large) - (list[int], pa.large_list(pa.int64())), - (list[str], pa.large_list(pa.large_string())), - (list[float], pa.large_list(pa.float64())), - # Homogeneous tuples (always use regular fixed-size lists) - (tuple[int, int], pa.list_(pa.int64(), 2)), - (tuple[str, str, str], pa.list_(pa.large_string(), 3)), - # Heterogeneous tuples - (tuple[int, str], pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), - ( - tuple[int, str, float], - pa.struct( - [("f0", pa.int64()), ("f1", pa.large_string()), ("f2", pa.float64())] - ), - ), - # Dict types - using large_list> for Polars compatibility - ( - dict[str, int], - pa.large_list( - pa.struct([("key", pa.large_string()), ("value", pa.int64())]) - ), - ), - ( - dict[int, str], - pa.large_list( - pa.struct([("key", pa.int64()), ("value", pa.large_string())]) - ), - ), - # Nested types - (list[list[int]], pa.large_list(pa.large_list(pa.int64()))), - ( - list[tuple[int, str]], - pa.large_list(pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), - ), - ] - - for python_type, expected_arrow_type in python_to_arrow_tests: - try: - result = python_type_to_arrow(python_type) - success = result == expected_arrow_type - status = "✓" if success else "✗" - print(f"{status} {python_type} -> {result}") - if not success: - print(f" Expected: {expected_arrow_type}") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") + # This system now uses struct signature recognition instead of special marker fields + print("Key improvements:") + print("- Clean, self-documenting struct schemas") + print("- Zero storage overhead (no marker fields)") + print("- Natural field names for user queries") + print("- Struct signature uniquely identifies semantic types") + print("- Registry maps Python types ↔ struct signatures") 
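    # Minimal runnable sketch of the signature lookup described above. The
    # converter below follows the converter protocol exercised by this
    # package's tests; treating registration and signature lookup exactly as
    # shown here is an assumption, not a definitive API reference.
    from pathlib import Path

    class SketchPathConverter:
        @property
        def semantic_type_name(self) -> str:
            return "path"

        @property
        def python_type(self) -> type:
            return Path

        @property
        def arrow_struct_type(self) -> pa.StructType:
            # Clean struct signature: no marker field, just the payload column
            return pa.struct([("path", pa.large_string())])

        def python_to_struct_dict(self, value: Path) -> dict:
            return {"path": str(value)}

        def struct_dict_to_python(self, struct_dict: dict) -> Path:
            return Path(struct_dict["path"])

        def can_handle_python_type(self, python_type: type) -> bool:
            return python_type is Path

        def can_handle_struct_type(self, struct_type: pa.StructType) -> bool:
            return pa.types.is_struct(struct_type) and [
                f.name for f in struct_type
            ] == ["path"]

    sketch_registry = SemanticTypeRegistry()
    sketch_registry.register_converter(SketchPathConverter())
    print(
        "Signature lookup for struct<path>:",
        sketch_registry.get_python_type_for_struct_signature(
            pa.struct([("path", pa.large_string())])
        ),
    )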
print("\n" + "=" * 60) - print("Testing Arrow -> Python type conversion:") - - arrow_to_python_tests = [ - # Basic types (both regular and large variants) - (pa.int64(), int), - (pa.string(), str), - (pa.large_string(), str), - (pa.float64(), float), - (pa.bool_(), bool), - (pa.binary(), bytes), - (pa.large_binary(), bytes), - # Lists (both regular and large) - (pa.list_(pa.int64(), -1), list[int]), - (pa.large_list(pa.int64()), list[int]), - (pa.list_(pa.string(), -1), list[str]), - (pa.large_list(pa.large_string()), list[str]), - # Fixed-size lists (homogeneous tuples) - (pa.list_(pa.int64(), 3), tuple[int, int, int]), - (pa.list_(pa.large_string(), 2), tuple[str, str]), - # Dict representation: both regular and large list variants - ( - pa.list_(pa.struct([("key", pa.string()), ("value", pa.int64())]), -1), - dict[str, int], - ), - ( - pa.large_list( - pa.struct([("key", pa.large_string()), ("value", pa.int64())]) - ), - dict[str, int], - ), - ( - pa.list_(pa.struct([("key", pa.int64()), ("value", pa.string())]), -1), - dict[int, str], - ), - ( - pa.large_list( - pa.struct([("key", pa.int64()), ("value", pa.large_string())]) - ), - dict[int, str], - ), - # Heterogeneous tuples: struct - (pa.struct([("f0", pa.int64()), ("f1", pa.string())]), tuple[int, str]), - (pa.struct([("f0", pa.int64()), ("f1", pa.large_string())]), tuple[int, str]), - ( - pa.struct([("f0", pa.int64()), ("f1", pa.string()), ("f2", pa.float64())]), - tuple[int, str, float], - ), - # Maps (if encountered) - (pa.map_(pa.string(), pa.int64()), dict[str, int]), - (pa.map_(pa.large_string(), pa.int64()), dict[str, int]), - # Nested structures - (pa.list_(pa.list_(pa.int64(), -1), -1), list[list[int]]), - (pa.large_list(pa.large_list(pa.int64())), list[list[int]]), - ] - - for arrow_type, expected_python_type in arrow_to_python_tests: - try: - result = arrow_type_to_python(arrow_type) - success = result == expected_python_type - status = "✓" if success else "✗" - print(f"{status} {arrow_type} -> {result}") - if not success: - print(f" Expected: {expected_python_type}") - except Exception as e: - print(f"✗ {arrow_type} -> ERROR: {e}") + print("Example struct signatures:") + print("Path: struct") + print("UUID: struct") + print("Email: struct") + print("GeoLocation: struct") print("\n" + "=" * 60) - print("Testing round-trip conversion:") - - round_trip_tests = [ - dict[str, int], - list[int], - tuple[int, str], - tuple[str, str, str], - list[dict[str, int]], - list[list[str]], - tuple[int, float, bool], - ] - - for python_type in round_trip_tests: - try: - # Python -> Arrow -> Python - arrow_type = python_type_to_arrow(python_type) - recovered_python_type = arrow_type_to_python(arrow_type) - success = recovered_python_type == python_type - status = "✓" if success else "✗" - print(f"{status} {python_type} -> {arrow_type} -> {recovered_python_type}") - if not success: - print(f" Round-trip failed!") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing string parsing:") - - string_tests = [ - "list[int]", - "tuple[int, str]", - "dict[str, int]", - "list[dict[str, float]]", - ] - - for type_str in string_tests: - try: - result = parse_type_string(type_str) - print(f"✓ '{type_str}' -> {result}") - except Exception as e: - print(f"✗ '{type_str}' -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing practical data conversion:") - - # Test actual data conversion - try: - # Create some test data - test_data = [ - {"name": "Alice", "scores": {"math": 95, "english": 87}}, 
- {"name": "Bob", "scores": {"math": 78, "english": 92}}, - ] - - # Create schema with nested dict using large_list representation - dict_type = python_type_to_arrow(dict[str, int]) - schema = pa.schema([("name", pa.large_string()), ("scores", dict_type)]) - - print(f"Dict type representation: {dict_type}") - - # Convert Python dicts to the expected list format - converted_data = [] - for record in test_data: - converted_record = record.copy() - if "scores" in converted_record: - # Convert dict to list of key-value structs - scores_dict = converted_record["scores"] - converted_record["scores"] = dict_to_arrow_list(scores_dict) - converted_data.append(converted_record) - - # Create Arrow table - need to handle the conversion properly - try: - table = pa.table(converted_data, schema=schema) - except Exception as table_error: - # If direct conversion fails, convert each column separately - print(f" Direct table creation failed: {table_error}") - print(" Trying column-by-column conversion...") - - # Convert each field separately - arrays = [] - for field in schema: - field_name = field.name - field_type = field.type - - # Extract column data - column_data = [record.get(field_name) for record in converted_data] - - # Create array with explicit type - array = pa.array(column_data, type=field_type) - arrays.append(array) - - # Create table from arrays - table = pa.table(arrays, schema=schema) - print(f"✓ Created PyArrow table with large_list representation") - - # Convert back to Python and reconstruct dicts - result_data = table.to_pylist() - for record in result_data: - if "scores" in record and record["scores"]: - # Convert list of key-value structs back to dict - record["scores"] = arrow_list_to_dict(record["scores"]) - - print(f"✓ Round-trip successful: {result_data[0]['scores']}") - - except Exception as e: - print(f"✗ Practical conversion test failed: {e}") - - print("Testing edge cases and limitations:") - - edge_case_tests = [ - # Complex key types - these are challenging but let's see what happens - "dict[tuple[str, int], str]", # tuple keys - "dict[str, dict[int, list[str]]]", # deeply nested - "Optional[dict[str, int]]", # optional complex types - ] - - for type_str in edge_case_tests: - try: - # Parse and convert - namespace = { - "list": list, - "tuple": tuple, - "dict": dict, - "int": int, - "str": str, - "float": float, - "bool": bool, - "bytes": bytes, - "Optional": typing.Optional, - "Union": typing.Union, - } - python_type = eval(type_str, {"__builtins__": {}}, namespace) - arrow_type = python_type_to_arrow(python_type) - recovered_type = arrow_type_to_python(arrow_type) - - success = recovered_type == python_type - status = "✓" if success else "⚠" - print(f"{status} {type_str}") - print(f" -> {arrow_type}") - print(f" -> {recovered_type}") - if not success: - print(f" Note: Complex key types may have limitations") - - except Exception as e: - print(f"✗ {type_str} -> ERROR: {e}") - - print(f"\n{'=' * 60}") - print("All tests completed!") + print("Clean user queries enabled:") + print("SELECT file_info.path FROM my_table") + print("SELECT location.latitude, location.longitude FROM my_table") + print("SELECT user_id.uuid FROM my_table") diff --git a/src/orcapod/semantic_types/unused/struct_types.py b/src/orcapod/semantic_types/unused/struct_types.py new file mode 100644 index 0000000..3d34588 --- /dev/null +++ b/src/orcapod/semantic_types/unused/struct_types.py @@ -0,0 +1,312 @@ +""" +Dynamic TypedDict creation for preserving Arrow struct field information in Python type hints. 
+ +This solves the problem of converting Arrow struct types back to Python type hints +that preserve full field name and type information. +""" + +from typing import TypedDict, Dict, Type, Any, get_type_hints +import pyarrow as pa +from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry + + +class StructTypeManager: + """ + Manages dynamic TypedDict creation for Arrow struct types. + + This ensures that Arrow struct types can be converted to Python type hints + that preserve all field information. + """ + + def __init__(self): + # Cache created TypedDict classes to avoid duplicates + self._struct_signature_to_typeddict: Dict[pa.StructType, Type] = {} + self._typeddict_to_struct_signature: Dict[Type, pa.StructType] = {} + self._created_type_names: set[str] = set() + + def get_or_create_typeddict_for_struct( + self, + struct_type: pa.StructType, + semantic_registry: SemanticTypeRegistry | None = None, + ) -> Type: + """ + Get or create a TypedDict class that represents the Arrow struct type. + + Args: + struct_type: PyArrow struct type + semantic_registry: Optional semantic registry for nested types + + Returns: + TypedDict class that preserves all field information + """ + # Check cache first + if struct_type in self._struct_signature_to_typeddict: + return self._struct_signature_to_typeddict[struct_type] + + # Create field specifications for TypedDict + field_specs = {} + for field in struct_type: + field_name = field.name + python_type = self._convert_arrow_type_to_python_type( + field.type, semantic_registry + ) + field_specs[field_name] = python_type + + # Generate unique name for the TypedDict + type_name = self._generate_unique_type_name(field_specs) + + # Create TypedDict dynamically + typeddict_class = TypedDict(type_name, field_specs) + + # Cache the mapping + self._struct_signature_to_typeddict[struct_type] = typeddict_class + self._typeddict_to_struct_signature[typeddict_class] = struct_type + + return typeddict_class + + def get_struct_type_for_typeddict( + self, typeddict_class: Type + ) -> pa.StructType | None: + """Get the Arrow struct type for a dynamically created TypedDict.""" + return self._typeddict_to_struct_signature.get(typeddict_class) + + def is_dynamic_typeddict(self, python_type: Type) -> bool: + """Check if a type is one of our dynamically created TypedDicts.""" + return python_type in self._typeddict_to_struct_signature + + def _convert_arrow_type_to_python_type( + self, arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None + ) -> Type: + """Convert Arrow type to Python type, handling nested structs.""" + + # Handle nested struct types recursively + if pa.types.is_struct(arrow_type): + # Check if it's a registered semantic type first + if semantic_registry: + python_type = semantic_registry.get_python_type_for_struct_signature( + arrow_type + ) + if python_type: + return python_type + + # Create dynamic TypedDict for unregistered struct + return self.get_or_create_typeddict_for_struct( + arrow_type, semantic_registry + ) + + # For non-struct types, use standard conversion + from orcapod.semantic_types.semantic_converters import arrow_type_to_python + + return arrow_type_to_python(arrow_type, semantic_registry) + + def _generate_unique_type_name(self, field_specs: Dict[str, Type]) -> str: + """Generate a unique name for the TypedDict based on field specifications.""" + + # Create a descriptive name based on field names + field_names = sorted(field_specs.keys()) + if len(field_names) <= 3: + base_name = "Struct_" + 
"_".join(field_names) + else: + base_name = f"Struct_{len(field_names)}fields" + + # Ensure uniqueness + counter = 1 + type_name = base_name + while type_name in self._created_type_names: + type_name = f"{base_name}_{counter}" + counter += 1 + + self._created_type_names.add(type_name) + return type_name + + +# Global instance for managing struct types +_struct_type_manager = StructTypeManager() + + +def arrow_struct_to_python_type( + struct_type: pa.StructType, semantic_registry: SemanticTypeRegistry | None = None +) -> Type: + """ + Convert Arrow struct type to Python type hint that preserves field information. + + This creates a TypedDict that exactly matches the Arrow struct fields. + + Args: + struct_type: PyArrow struct type to convert + semantic_registry: Optional semantic registry for registered types + + Returns: + TypedDict class that preserves all field names and types + + Example: + struct -> TypedDict with name: str, age: int + """ + # First check if it's a registered semantic type + if semantic_registry: + python_type = semantic_registry.get_python_type_for_struct_signature( + struct_type + ) + if python_type: + return python_type + + # Create dynamic TypedDict for unregistered struct + return _struct_type_manager.get_or_create_typeddict_for_struct( + struct_type, semantic_registry + ) + + +def is_dynamic_struct_type(python_type: Type) -> bool: + """Check if a Python type is a dynamically created struct TypedDict.""" + return _struct_type_manager.is_dynamic_typeddict(python_type) + + +def get_struct_signature_for_dynamic_type(python_type: Type) -> pa.StructType | None: + """Get the Arrow struct signature for a dynamically created TypedDict.""" + return _struct_type_manager.get_struct_type_for_typeddict(python_type) + + +class DynamicStructConverter: + """Converter for dynamically created TypedDict structs.""" + + def __init__(self, typeddict_class: Type, struct_type: pa.StructType): + self.typeddict_class = typeddict_class + self.struct_type = struct_type + self._semantic_type_name = f"dynamic_struct_{typeddict_class.__name__.lower()}" + + @property + def semantic_type_name(self) -> str: + return self._semantic_type_name + + @property + def python_type(self) -> Type: + return self.typeddict_class + + @property + def arrow_struct_type(self) -> pa.StructType: + return self.struct_type + + def python_to_struct_dict(self, value: dict) -> dict: + """Convert TypedDict to Arrow struct dict (no conversion needed).""" + if not isinstance(value, dict): + raise TypeError( + f"Expected dict for {self.typeddict_class}, got {type(value)}" + ) + + # Validate that all required fields are present + type_hints = get_type_hints(self.typeddict_class) + for field_name in type_hints: + if field_name not in value: + raise ValueError( + f"Missing required field '{field_name}' for {self.typeddict_class}" + ) + + return value.copy() + + def struct_dict_to_python(self, struct_dict: dict) -> dict: + """Convert Arrow struct dict to TypedDict (no conversion needed).""" + return struct_dict.copy() + + def can_handle_python_type(self, python_type: Type) -> bool: + return python_type == self.typeddict_class + + +def register_dynamic_struct_converter( + registry: SemanticTypeRegistry, typeddict_class: Type, struct_type: pa.StructType +) -> None: + """Register a converter for a dynamically created TypedDict struct.""" + converter = DynamicStructConverter(typeddict_class, struct_type) + registry.register_converter(converter) + + +# Updated arrow_type_to_python function that preserves struct field information 
+def enhanced_arrow_type_to_python( + arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None = None +) -> Type: + """ + Enhanced version of arrow_type_to_python that preserves struct field information. + + For struct types, this creates TypedDict classes that preserve all field names and types. + """ + + # Handle struct types with full field preservation + if pa.types.is_struct(arrow_type): + return arrow_struct_to_python_type(arrow_type, semantic_registry) + + # For non-struct types, use standard conversion + from orcapod.semantic_types.semantic_converters import arrow_type_to_python + + return arrow_type_to_python(arrow_type, semantic_registry) + + +# Example usage and demonstration +if __name__ == "__main__": + print("=== Dynamic TypedDict Creation for Arrow Structs ===\n") + + from sample_converters import create_standard_semantic_registry + + # Create semantic registry + registry = create_standard_semantic_registry() + + # Test with various Arrow struct types + test_structs = [ + pa.struct([("name", pa.string()), ("age", pa.int64())]), + pa.struct([("x", pa.float64()), ("y", pa.float64()), ("z", pa.float64())]), + pa.struct( + [ + ("person", pa.struct([("name", pa.string()), ("age", pa.int64())])), + ("active", pa.bool_()), + ] + ), + ] + + print("Converting Arrow struct types to Python type hints:") + print("=" * 55) + + created_types = [] + for i, struct_type in enumerate(test_structs): + python_type = arrow_struct_to_python_type(struct_type, registry) + created_types.append(python_type) + + print(f"\nStruct {i + 1}:") + print(f" Arrow: {struct_type}") + print(f" Python: {python_type}") + print(f" Type name: {python_type.__name__}") + + # Show field information + type_hints = get_type_hints(python_type) + print(f" Fields: {type_hints}") + + print(f"\n" + "=" * 55) + print("Testing usage of created TypedDict types:") + + # Test the first created type (name, age) + PersonType = created_types[0] + person_data: PersonType = {"name": "Alice", "age": 30} + print(f"\nPerson data: {person_data}") + print( + f"Type check: {isinstance(person_data, dict)}" + ) # Still a regular dict at runtime + print(f"Field access: name={person_data['name']}, age={person_data['age']}") + + # Test nested struct type + if len(created_types) > 2: + NestedType = created_types[2] + # For nested struct, we need to create the inner struct too + inner_person: PersonType = {"name": "Bob", "age": 25} + nested_data: NestedType = {"person": inner_person, "active": True} + print(f"\nNested data: {nested_data}") + print(f"Nested access: person.name={nested_data['person']['name']}") + + print(f"\n" + "=" * 55) + print("Benefits of this approach:") + print("✓ Full field information preserved in type hints") + print("✓ Arrow struct -> Python type conversion is complete") + print("✓ Type checkers understand the structure") + print("✓ Runtime is still regular dicts (zero overhead)") + print("✓ Perfect round-trip: Python -> Arrow -> Python") + print("✓ Handles nested structs recursively") + + print( + f"\nDynamic TypedDict creation successfully preserves all Arrow struct field information!" 
+ ) diff --git a/src/orcapod/semantic_types/table_converters.py b/src/orcapod/semantic_types/unused/table_converters.py similarity index 88% rename from src/orcapod/semantic_types/table_converters.py rename to src/orcapod/semantic_types/unused/table_converters.py index c1f9265..d9161ef 100644 --- a/src/orcapod/semantic_types/table_converters.py +++ b/src/orcapod/semantic_types/unused/table_converters.py @@ -6,7 +6,7 @@ """ from collections.abc import Mapping -from typing import Any, Self +from typing import Any, Protocol, Self import pyarrow as pa from orcapod.types import TypeSpec @@ -125,6 +125,42 @@ def get_regular_fields(self) -> dict[str, type]: return regular_fields +class SemanticTableConverter(Protocol): + """Protocol for semantic table converters. + + This defines the interface for converting between Python dicts and Arrow tables + with semantic types. + """ + + def get_struct_converter(self, field: str) -> StructConverter | None: + """Get struct converter for a specific field in table.""" + ... + + def python_dict_to_struct_dict( + self, data_dict: Mapping[str, Any] + ) -> dict[str, Any]: + """Convert Python dict to struct dict for semantic fields.""" + ... + + def struct_dict_to_python_dict( + self, struct_dict: Mapping[str, Any] + ) -> dict[str, Any]: + """Convert struct dict back to Python dict for semantic fields.""" + ... + + def python_dict_to_arrow_table(self, data_dict: dict[str, Any]) -> pa.Table: + """Convert single Python dict to Arrow table.""" + ... + + def python_dicts_to_arrow_table(self, data_dicts: list[dict[str, Any]]) -> pa.Table: + """Convert list of Python dicts to Arrow table with semantic structs.""" + ... + + def arrow_table_to_python_dicts(self, table: pa.Table) -> list[dict[str, Any]]: + """Convert Arrow table back to list of Python dicts.""" + ... + + class SchemaSemanticTableConverter: """Schema-specific semantic converter that pre-resolves semantic type converters for efficiency. @@ -155,6 +191,14 @@ def __init__(self, schema: SemanticSchema): else: self.regular_fields.add(field_name) + def get_semantic_fields(self) -> tuple[str, ...]: + """Get names of fields that are semantic types.""" + return tuple(self.field_converters.keys()) + + def get_struct_converter_for_field(self, field: str) -> StructConverter | None: + """Get struct converter for a specific field.""" + return self.field_converters.get(field) + @classmethod def from_python_schema( cls, python_schema: TypeSpec, registry: SemanticTypeRegistry @@ -250,7 +294,7 @@ def python_dict_to_arrow_table(self, data_dict: dict[str, Any]) -> pa.Table: return self.python_dicts_to_arrow_table([data_dict]) -class SemanticTableConverter: +class AutoSemanticTableConverter: """General-purpose converter for working with semantic types without pre-defined schema.""" def __init__(self, registry: SemanticTypeRegistry): diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py index ca29627..c08aa6a 100644 --- a/src/orcapod/types/__init__.py +++ b/src/orcapod/types/__init__.py @@ -1,6 +1,5 @@ from .core import PathLike, PathSet, TypeSpec, DataValue from . import typespec_utils -from .defaults import DEFAULT_REGISTRY as default_registry Packet = dict[str, str] PacketLike = Packet diff --git a/src/orcapod/types/defaults.py b/src/orcapod/types/defaults.py deleted file mode 100644 index f7b5773..0000000 --- a/src/orcapod/types/defaults.py +++ /dev/null @@ -1,51 +0,0 @@ -# A collection of versioned hashers that provide a "default" implementation of hashers. 
-from orcapod.utils.object_spec import parse_objectspec - - -from orcapod.types.semantic_types import ( - SemanticTypeRegistry, - SemanticType, - CanonicalPath, - PathlibPathConverter, - ArrowStringPathConverter, -) - -CURRENT_VERSION = "v0.1" - - -semantic_path_objectspec = { - "v0.1": { - "_class": "orcapod.types.semantic_types.SemanticType", - "_config": { - "name": "path", - "description": "File system path representation", - "python_converters": [ - { - "_class": "orcapod.types.semantic_types.PathlibPathConverter", - } - ], - "arrow_converters": [ - { - "_class": "orcapod.types.semantic_types.ArrowStringPathConverter", - } - ], - }, - } -} - -semantic_registry_objectspec = { - "v0.1": { - "_class": "orcapod.types.semantic_types.SemanticTypeRegistry", - "_config": {"semantic_types": [semantic_path_objectspec["v0.1"]]}, - } -} - - -SEMANTIC_PATH = SemanticType[CanonicalPath]( - "path", - "File system path representation", - python_converters=[PathlibPathConverter()], - arrow_converters=[ArrowStringPathConverter()], -) - -DEFAULT_REGISTRY = SemanticTypeRegistry([SEMANTIC_PATH]) diff --git a/src/orcapod/types/schemas.py b/src/orcapod/types/schemas.py deleted file mode 100644 index 57f0551..0000000 --- a/src/orcapod/types/schemas.py +++ /dev/null @@ -1,357 +0,0 @@ -from typing import Self -from orcapod.types.core import DataType, TypeSpec -from orcapod.types.semantic_types import ( - SemanticType, - SemanticTypeRegistry, - PythonArrowConverter, -) -import pyarrow as pa -import datetime - -# This mapping is expected to be stable -# Be sure to test this assumption holds true -DEFAULT_ARROW_TYPE_LUT = { - int: pa.int64(), - float: pa.float64(), - str: pa.large_string(), - bool: pa.bool_(), -} - - -def python_to_arrow_type(python_type: type) -> pa.DataType: - if python_type in DEFAULT_ARROW_TYPE_LUT: - return DEFAULT_ARROW_TYPE_LUT[python_type] - raise TypeError(f"Converstion of python type {python_type} is not supported yet") - - -def arrow_to_python_type(arrow_type: pa.DataType) -> type: - if pa.types.is_integer(arrow_type): - return int - elif pa.types.is_floating(arrow_type): - return float - elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): - return str - elif pa.types.is_boolean(arrow_type): - return bool - elif pa.types.is_date(arrow_type): - return datetime.date - elif pa.types.is_timestamp(arrow_type): - return datetime.datetime - elif pa.types.is_binary(arrow_type): - return bytes - else: - raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") - - -class PythonSchema(dict[str, DataType]): - """ - A schema for Python data types, mapping string keys to Python types. - - This is used to define the expected structure of data packets in OrcaPod. - - Attributes - ---------- - keys : str - The keys of the schema. - values : type - The types corresponding to each key. - - Examples - -------- - >>> schema = PythonSchema(name=str, age=int) - >>> print(schema) - {'name': , 'age': } - """ - - def copy(self) -> "PythonSchema": - return PythonSchema(self) - - def to_semantic_schema( - self, semantic_type_registry: SemanticTypeRegistry - ) -> "SemanticSchema": - """ - Convert the Python schema to a semantic schema using the provided semantic type registry. - - Parameters - ---------- - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. 
- - Examples - -------- - >>> python_schema = PythonSchema(name=str, age=int) - >>> semantic_schema = python_schema.to_semantic_schema(registry) - >>> print(semantic_schema) - {'name': (str, None), 'age': (int, None)} - """ - return SemanticSchema.from_typespec(self, semantic_type_registry) - - def to_arrow_schema( - self, - semantic_type_registry: SemanticTypeRegistry | None = None, - converters: dict[str, PythonArrowConverter] | None = None, - ) -> pa.Schema: - """ - Convert the Python schema to an Arrow schema. - If converters are provided, they are used to convert the schema. Note that - no validation is performed on the converters, so they must be compatible with the schema. - """ - if converters is not None: - # If converters are provided, use them to convert the schema - fields = [] - for field_name, python_type in self.items(): - if field_name in converters: - converter = converters[field_name] - arrow_type = converter.arrow_type - metadata = None - if converter.semantic_type_name is not None: - metadata = { - b"semantic_type": converter.semantic_type_name.encode( - "utf-8" - ) - } - else: - arrow_type = python_to_arrow_type(python_type) - metadata = None - fields.append(pa.field(field_name, arrow_type, metadata=metadata)) - return pa.schema(fields) - - if semantic_type_registry is None: - raise ValueError( - "semantic_type_registry must be provided if converters are not" - ) - # Otherwise, convert using the semantic type registry - return self.to_semantic_schema(semantic_type_registry).to_arrow_schema() - - @classmethod - def from_semantic_schema(cls, semantic_schema: "SemanticSchema") -> Self: - """ - Create a PythonSchema from a SemanticSchema. - - Parameters - ---------- - semantic_schema : SemanticSchema - The semantic schema to convert. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. - """ - return cls(semantic_schema.get_python_types()) - - @classmethod - def from_arrow_schema( - cls, - arrow_schema: pa.Schema, - semantic_type_registry: SemanticTypeRegistry | None = None, - converters: dict[str, PythonArrowConverter] | None = None, - ) -> Self: - """ - Create a PythonSchema from an Arrow schema. - - Parameters - ---------- - arrow_schema : pa.Schema - The Arrow schema to convert. - semantic_type_registry : SemanticTypeRegistry - The registry containing semantic type information. - skip_system_columns : bool, optional - Whether to skip system columns (default is True). - converters : dict[str, PythonArrowConverter], optional - A dictionary of converters to use for converting the schema. If provided, the schema will be - converted using the converters. If not provided, the schema will be converted using the semantic type - registry. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. 
- """ - if converters is not None: - # If converters are provided, use them to convert the schema - python_types = {} - for field in arrow_schema: - # TODO: consider performing validation of semantic type - if field.name in converters: - converter = converters[field.name] - python_types[field.name] = converter.python_type - else: - python_types[field.name] = arrow_to_python_type(field.type) - return cls(python_types) - - if semantic_type_registry is None: - raise ValueError( - "semantic_type_registry must be provided if converters are not" - ) - semantic_schema = SemanticSchema.from_arrow_schema( - arrow_schema, - semantic_type_registry, - ) - return cls(semantic_schema.get_python_types()) - - -class SemanticSchema(dict[str, type | SemanticType]): - """ - A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. - - This is used to define the expected structure of data packets with semantic types in OrcaPod. - - Attributes - ---------- - keys : str - The keys of the schema. - values : type | SemanticType - Either type for simple fields or SemanticType for semantic fields. - - Examples - -------- - >>> schema = SemanticSchema(image=SemanticType('path'), age=int) - >>> print(schema) - {"image": SemanticType(name='path'), "age": })} - """ - - def get_semantic_fields(self) -> dict[str, SemanticType]: - """ - Get a dictionary of semantic fields in the schema. - - Returns - ------- - dict[str, SemanticType] - A dictionary mapping keys to their corresponding SemanticType. - """ - return {k: v for k, v in self.items() if isinstance(v, SemanticType)} - - def get_python_types(self) -> dict[str, type]: - """ - Get the Python types for all keys in the schema. - - Returns - ------- - dict[str, type] - A dictionary mapping keys to their corresponding Python types. - """ - return { - k: v.get_default_python_type() if isinstance(v, SemanticType) else v - for k, v in self.items() - } - - def get_arrow_types(self) -> dict[str, tuple[pa.DataType, str | None]]: - """ - Get the Arrow types for all keys in the schema. - - Returns - ------- - dict[str, tuple[pa.DataType, str|None]] - A dictionary mapping keys to tuples of Arrow types. If the field has a semantic type, - the second element of the tuple is the semantic type name; otherwise, it is None. - """ - return { - k: (v.get_default_arrow_type(), v.name) - if isinstance(v, SemanticType) - else (python_to_arrow_type(v), None) - for k, v in self.items() - } - - def to_arrow_schema(self) -> pa.Schema: - """ - Get the Arrow schema, which is a PythonSchema representation of the semantic schema. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. - """ - fields = [] - for k, (arrow_type, semantic_type_name) in self.get_arrow_types().items(): - if semantic_type_name is not None: - field = pa.field( - k, - arrow_type, - metadata={b"semantic_type": semantic_type_name.encode("utf-8")}, - ) - else: - field = pa.field(k, arrow_type) - fields.append(field) - - return pa.schema(fields) - - def to_python_schema(self) -> PythonSchema: - """ - Get the Python schema, which is a PythonSchema representation of the semantic schema. - - Returns - ------- - PythonSchema - A new schema mapping keys to Python types. - """ - return PythonSchema.from_semantic_schema(self) - - @classmethod - def from_arrow_schema( - cls, - arrow_schema: pa.Schema, - semantic_type_registry: SemanticTypeRegistry, - ) -> Self: - """ - Create a SemanticSchema from an Arrow schema. 
- - Parameters - ---------- - arrow_schema : pa.Schema - The Arrow schema to convert. - - Returns - ------- - SemanticSchema - A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - """ - - semantic_schema = {} - for field in arrow_schema: - field_type = None - if field.metadata is not None: - semantic_type_name = field.metadata.get(b"semantic_type", b"").decode() - if semantic_type_name: - semantic_type = semantic_type_registry.get_semantic_type( - semantic_type_name - ) - if semantic_type is None: - raise ValueError( - f"Semantic type '{semantic_type_name}' not found in registry" - ) - if not semantic_type.supports_arrow_type(field.type): - raise ValueError( - f"Semantic type '{semantic_type.name}' does not support Arrow field of type '{field.type}'" - ) - field_type = semantic_type - - if ( - field_type is None - ): # was not set to semantic type, so fallback to simple conversion - field_type = arrow_to_python_type(field.type) - - semantic_schema[field.name] = field_type - return cls(semantic_schema) - - @classmethod - def from_typespec( - cls, - typespec: TypeSpec, - semantic_type_registry: SemanticTypeRegistry, - ) -> Self: - semantic_schema = {} - for key, python_type in typespec.items(): - semantic_type = semantic_type_registry.get_semantic_type_for_python_type( - python_type - ) - if semantic_type is not None: - semantic_schema[key] = semantic_type - else: - semantic_schema[key] = python_type - return cls(semantic_schema) diff --git a/src/orcapod/types/semantic_converter.py b/src/orcapod/types/semantic_converter.py deleted file mode 100644 index 047ad2c..0000000 --- a/src/orcapod/types/semantic_converter.py +++ /dev/null @@ -1,135 +0,0 @@ -from orcapod.types.semantic_types import PythonArrowConverter -from orcapod.types.schemas import PythonSchema, SemanticSchema -from orcapod.types import TypeSpec, typespec_utils as tsutils - -from typing import Any, Self -from collections.abc import Mapping -import pyarrow as pa -import logging - -logger = logging.getLogger(__name__) - - -class SemanticConverter: - @classmethod - def from_semantic_schema(cls, semantic_schema: SemanticSchema) -> Self: - converter_lut = {} - for ( - field, - semantic_type, - ) in semantic_schema.get_semantic_fields().items(): - converter_lut[field] = PythonArrowConverter.from_semantic_type( - semantic_type - ) - return cls(converter_lut) - - def __init__( - self, - converter_lut: dict[str, PythonArrowConverter], - ): - self._converter_lut = converter_lut - - def from_python_to_arrow_schema(self, python_schema: TypeSpec) -> pa.Schema: - """Convert a Python schema to an Arrow schema""" - return PythonSchema(python_schema).to_arrow_schema( - converters=self._converter_lut - ) - - def from_arrow_to_python_schema(self, arrow_schema: pa.Schema) -> PythonSchema: - """Convert an Arrow schema to a Python schema""" - return PythonSchema.from_arrow_schema( - arrow_schema, converters=self._converter_lut - ) - - def from_python_to_arrow( - self, python_data: Mapping[str, Any], python_schema: TypeSpec | None = None - ) -> pa.Table: - """Convert a dictionary of Python values to Arrow arrays""" - if python_schema is None: - # infer schema from data - python_schema = PythonSchema(tsutils.get_typespec_from_dict(python_data)) - logger.warning( - f"Inferred schema {python_schema} from Python data {python_data}. Note that this may not behave as expected." 
- ) - - arrow_schema = self.from_python_to_arrow_schema(python_schema) - - arrow_data = {} - for field, value in python_data.items(): - if field in self._converter_lut: - converter = self._converter_lut[field] - arrow_data[field] = converter.from_python_to_arrow(value) - else: - arrow_data[field] = [value] - return pa.Table.from_pydict(arrow_data, schema=arrow_schema) - - def from_arrow_to_arrow_compat_dict( - self, arrow_data: pa.Table - ) -> list[dict[str, Any]]: - """Convert Arrow data to a dictionary of Python values""" - return arrow_data.to_pylist() - - def from_python_to_arrow_compat_dict( - self, python_data: Mapping[str, Any] - ) -> dict[str, Any]: - arrow_compat_dict = dict(python_data) - for field, converter in self._converter_lut.items(): - if field in python_data: - arrow_compat_dict[field] = converter.from_python_to_arrow( - python_data[field] - ) - return arrow_compat_dict - - def from_arrow_to_python(self, arrow_data: pa.Table) -> list[dict[str, Any]]: - """Convert a dictionary of Arrow arrays to Python values""" - - values = [] - for column_name in arrow_data.column_names: - column = arrow_data[column_name] - if column_name not in self._converter_lut: - values.append(column.to_pylist()) - else: - converter = self._converter_lut[column_name] - values.append(converter.from_arrow_to_python(column)) - all_entries = [] - - for entry in zip(*values): - assert len(entry) == len(arrow_data.column_names), ( - "Mismatch in number of columns and values" - ) - all_entries.append(dict(zip(arrow_data.column_names, entry))) - - return all_entries - - def as_dict(self) -> dict[str, PythonArrowConverter]: - """Return the converter lookup table as a dictionary.""" - return self._converter_lut.copy() - - def join(self, other: Self, strict: bool = False) -> Self: - """Join two SemanticConverters by merging their converter lookup tables.""" - if not isinstance(other, SemanticConverter): - raise TypeError("Can only join with another SemanticConverter.") - - new_converter_lut = self._converter_lut.copy() - for key, converter in other._converter_lut.items(): - if key in new_converter_lut: - if strict: - raise ValueError( - f"Key '{key}' already exists in the converter lookup table. Cannot overwrite in strict mode." - ) - logger.warning( - f"Key '{key}' already exists in the converter lookup table. Overwriting with new converter." - ) - new_converter_lut[key] = converter - - return self.__class__(new_converter_lut) - - def rename(self, column_mapping: Mapping[str, str]) -> Self: - """Rename columns in the converter lookup table.""" - new_converter_lut = {} - new_converter_lut = { - column_mapping.get(key, key): converter - for key, converter in self._converter_lut.items() - } - - return self.__class__(new_converter_lut) diff --git a/src/orcapod/types/semantic_types.py b/src/orcapod/types/semantic_types.py deleted file mode 100644 index c0eaef2..0000000 --- a/src/orcapod/types/semantic_types.py +++ /dev/null @@ -1,623 +0,0 @@ -from typing import Any, Self, cast -from abc import ABC, abstractmethod -from dataclasses import dataclass -from pathlib import Path -import pyarrow as pa - -from collections.abc import Collection - - -# Converter interfaces using modern generics with ABC -class PythonConverter[T, R](ABC): - """ - Abstract base class for converters between canonical and Python representation types. 
- T: canonical type, R: Python representation type - """ - - def __init__(self): - # Automatically infer types from inheritance - self._python_type = self._infer_python_type() - - def _infer_python_type(self) -> type[R]: - """Infer the Python type from __orig_bases__""" - for base in getattr(self.__class__, "__orig_bases__", []): - if hasattr(base, "__origin__") and issubclass( - base.__origin__, PythonConverter - ): - # Get the R type parameter (second argument) - args = getattr(base, "__args__", ()) - if len(args) >= 2: - return args[1] # R is the second type parameter - raise RuntimeError(f"Could not infer Python type for {self.__class__.__name__}") - - @abstractmethod - def to_canonical(self, value: R) -> T: - """Convert from Python representation to canonical form""" - pass - - @abstractmethod - def from_canonical(self, value: T) -> R: - """Convert from canonical to Python representation form""" - pass - - @abstractmethod - def can_handle(self, python_type: type) -> bool: ... - - def get_python_type(self) -> type[R]: - """Get the Python type this converter converts into (auto-inferred)""" - return self._python_type - - -class ArrowConverter[T](ABC): - """ - Abstract base class for converters between canonical and Arrow representation types. - T: canonical type - """ - - @abstractmethod - def to_canonical(self, value: pa.Array) -> list[T]: - """Convert from Arrow representation to canonical form""" - pass - - # @abstractmethod - # def from_canonical_to_arrow_compatible(self, value: T) -> Any: - # """Convert from canonical to Arrow-compatible representation""" - # pass - - # @abstractmethod - # def from_arrow_compatible_to_canonical(self, value: Any) -> T: - # """Convert from Arrow-compatible representation to canonical form""" - # pass - - @abstractmethod - def from_canonical(self, value: T | Collection[T]) -> pa.Array: - """Convert from canonical to Arrow representation""" - pass - - @abstractmethod - def can_handle(self, arrow_type: pa.DataType) -> bool: ... 
- - @abstractmethod - def get_arrow_type(self) -> pa.DataType: - """Get the Arrow DataType this converter handles""" - pass - - -# Canonical types with explicit definitions -@dataclass(frozen=True) -class CanonicalPath: - """Canonical representation of a file system path""" - - path_str: str - is_absolute: bool = False - - def __str__(self) -> str: - return self.path_str - - def __post_init__(self) -> None: - if not self.path_str: - raise ValueError("Path string cannot be empty") - - -@dataclass(frozen=True) -class CanonicalTimestamp: - """Canonical representation of a timestamp""" - - timestamp: int - timezone: str = "UTC" - - def __post_init__(self) -> None: - if self.timestamp < 0: - raise ValueError("Timestamp cannot be negative") - - -@dataclass(frozen=True) -class CanonicalURL: - """Canonical representation of a URL""" - - url: str - scheme: str - host: str - - def __post_init__(self) -> None: - if not self.url.startswith(f"{self.scheme}://"): - raise ValueError(f"URL must start with {self.scheme}://") - - -# Python converters for Path -class PathlibPathConverter(PythonConverter[CanonicalPath, Path]): - """Converter for pathlib.Path objects""" - - def to_canonical(self, value: Path) -> CanonicalPath: - return CanonicalPath(path_str=str(value), is_absolute=value.is_absolute()) - - def from_canonical(self, value: CanonicalPath) -> Path: - return Path(value.path_str) - - def can_handle(self, python_type: type) -> bool: - return issubclass(python_type, Path) - - -# Arrow converters for Path -class ArrowStringPathConverter(ArrowConverter[CanonicalPath]): - """Converter for Arrow string representation of paths""" - - def to_canonical(self, value: pa.Array) -> list[CanonicalPath]: - return [ - CanonicalPath(v, is_absolute=Path(v).is_absolute()) - for v in value.to_pylist() - ] - - def from_canonical( - self, value: CanonicalPath | Collection[CanonicalPath] - ) -> pa.Array: - if isinstance(value, CanonicalPath): - value = [value] - return pa.array([v.path_str for v in value], type=pa.large_string()) - - def from_canonical_to_arrow_compatible(self, value: CanonicalPath) -> str: - return value.path_str - - def from_arrow_compatible_to_canonical(self, value: str) -> CanonicalPath: - return CanonicalPath(path_str=value, is_absolute=Path(value).is_absolute()) - - def can_handle(self, arrow_type: pa.DataType) -> bool: - return arrow_type == pa.large_string() - - def get_arrow_type(self) -> pa.DataType: - return pa.large_string() - - -# Enhanced SemanticType with explicit Python and Arrow handling -class SemanticType[T]: - """ - Represents a semantic type with explicit Python/Arrow converters. - - A SemanticType is a central concept that: - 1. Defines a canonical representation (T) for a domain concept - 2. Manages separate Python and Arrow converters - 3. Provides explicit methods for Python and Arrow operations - 4. Maintains type safety while allowing runtime discovery - - Type parameter T represents the canonical representation type. 
- """ - - def __init__( - self, - name: str, - description: str = "", - python_converters: Collection[PythonConverter[T, Any]] | None = None, - arrow_converters: Collection[ArrowConverter[T]] | None = None, - ): - self.name = name - self.description = description - - self._python_type_converters: list[PythonConverter[T, Any]] = [] - self._arrow_type_converters: list[ArrowConverter[T]] = [] - - # Default converters - self._default_python_converter: PythonConverter[T, Any] | None = None - self._default_arrow_converter: ArrowConverter[T] | None = None - - if python_converters is not None: - for converter in python_converters: - self.register_python_converter( - converter, - set_default=self._default_python_converter is None, - force=False, - ) - - if arrow_converters is not None: - for converter in arrow_converters: - self.register_arrow_converter( - converter, - set_default=self._default_arrow_converter is None, - force=False, - ) - - def get_default_python_type(self) -> type[T]: - """Get the default Python type for this semantic type""" - if self._default_python_converter: - return self._default_python_converter.get_python_type() - raise ValueError( - f"No default Python converter registered for semantic type '{self.name}'" - ) - - def get_default_arrow_type(self) -> pa.DataType: - """Get the default Arrow DataType for this semantic type""" - if self._default_arrow_converter: - return self._default_arrow_converter.get_arrow_type() - raise ValueError( - f"No default Arrow converter registered for semantic type '{self.name}'" - ) - - def register_python_converter[R]( - self, - converter: PythonConverter[T, R], - set_default: bool = False, - force: bool = False, - ): - """ - Register a Python converter - """ - if converter not in self._python_type_converters: - self._python_type_converters.append(converter) - - if set_default: - if self._default_python_converter is not None and not force: - raise ValueError( - f"Default Python converter already set for semantic type '{self.name}'" - ) - self._default_python_converter = converter - - def register_arrow_converter( - self, - converter: ArrowConverter[T], - set_default: bool = False, - force: bool = False, - ) -> None: - """Register an Arrow converter""" - if converter not in self._arrow_type_converters: - self._arrow_type_converters.append(converter) - - if set_default: - if self._default_arrow_converter is not None and not force: - raise ValueError( - f"Default Arrow converter already set for semantic type '{self.name}'" - ) - self._default_arrow_converter = converter - - # Python-specific methods - def get_python_converter_for_type( - self, python_type: type - ) -> PythonConverter[T, Any] | None: - """Find a Python converter that can handle the given type""" - for converter in self._python_type_converters: - if converter.can_handle(python_type): - return converter - return None - - def get_arrow_converter_for_type( - self, arrow_type: pa.DataType - ) -> ArrowConverter[T] | None: - """Find an Arrow converter for the given Arrow DataType""" - for converter in self._arrow_type_converters: - if converter.can_handle(arrow_type): - return converter - return None - - def get_python_converter_with_output_type( - self, output_type: type - ) -> PythonConverter[T, Any] | None: - """Get a Python converter that can handle the specified output type""" - for converter in self._python_type_converters: - if issubclass(converter.get_python_type(), output_type): - return converter - return None - - def get_arrow_converter_with_output_type( - self, output_type: 
pa.DataType - ) -> ArrowConverter[T] | None: - for converter in self._arrow_type_converters: - if output_type == converter.get_arrow_type(): - return converter - return None - - def supports_python_type(self, python_type: type) -> bool: - return self.get_python_converter_for_type(python_type) is not None - - def supports_arrow_type(self, arrow_type: pa.DataType) -> bool: - return self.get_arrow_converter_for_type(arrow_type) is not None - - @property - def default_python_converter(self) -> PythonConverter[T, Any] | None: - """Get the default Python converter""" - return self._default_python_converter - - @property - def default_arrow_converter(self) -> ArrowConverter[T] | None: - return self._default_arrow_converter - - def to_canonical_from_python(self, value: Any) -> T: - """Convert Python value to canonical form""" - converter = self.get_python_converter_for_type(type(value)) - if not converter: - raise ValueError( - f"No Python converter found for {type(value)} in semantic type '{self.name}'" - ) - - return converter.to_canonical(value) - - def from_canonical_to_python( - self, value: T, target_type: type | None = None - ) -> Any: - """Convert from canonical to Python representation""" - if target_type is None: - converter = self.default_python_converter - if not converter: - raise ValueError( - f"No default Python converter for semantic type '{self.name}'" - ) - else: - converter = self.get_python_converter_for_type(target_type) - if not converter: - raise ValueError( - f"No converter found for target type '{target_type}' in semantic type '{self.name}'" - ) - - return converter.from_canonical(value) - - def to_canonical_from_arrow(self, value: pa.Array) -> list[T]: - """Convert Arrow value to canonical form using explicit Arrow DataType""" - converter = self.get_arrow_converter_for_type(value.type) - if not converter: - raise ValueError( - f"No Arrow converter found for type '{value.type}' in semantic type '{self.name}'" - ) - - canonical = converter.to_canonical(value) - - return canonical - - def from_canonical_to_arrow( - self, value: T, target_type: pa.DataType | None = None - ) -> pa.Array: - """Convert from canonical to Arrow representation using explicit Arrow DataType""" - - if target_type is None: - converter = self.default_arrow_converter - if not converter: - raise ValueError( - f"No default Arrow converter for semantic type '{self.name}'" - ) - else: - converter = self.get_arrow_converter_for_type(target_type) - if not converter: - raise ValueError( - f"No Arrow converter found for target type '{target_type}' in semantic type '{self.name}'" - ) - - return converter.from_canonical(value) - - def get_python_types(self) -> list[type]: - """Get all supported output Python DataTypes""" - return [ - converter.get_python_type() for converter in self._python_type_converters - ] - - def get_arrow_types(self) -> list[pa.DataType]: - """Get all supported output Arrow DataTypes""" - return [converter.get_arrow_type() for converter in self._arrow_type_converters] - - # Cross-system conversion methods - def convert_python_to_arrow( - self, python_value: Any, arrow_type: pa.DataType | None = None - ) -> Any: - """Convert directly from Python to Arrow representation""" - canonical = self.to_canonical_from_python(python_value) - return self.from_canonical_to_arrow(canonical, arrow_type) - - def convert_arrow_to_python( - self, arrow_value, python_type: type | None = None - ) -> list[Any]: - """Convert directly from Arrow to Python representation""" - canonical_values = 
self.to_canonical_from_arrow(arrow_value) - return [ - self.from_canonical_to_python(value, target_type=python_type) - for value in canonical_values - ] - - def __str__(self) -> str: - return f"SemanticType(name='{self.name}')" - - def __repr__(self) -> str: - python_count = len(self._python_type_converters) - arrow_count = len(self._arrow_type_converters) - return ( - f"SemanticType(name='{self.name}', " - f"python_converters={python_count}, " - f"arrow_converters={arrow_count})" - ) - - -# Registry with explicit Python and Arrow handling -class SemanticTypeRegistry: - """Registry that manages SemanticType objects with explicit Python/Arrow operations""" - - def __init__(self, semantic_types: Collection[SemanticType] | None = None): - self._semantic_type_lut: dict[str, SemanticType] = {} - self._python_to_semantic_lut: dict[type, SemanticType] = {} - if semantic_types is not None: - for semantic_type in semantic_types: - self.register_semantic_type(semantic_type) - - def register_semantic_type[T](self, semantic_type: SemanticType[T]): - """Register a semantic type""" - if semantic_type.name not in self._semantic_type_lut: - self._semantic_type_lut[semantic_type.name] = semantic_type - else: - raise ValueError( - f"Semantic type {self._semantic_type_lut[semantic_type.name]} is already registered for semantic name {semantic_type.name}" - ) - - python_type = semantic_type.get_default_python_type() - if python_type is None: - raise ValueError( - f"Semantic type {semantic_type.name} does not have a default Python type" - ) - if python_type in self._python_to_semantic_lut: - raise ValueError( - f"Python type {python_type} is already registered for semantic type {self._python_to_semantic_lut[python_type]}" - ) - self._python_to_semantic_lut[python_type] = semantic_type - - def get_semantic_type_for_python_type( - self, python_type: type - ) -> SemanticType | None: - """Get a semantic type by Python type""" - - # check if it's directly registered - semantic_type = self._python_to_semantic_lut.get(python_type) - if semantic_type is None: - # check if it's a subclass - for ( - registered_type, - registered_semantic_type, - ) in self._python_to_semantic_lut.items(): - if issubclass(python_type, registered_type): - return registered_semantic_type - return semantic_type - - def get_arrow_type_for_semantic_type( - self, semantic_type_name: str - ) -> pa.DataType | None: - """Get the default Arrow DataType for a semantic type by name""" - semantic_type = self._semantic_type_lut.get(semantic_type_name) - if semantic_type: - return semantic_type.get_default_arrow_type() - return None - - def get_arrow_type_for_python_type( - self, python_type: type - ) -> tuple[str | None, pa.DataType] | None: - """Get the default Arrow DataType for a Python type""" - semantic_type = self.get_semantic_type_for_python_type(python_type) - if semantic_type: - return semantic_type.name, semantic_type.get_default_arrow_type() - return None - - def from_python_to_arrow(self, python_value: Any) -> tuple[str | None, Any]: - """Convert a Python value to Arrow-targetting representation using the semantic type registry""" - semantic_type = self.get_semantic_type_for_python_type(type(python_value)) - if semantic_type: - return semantic_type.name, semantic_type.convert_python_to_arrow( - python_value - ) - return None, python_value - - def get_semantic_type(self, name: str) -> SemanticType | None: - """Get a semantic type by name""" - return self._semantic_type_lut.get(name) - - def list_semantic_types(self) -> list[SemanticType]: - 
"""Get all registered semantic types""" - return list(self._semantic_type_lut.values()) - - def registered_with_semantic_type(self, python_type: type) -> bool: - """Check if registry has the Python type registered with a semantic type""" - return python_type in self._python_to_semantic_lut - - def supports_semantic_and_arrow_type( - self, semantic_type_name: str, arrow_type: pa.DataType - ) -> bool: - """Check if registry supports the given semantic type and Arrow DataType combination""" - semantic_type = self._semantic_type_lut.get(semantic_type_name) - if not semantic_type: - return False - return semantic_type.supports_arrow_type(arrow_type) - - -# Type-safe wrapper for semantic values -class SemanticValue[T]: - """Type-safe wrapper for semantic values""" - - def __init__(self, value: T, semantic_type: SemanticType[T]): - self._value = value - self._semantic_type = semantic_type - - @property - def value(self) -> T: - return self._value - - @property - def semantic_type(self) -> SemanticType[T]: - return self._semantic_type - - def to_python(self) -> Any: - """Convert to Python representation""" - return self._semantic_type.from_canonical_to_python(self._value) - - def to_python_type(self, python_type: type) -> Any: - """Convert to Arrow representation using specific Arrow DataType""" - return self._semantic_type.from_canonical_to_arrow(self._value, python_type) - - def to_arrow(self) -> Any: - """Convert to Arrow representation using default dtype""" - return self._semantic_type.from_canonical_to_arrow(self._value) - - def to_arrow_with_type(self, arrow_type: pa.DataType) -> Any: - """Convert to Arrow representation using specific Arrow DataType""" - return self._semantic_type.from_canonical_to_arrow(self._value, arrow_type) - - @classmethod - def from_python(cls, python_value: Any, semantic_type: SemanticType[T]) -> Self: - """Create from a Python value""" - canonical = semantic_type.to_canonical_from_python(python_value) - return cls(canonical, semantic_type) - - @classmethod - def from_arrow(cls, arrow_value: Any, semantic_type: SemanticType[T]) -> Self: - """Create from an Arrow value with explicit Arrow DataType""" - canonical = semantic_type.to_canonical_from_arrow(arrow_value) - if len(canonical) != 1: - raise ValueError( - f"Expected single value from Arrow, got {len(canonical)} values" - ) - return cls(canonical[0], semantic_type) - - def __str__(self) -> str: - return f"SemanticValue({self._value}, {self._semantic_type.name})" - - def __repr__(self) -> str: - return f"SemanticValue(value={self._value!r}, semantic_type={self._semantic_type.name})" - - -class PythonArrowConverter[T, R]: - @classmethod - def from_semantic_type(cls, semantic_type: SemanticType[T]) -> Self: - """Create a PythonArrowConverter from a SemanticType""" - python_converter = semantic_type.default_python_converter - arrow_converter = semantic_type.default_arrow_converter - - if not python_converter or not arrow_converter: - raise ValueError( - f"Semantic type '{semantic_type.name}' does not have default converters" - ) - - return cls(python_converter, arrow_converter, semantic_type.name) - - def __init__( - self, - python_converter: PythonConverter[T, R], - arrow_converter: ArrowConverter[T], - semantic_type_name: str | None = None, - ): - self.python_converter = python_converter - self.arrow_converter = arrow_converter - self.semantic_type_name = semantic_type_name - - @property - def python_type(self) -> type[R]: - """Get the Python type this converter handles""" - return 
self.python_converter.get_python_type() - - @property - def arrow_type(self) -> pa.DataType: - """Get the Arrow DataType this converter handles""" - return self.arrow_converter.get_arrow_type() - - def from_python_to_arrow(self, python_value: R | Collection[R]) -> pa.Array: - """Convert from Python to Arrow representation""" - if isinstance(python_value, self.python_type): - python_value = [python_value] - assert isinstance(python_value, Collection), ( - "Expected a collection of values at this point" - ) - python_values = cast(Collection[R], python_value) - canonicals = [self.python_converter.to_canonical(val) for val in python_values] - return self.arrow_converter.from_canonical(canonicals) - - def from_arrow_to_python(self, arrow_value: pa.Array) -> list[R]: - """Convert from Arrow to Python representation""" - canonical = self.arrow_converter.to_canonical(arrow_value) - return [self.python_converter.from_canonical(value) for value in canonical] diff --git a/src/orcapod/utils/dict_utils.py b/src/orcapod/utils/dict_utils.py new file mode 100644 index 0000000..e69de29 diff --git a/src/orcapod/utils/function_info.py b/src/orcapod/utils/function_info.py new file mode 100644 index 0000000..c76f8e6 --- /dev/null +++ b/src/orcapod/utils/function_info.py @@ -0,0 +1,320 @@ +import ast +import inspect +from collections.abc import Callable + + +def _is_in_string(line: str, pos: int) -> bool: + """Helper to check if a position in a line is inside a string literal.""" + # This is a simplified check - would need proper parsing for robust handling + in_single = False + in_double = False + for i in range(pos): + if line[i] == "'" and not in_double and (i == 0 or line[i - 1] != "\\"): + in_single = not in_single + elif line[i] == '"' and not in_single and (i == 0 or line[i - 1] != "\\"): + in_double = not in_double + return in_single or in_double + + +class SourceProcessor: + """Handles AST-based and fallback source code processing.""" + + # @staticmethod + # def remove_docstrings_and_comments_ast( + # source: str, remove_docstrings: bool = True, remove_comments: bool = True + # ) -> str: + # """Remove docstrings and comments using AST parsing.""" + # try: + # tree = ast.parse(source) + + # if remove_docstrings: + # SourceProcessor._remove_docstrings_from_ast(tree) + + # # Convert back to source + # import astor # Note: This would require astor package + + # processed = astor.to_source(tree) + + # if remove_comments: + # # AST doesn't preserve comments, so we still need line-by-line processing + # processed = SourceProcessor._remove_comments_fallback(processed) + + # return processed + + # except (ImportError, SyntaxError, TypeError): + # # Fall back to string-based processing + # return SourceProcessor._remove_docstrings_and_comments_fallback( + # source, remove_docstrings, remove_comments + # ) + + @staticmethod + def _remove_docstrings_from_ast(node: ast.AST) -> None: + """Remove docstring nodes from AST.""" + for child in ast.walk(node): + if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + if ( + child.body + and isinstance(child.body[0], ast.Expr) + and isinstance(child.body[0].value, ast.Constant) + and isinstance(child.body[0].value.value, str) + ): + child.body.pop(0) + + @staticmethod + def _remove_docstrings_and_comments_fallback( + source: str, remove_docstrings: bool = True, remove_comments: bool = True + ) -> str: + """Fallback string-based processing.""" + if remove_comments: + lines = source.split("\n") + for i, line in enumerate(lines): + comment_pos = 
line.find("#") + if comment_pos >= 0 and not _is_in_string(line, comment_pos): + lines[i] = line[:comment_pos].rstrip() + source = "\n".join(lines) + + # Simplified docstring removal (keeping original logic) + if remove_docstrings: + # This is basic - the AST approach above is more robust + pass + + return source + + @staticmethod + def _remove_comments_fallback(source: str) -> str: + """Remove comments using line-by-line parsing.""" + lines = source.split("\n") + for i, line in enumerate(lines): + comment_pos = line.find("#") + if comment_pos >= 0 and not _is_in_string(line, comment_pos): + lines[i] = line[:comment_pos].rstrip() + return "\n".join(lines) + + +def extract_decorators(func: Callable) -> list[str]: + """Extract decorator information from function source.""" + decorators = [] + try: + source = inspect.getsource(func) + tree = ast.parse(source) + + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + for decorator in node.decorator_list: + if isinstance(decorator, ast.Name): + decorators.append(decorator.id) + elif isinstance(decorator, ast.Attribute): + decorators.append(ast.unparse(decorator)) + elif isinstance(decorator, ast.Call): + decorators.append(ast.unparse(decorator)) + break # Only process the first function found + + except (ImportError, SyntaxError, TypeError, IOError): + # Fallback: try to extract decorators from source lines + try: + source_lines = inspect.getsourcelines(func)[0] + for line in source_lines: + stripped = line.strip() + if stripped.startswith("@"): + decorators.append(stripped[1:]) + elif stripped.startswith("def "): + break + except (IOError, TypeError): + pass + + return decorators + + +def get_function_components( + func: Callable, + name_override: str | None = None, + include_name: bool = True, + include_module: bool = True, + include_declaration: bool = True, + include_docstring: bool = True, + include_comments: bool = True, + preserve_whitespace: bool = True, + include_annotations: bool = True, + include_code_properties: bool = True, + include_defaults: bool = True, + include_custom_attributes: bool = True, + include_closure_info: bool = True, + include_decorators: bool = True, + include_extended_code_props: bool = True, + use_ast_parsing: bool = True, + skip_source_for_performance: bool = False, +) -> list[str]: + """ + Extract the components of a function that determine its identity for hashing. 
+ + Args: + func: The function to process + name_override: Override for function name + include_name: Whether to include the function name + include_module: Whether to include the module name + include_declaration: Whether to include the function declaration line + include_docstring: Whether to include the function's docstring + include_comments: Whether to include comments in the function body + preserve_whitespace: Whether to preserve original whitespace/indentation + include_annotations: Whether to include function type annotations + include_code_properties: Whether to include basic code object properties + include_defaults: Whether to include default parameter values + include_custom_attributes: Whether to include custom function attributes + include_closure_info: Whether to include closure variable information + include_decorators: Whether to include decorator information + include_extended_code_props: Whether to include extended code properties + use_ast_parsing: Whether to use AST-based parsing for robust processing + skip_source_for_performance: Skip expensive source code operations + + Returns: + A list of string components + """ + components = [] + + # Add function name + if include_name: + components.append(f"name:{name_override or func.__name__}") + + # Add module + if include_module and hasattr(func, "__module__"): + components.append(f"module:{func.__module__}") + + # Add decorators + if include_decorators and not skip_source_for_performance: + try: + decorators = extract_decorators(func) + if decorators: + components.append(f"decorators:{';'.join(decorators)}") + except Exception: + pass # Don't fail if decorator extraction fails + + # Process source code + if not skip_source_for_performance: + try: + source = inspect.getsource(func) + + # Handle whitespace preservation + if not preserve_whitespace: + source = inspect.cleandoc(source) + + # Remove declaration if requested + if not include_declaration: + lines = source.split("\n") + for i, line in enumerate(lines): + if line.strip().startswith(("def ", "async def ")): + lines.pop(i) + break + source = "\n".join(lines) + + # Process docstrings and comments + if not include_docstring or not include_comments: + # if use_ast_parsing: + # source = SourceProcessor.remove_docstrings_and_comments_ast( + # source, + # remove_docstrings=not include_docstring, + # remove_comments=not include_comments, + # ) + # else: + source = SourceProcessor._remove_docstrings_and_comments_fallback( + source, + remove_docstrings=not include_docstring, + remove_comments=not include_comments, + ) + + components.append(f"source:{source}") + + except (IOError, TypeError, OSError): + # Handle special function types + if hasattr(func, "__name__"): + if func.__name__ == "": + components.append("function_type:lambda") + else: + components.append("function_type:dynamic_or_builtin") + + # Fall back to signature if available + try: + sig = inspect.signature(func) + components.append(f"signature:{str(sig)}") + except (ValueError, TypeError): + components.append("builtin:True") + + # Add function annotations + if ( + include_annotations + and hasattr(func, "__annotations__") + and func.__annotations__ + ): + sorted_annotations = sorted(func.__annotations__.items()) + annotations_str = ";".join(f"{k}:{v}" for k, v in sorted_annotations) + components.append(f"annotations:{annotations_str}") + + # Add default parameter values + if include_defaults: + defaults_info = [] + + if hasattr(func, "__defaults__") and func.__defaults__: + 
defaults_info.append(f"defaults:{func.__defaults__}") + + if hasattr(func, "__kwdefaults__") and func.__kwdefaults__: + sorted_kwdefaults = sorted(func.__kwdefaults__.items()) + kwdefaults_str = ";".join(f"{k}:{v}" for k, v in sorted_kwdefaults) + defaults_info.append(f"kwdefaults:{kwdefaults_str}") + + if defaults_info: + components.extend(defaults_info) + + # Add custom function attributes + if include_custom_attributes and hasattr(func, "__dict__") and func.__dict__: + # Filter out common built-in attributes + custom_attrs = {k: v for k, v in func.__dict__.items() if not k.startswith("_")} + if custom_attrs: + sorted_attrs = sorted(custom_attrs.items()) + attrs_str = ";".join(f"{k}:{v}" for k, v in sorted_attrs) + components.append(f"custom_attributes:{attrs_str}") + + # Add closure information + if include_closure_info and hasattr(func, "__closure__") and func.__closure__: + # Be careful with closure - it can contain dynamic data + # We'll just include the variable names, not values + try: + closure_vars = ( + func.__code__.co_freevars if hasattr(func, "__code__") else () + ) + if closure_vars: + components.append(f"closure_vars:{closure_vars}") + except AttributeError: + pass + + # Add basic code object properties + if include_code_properties and hasattr(func, "__code__"): + code = func.__code__ + stable_code_props = { + "co_argcount": code.co_argcount, + "co_kwonlyargcount": getattr(code, "co_kwonlyargcount", 0), + "co_nlocals": code.co_nlocals, + "co_varnames": code.co_varnames[: code.co_argcount], + } + components.append(f"code_properties:{stable_code_props}") + + # Add extended code object properties + if include_extended_code_props and hasattr(func, "__code__"): + code = func.__code__ + extended_props = {} + + # Add code flags (generator, coroutine, etc.) 
+ if hasattr(code, "co_flags"): + extended_props["co_flags"] = code.co_flags + + # Add free variables (closure) + if hasattr(code, "co_freevars") and code.co_freevars: + extended_props["co_freevars"] = code.co_freevars + + # Add global names referenced + if hasattr(code, "co_names") and code.co_names: + # Limit to avoid too much noise - maybe first 10 names + extended_props["co_names"] = code.co_names[:10] + + if extended_props: + components.append(f"extended_code_properties:{extended_props}") + + return components diff --git a/src/orcapod/utils/object_spec.py b/src/orcapod/utils/object_spec.py index 8949622..453204f 100644 --- a/src/orcapod/utils/object_spec.py +++ b/src/orcapod/utils/object_spec.py @@ -2,28 +2,87 @@ from typing import Any -def parse_objectspec(obj_spec: Any) -> Any: +def parse_objectspec(obj_spec: Any, validate: bool = True) -> Any: + """Enhanced ObjectSpec with better error handling and validation.""" + if isinstance(obj_spec, dict): if "_class" in obj_spec: - # if _class is specified, treat the dict as an object specification, looking for - # _config key to extract configuration parameters - module_name, class_name = obj_spec["_class"].rsplit(".", 1) - module = importlib.import_module(module_name) - cls = getattr(module, class_name) - configs = parse_objectspec(obj_spec.get("_config", {})) - return cls(**configs) + return _create_instance_from_spec(obj_spec, validate) else: - # otherwise, parse through the dictionary recursively - parsed_object = obj_spec - for k, v in obj_spec.items(): - parsed_object[k] = parse_objectspec(v) - return parsed_object - elif isinstance(obj_spec, list): - # if it's a list, parse each item in the list - return [parse_objectspec(item) for item in obj_spec] - elif isinstance(obj_spec, tuple): - # if it's a tuple, parse each item in the tuple - return tuple(parse_objectspec(item) for item in obj_spec) + # Recursively process dict + return {k: parse_objectspec(v, validate) for k, v in obj_spec.items()} + + elif isinstance(obj_spec, (list, tuple)): + processed = [parse_objectspec(item, validate) for item in obj_spec] + return tuple(processed) if isinstance(obj_spec, tuple) else processed + else: - # if it's neither a dict nor a list, return it as is return obj_spec + + +def _create_instance_from_spec(spec: dict[str, Any], validate: bool) -> Any: + """Create instance with better error handling.""" + try: + class_path = spec["_class"] + config = spec.get("_config", {}) + + # Import and validate class exists + module_name, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + + # Process config recursively + processed_config = parse_objectspec(config, validate) + + # Optional: validate config matches class signature + if validate: + _validate_config_for_class(cls, processed_config) + + return cls(**processed_config) + + except Exception as e: + raise ValueError(f"Failed to create instance from spec {spec}: {e}") from e + + +def _validate_config_for_class(cls: type, config: dict[str, Any]) -> None: + """Optional validation that config matches class signature.""" + import inspect + + try: + sig = inspect.signature(cls.__init__) + valid_params = set(sig.parameters.keys()) - {"self"} + invalid_params = set(config.keys()) - valid_params + + if invalid_params: + raise ValueError(f"Invalid parameters for {cls.__name__}: {invalid_params}") + + except Exception: + # Skip validation if introspection fails + pass + + +# def parse_objectspec(obj_spec: Any) -> Any: +# if 
isinstance(obj_spec, dict): +# if "_class" in obj_spec: +# # if _class is specified, treat the dict as an object specification, looking for +# # _config key to extract configuration parameters +# module_name, class_name = obj_spec["_class"].rsplit(".", 1) +# module = importlib.import_module(module_name) +# cls = getattr(module, class_name) +# configs = parse_objectspec(obj_spec.get("_config", {})) +# return cls(**configs) +# else: +# # otherwise, parse through the dictionary recursively +# parsed_object = obj_spec +# for k, v in obj_spec.items(): +# parsed_object[k] = parse_objectspec(v) +# return parsed_object +# elif isinstance(obj_spec, list): +# # if it's a list, parse each item in the list +# return [parse_objectspec(item) for item in obj_spec] +# elif isinstance(obj_spec, tuple): +# # if it's a tuple, parse each item in the tuple +# return tuple(parse_objectspec(item) for item in obj_spec) +# else: +# # if it's neither a dict nor a list, return it as is +# return obj_spec From 5220829f5a1f4aae45b421e3e188924ae35cf72a Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 3 Aug 2025 02:36:31 +0000 Subject: [PATCH 166/224] fix: add better error message --- src/orcapod/data/pods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 84246e4..4b4a2ef 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -136,7 +136,7 @@ def validate_inputs(self, *streams: dp.Stream) -> None: ): # TODO: use custom exception type for better error handling raise ValueError( - f"Input typespec {incoming_packet_types} is not compatible with expected input typespec {self.input_packet_types}" + f"Incoming packet data type {incoming_packet_types} from {input_stream} is not compatible with expected input typespec {self.input_packet_types()}" ) def prepare_output_stream( From 40c890c50247c82dd4c332742d7c2277cd52f429 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 3 Aug 2025 02:37:06 +0000 Subject: [PATCH 167/224] fix: error in joining tables with complex data types --- src/orcapod/hashing/content_identifiable.py | 10 ++-------- src/orcapod/pipeline/nodes.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/orcapod/hashing/content_identifiable.py b/src/orcapod/hashing/content_identifiable.py index 1e48243..d363146 100644 --- a/src/orcapod/hashing/content_identifiable.py +++ b/src/orcapod/hashing/content_identifiable.py @@ -16,18 +16,12 @@ class ContentIdentifiableBase: def __init__( self, - identity_structure_hasher: ObjectHasher | None = None, label: str | None = None, ) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. - - Args: - identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. 
""" - self.identity_structure_hasher = ( - identity_structure_hasher or get_default_object_hasher() - ) + self._label = label @property @@ -94,7 +88,7 @@ def __hash__(self) -> int: # If no identity structure is provided, use the default hash return super().__hash__() - return self.identity_structure_hasher.hash_to_int(structure) + return hash(structure) def __eq__(self, other: object) -> bool: """ diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index e38248f..fdf03b7 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -359,11 +359,16 @@ def get_all_records( if results is None or taginfo is None: return None - joined_info = taginfo.join( - results, - constants.PACKET_RECORD_ID, - join_type="inner", - ) + # hack - use polars for join as it can deal with complex data type + # TODO: convert the entire load logic to use polars with lazy evaluation + + joined_info = pl.DataFrame(taginfo).join(pl.DataFrame(results), on=constants.PACKET_RECORD_ID, how="inner").to_arrow() + + # joined_info = taginfo.join( + # results, + # constants.PACKET_RECORD_ID, + # join_type="inner", + # ) if not include_system_columns: system_columns = [ From e047613f8892c6fc6eea1217acd1a027c2171e02 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 4 Aug 2025 06:34:21 +0000 Subject: [PATCH 168/224] refactor: use updated converter system and optimization --- src/orcapod/data/base.py | 241 +++++++++++++++- src/orcapod/data/datagrams/arrow_datagram.py | 26 +- .../data/datagrams/arrow_tag_packet.py | 6 + src/orcapod/data/datagrams/base.py | 8 +- src/orcapod/data/datagrams/dict_datagram.py | 220 +++++++------- src/orcapod/data/datagrams/dict_tag_packet.py | 31 +- src/orcapod/data/kernels.py | 13 +- src/orcapod/data/pods.py | 126 +++++--- src/orcapod/data/sources.py | 35 ++- src/orcapod/data/streams.py | 220 ++++++++------ src/orcapod/data/trackers.py | 11 +- src/orcapod/hashing/__init__.py | 8 +- src/orcapod/hashing/content_identifiable.py | 233 ++++++++++++++- src/orcapod/hashing/defaults.py | 17 +- src/orcapod/hashing/file_hashers.py | 3 +- src/orcapod/hashing/hash_utils.py | 78 ++++- src/orcapod/hashing/object_hashers.py | 270 +++++++++++++++--- src/orcapod/pipeline/nodes.py | 41 ++- src/orcapod/protocols/data_protocols.py | 5 +- src/orcapod/protocols/hashing_protocols.py | 21 +- src/orcapod/semantic_types/__init__.py | 2 + src/orcapod/semantic_types/type_inference.py | 7 +- .../semantic_types/universal_converter.py | 68 +++-- src/orcapod/stores/delta_lake_stores.py | 4 +- src/orcapod/utils/arrow_utils.py | 57 ++++ 25 files changed, 1352 insertions(+), 399 deletions(-) diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index dec4f06..e0e254e 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -1,6 +1,9 @@ -from typing import Any +from collections.abc import Collection +from pathlib import Path +from typing import Any, Mapping +from uuid import UUID from orcapod.protocols import hashing_protocols as hp -from orcapod.hashing.defaults import get_default_object_hasher +from orcapod import contexts import logging @@ -22,6 +25,7 @@ def __init__( self, identity_structure_hasher: hp.ObjectHasher | None = None, label: str | None = None, + data_context: str | contexts.DataContext | None = None, ) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. @@ -29,10 +33,10 @@ def __init__( Args: identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. 
""" - self.identity_structure_hasher = ( - identity_structure_hasher or get_default_object_hasher() - ) self._label = label + self._data_context = contexts.resolve_context(data_context) + self._content_hash: str | None = None + self._int_hash: int | None = None @property def has_assigned_label(self) -> bool: @@ -85,6 +89,22 @@ def identity_structure(self) -> Any: # TODO: come up with a way to signify non-determinate identity structure return None + def content_hash(self) -> str: + """ + Compute a hash based on the content of this object. + + Returns: + bytes: A byte representation of the hash based on the content. + If no identity structure is provided, return None. + """ + if self._content_hash is None: + structure = self.identity_structure() + processed_structure = process_structure(structure) + self._content_hash = self._data_context.object_hasher.hash_to_hex( + processed_structure, prefix_hasher_id=True + ) + return self._content_hash + def __hash__(self) -> int: """ Hash implementation that uses the identity structure if provided, @@ -94,12 +114,14 @@ def __hash__(self) -> int: int: A hash value based on either content or identity """ # Get the identity structure - structure = self.identity_structure() - if structure is None: - # If no identity structure is provided, use the default hash - return super().__hash__() - - return self.identity_structure_hasher.hash_to_int(structure) + if self._int_hash is None: + structure = self.identity_structure() + if structure is None: + # If no identity structure is provided, use the default hash + self._int_hash = super().__hash__() + else: + self._int_hash = self._data_context.object_hasher.hash_to_int(structure) + return self._int_hash def __eq__(self, other: object) -> bool: """ @@ -115,3 +137,200 @@ def __eq__(self, other: object) -> bool: return NotImplemented return self.identity_structure() == other.identity_structure() + + +def process_structure( + obj: Any, + visited: set[int] | None = None, + force_hash: bool = True, + function_info_extractor: hp.FunctionInfoExtractor | None = None, +) -> Any: + """ + Recursively process a structure to prepare it for hashing. 
+ + Args: + obj: The object or structure to process + visited: Set of object ids already visited (to handle circular references) + function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation + + Returns: + A processed version of the structure suitable for stable hashing + """ + # Initialize the visited set if this is the top-level call + if visited is None: + visited = set() + else: + visited = visited.copy() # Copy to avoid modifying the original set + + # Check for circular references - use object's memory address + # NOTE: While id() is not stable across sessions, we only use it within a session + # to detect circular references, not as part of the final hash + obj_id = id(obj) + if obj_id in visited: + logger.debug( + f"Detected circular reference for object of type {type(obj).__name__}" + ) + return "CircularRef" # Don't include the actual id in hash output + + # For objects that could contain circular references, add to visited + if isinstance(obj, (dict, list, tuple, set)) or not isinstance( + obj, (str, int, float, bool, type(None)) + ): + visited.add(obj_id) + + # Handle None + if obj is None: + return None + + # TODO: currently using runtime_checkable on ContentIdentifiable protocol + # Re-evaluate this strategy to see if a faster / more robust check could be used + if isinstance(obj, hp.ContentIdentifiable): + logger.debug( + f"Processing ContentHashableBase instance of type {type(obj).__name__}" + ) + return obj.content_hash() + + # Handle basic types + if isinstance(obj, (str, int, float, bool)): + return obj + + # Handle bytes and bytearray + if isinstance(obj, (bytes, bytearray)): + logger.debug( + f"Converting bytes/bytearray of length {len(obj)} to hex representation" + ) + return obj.hex() + + # Handle Path objects + if isinstance(obj, Path): + logger.debug(f"Converting Path object to string: {obj}") + raise NotImplementedError( + "Path objects are not supported in this hasher. Please convert to string." + ) + return str(obj) + + # Handle UUID objects + if isinstance(obj, UUID): + logger.debug(f"Converting UUID to string: {obj}") + raise NotImplementedError( + "UUID objects are not supported in this hasher. Please convert to string." 
+ ) + return str(obj) + + # Handle named tuples (which are subclasses of tuple) + if hasattr(obj, "_fields") and isinstance(obj, tuple): + logger.debug(f"Processing named tuple of type {type(obj).__name__}") + # For namedtuples, convert to dict and then process + d = {field: getattr(obj, field) for field in obj._fields} # type: ignore + return process_structure(d, visited) + + # Handle mappings (dict-like objects) + if isinstance(obj, Mapping): + # Process both keys and values + processed_items = [ + ( + process_structure(k, visited), + process_structure(v, visited), + ) + for k, v in obj.items() + ] + + # Sort by the processed keys for deterministic order + processed_items.sort(key=lambda x: str(x[0])) + + # Create a new dictionary with string keys based on processed keys + # TODO: consider checking for possibly problematic values in processed_k + # and issue a warning + return { + str(processed_k): processed_v + for processed_k, processed_v in processed_items + } + + # Handle sets and frozensets + if isinstance(obj, (set, frozenset)): + logger.debug( + f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" + ) + # Process each item first, then sort the processed results + processed_items = [process_structure(item, visited) for item in obj] + return sorted(processed_items, key=str) + + # Handle collections (list-like objects) + if isinstance(obj, Collection): + logger.debug( + f"Processing collection of type {type(obj).__name__} with {len(obj)} items" + ) + return [process_structure(item, visited) for item in obj] + + # For functions, use the function_content_hash + if callable(obj) and hasattr(obj, "__code__"): + logger.debug(f"Processing function: {getattr(obj, '__name__')}") + if function_info_extractor is not None: + # Use the extractor to get a stable representation + function_info = function_info_extractor.extract_function_info(obj) + logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") + + # simply return the function info as a stable representation + return function_info + else: + raise ValueError( + f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" + ) + + # handle data types + if isinstance(obj, type): + logger.debug(f"Processing class/type: {obj.__name__}") + return f"type:{obj.__name__}" + + # For other objects, attempt to create deterministic representation only if force_hash=True + class_name = obj.__class__.__name__ + module_name = obj.__class__.__module__ + if force_hash: + try: + import re + + logger.debug( + f"Processing generic object of type {module_name}.{class_name}" + ) + + # Try to get a stable dict representation if possible + if hasattr(obj, "__dict__"): + # Sort attributes to ensure stable order + attrs = sorted( + (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") + ) + # Limit to first 10 attributes to avoid extremely long representations + if len(attrs) > 10: + logger.debug( + f"Object has {len(attrs)} attributes, limiting to first 10" + ) + attrs = attrs[:10] + attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] + obj_repr = f"{{{', '.join(attr_strs)}}}" + else: + # Get basic repr but remove memory addresses + logger.debug( + "Object has no __dict__, using repr() with memory address removal" + ) + obj_repr = repr(obj) + if len(obj_repr) > 1000: + logger.debug( + f"Object repr is {len(obj_repr)} chars, truncating to 1000" + ) + obj_repr = obj_repr[:1000] + "..." 
+ # Remove memory addresses which look like '0x7f9a1c2b3d4e' + obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) + + return f"{module_name}.{class_name}:{obj_repr}" + except Exception as e: + # Last resort - use class name only + logger.warning(f"Failed to process object representation: {e}") + try: + return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" + except AttributeError: + logger.error("Could not determine object class, using UnknownObject") + return "UnknownObject" + else: + raise ValueError( + f"Processing of {obj} of type {module_name}.{class_name} is not supported" + ) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 2a21a58..a3a4ac8 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -12,6 +12,7 @@ from orcapod.utils import arrow_utils logger = logging.getLogger(__name__) +DEBUG = False class ArrowDatagram(BaseDatagram): @@ -810,14 +811,17 @@ def __repr__(self) -> str: >>> repr(datagram) "ArrowDatagram(data={'user_id': 123, 'name': 'Alice'}, meta_columns=2, context='std:v1.0.0:abc123')" """ - data_dict = self.as_dict() - meta_count = len(self.meta_columns) - context_key = self.data_context_key - - return ( - f"{self.__class__.__name__}(" - f"data={data_dict}, " - f"meta_columns={meta_count}, " - f"context='{context_key}'" - f")" - ) + if DEBUG: + data_dict = self.as_dict() + meta_count = len(self.meta_columns) + context_key = self.data_context_key + + return ( + f"{self.__class__.__name__}(" + f"data={data_dict}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) + else: + return str(self.as_dict()) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index e45fa35..c3a4f0a 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -36,6 +36,7 @@ class ArrowTag(ArrowDatagram): def __init__( self, table: pa.Table, + system_tags: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, ) -> None: if len(table) != 1: @@ -47,6 +48,11 @@ def __init__( table=table, data_context=data_context, ) + extracted_system_tags = [ + c for c in self._data_table.column_names if c.startswith("_tag_") + ] + self._system_tag_table = self._data_table.select(extracted_system_tags) + self._data_table = self._data_table.drop_columns(extracted_system_tags) class ArrowPacket(ArrowDatagram): diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index 9e9cc1a..297b100 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -123,6 +123,7 @@ def __init__(self, data_context: contexts.DataContext | str | None = None) -> No or a DataContext object, or None for default. """ self._data_context = contexts.resolve_context(data_context) + self._converter = self._data_context.type_converter # 1. Core Properties (Identity & Structure) @property @@ -263,13 +264,14 @@ def with_columns( ... # 7. Context Operations - @abstractmethod def with_context_key(self, new_context_key: str) -> Self: """Create new datagram with different data context.""" - ... + new_datagram = self.copy(include_cache=False) + new_datagram._data_context = contexts.resolve_context(new_context_key) + return new_datagram # 8. 
Utility Operations - def copy(self) -> Self: + def copy(self, include_cache: bool = True) -> Self: """Create a shallow copy of the datagram.""" new_datagram = object.__new__(self.__class__) new_datagram._data_context = self._data_context diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 3da58ac..f9f9cf0 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -7,13 +7,15 @@ from orcapod.data.system_constants import orcapod_constants as constants from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram -from orcapod.types import TypeSpec -from orcapod.types import typespec_utils as tsutils +from orcapod.semantic_types import infer_schema_from_pylist_data from orcapod.types.core import DataValue from orcapod.utils import arrow_utils logger = logging.getLogger(__name__) +# FIXME: make this configurable! +DEBUG = False + class DictDatagram(BaseDatagram): """ @@ -50,7 +52,7 @@ class DictDatagram(BaseDatagram): def __init__( self, data: Mapping[str, DataValue], - typespec: TypeSpec | None = None, + python_schema: dict[str, type] | None = None, meta_info: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, ) -> None: @@ -99,17 +101,23 @@ def __init__( # Combine provided typespec info with inferred typespec from content # If the column value is None and no type spec is provided, defaults to str. - self._data_python_schema = tsutils.get_typespec_from_dict( - self._data, - typespec, + inferred_schema = infer_schema_from_pylist_data([self._data], default_type=str) + + self._data_python_schema = ( + {k: python_schema.get(k, v) for k, v in inferred_schema.items()} + if python_schema + else inferred_schema ) # Create schema for meta data - self._meta_python_schema = tsutils.get_typespec_from_dict( - self._meta_data, - typespec=typespec, + inferred_meta_schema = infer_schema_from_pylist_data( + [self._meta_data], default_type=str + ) + self._meta_python_schema = ( + {k: python_schema.get(k, v) for k, v in inferred_meta_schema.items()} + if python_schema + else inferred_meta_schema ) - # Initialize caches self._cached_data_table: pa.Table | None = None @@ -124,6 +132,15 @@ def meta_columns(self) -> tuple[str, ...]: """Return tuple of meta column names.""" return tuple(self._meta_data.keys()) + def get_meta_info(self) -> dict[str, DataValue]: + """ + Get meta column information. + + Returns: + Dictionary of meta column names and their values. + """ + return dict(self._meta_data) + # 2. Dict-like Interface (Data Access) def __getitem__(self, key: str) -> DataValue: """Get data column value by key.""" @@ -180,7 +197,7 @@ def types( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> TypeSpec: + ) -> dict[str, type]: """ Return Python schema for the datagram. 
@@ -242,36 +259,34 @@ def arrow_schema( # Build data schema (cached) if self._cached_data_arrow_schema is None: - self._cached_data_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema( - self._data_python_schema + self._cached_data_arrow_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + self._data_python_schema + ) ) - all_schemas = [self._cached_data_arrow_schema] # Add context schema if requested if include_context: - context_schema = pa.schema([pa.field(constants.CONTEXT_KEY, pa.string())]) + context_schema = self._converter.python_schema_to_arrow_schema( + {constants.CONTEXT_KEY: str} + ) all_schemas.append(context_schema) # Add meta schema if requested if include_meta_columns and self._meta_data: - if include_meta_columns is True: meta_schema = self._get_meta_arrow_schema() elif isinstance(include_meta_columns, Collection): # Filter meta schema by prefix matching - matched_fields = [ - field - for field in self._get_meta_arrow_schema() - if any( - field.name.startswith(prefix) for prefix in include_meta_columns + meta_schema = ( + arrow_utils.select_schema_columns_with_prefixes( + self._get_meta_arrow_schema(), + include_meta_columns, ) - ] - if matched_fields: - meta_schema = pa.schema(matched_fields) - else: - meta_schema = None + or None + ) else: meta_schema = None @@ -340,6 +355,41 @@ def as_dict( return result_dict + def as_arrow_compatible_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation compatible with Arrow. + + Args: + include_meta_columns: Whether to include meta columns. + - True: include all meta columns + - Collection[str]: include meta columns matching these prefixes + - False: exclude meta columns + include_context: Whether to include context key + + Returns: + Dictionary representation compatible with Arrow + """ + # FIXME: this is a super inefficient implementation! 
+ python_dict = self.as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + python_schema = self.types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + + return self._data_context.type_converter.python_dict_to_struct_dict( + [python_dict], python_schema=python_schema + )[0] + def _get_meta_arrow_table(self) -> pa.Table: if self._cached_meta_table is None: arrow_schema = self._get_meta_arrow_schema() @@ -354,10 +404,12 @@ def _get_meta_arrow_table(self) -> pa.Table: def _get_meta_arrow_schema(self) -> pa.Schema: if self._cached_meta_arrow_schema is None: - self._cached_meta_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema( - self._meta_python_schema + self._cached_meta_arrow_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + self._meta_python_schema + ) ) - + assert self._cached_meta_arrow_schema is not None, ( "Meta Arrow schema should be initialized by now" ) @@ -387,9 +439,11 @@ def as_table( # Build data table (cached) if self._cached_data_table is None: - self._cached_data_table = self._data_context.type_converter.python_dicts_to_arrow_table( - [self._data], - self._data_python_schema, + self._cached_data_table = ( + self._data_context.type_converter.python_dicts_to_arrow_table( + [self._data], + self._data_python_schema, + ) ) assert self._cached_data_table is not None, ( "Data Arrow table should be initialized by now" @@ -397,6 +451,7 @@ def as_table( result_table = self._cached_data_table # Add context if requested + # TODO: consider using type converter for consistency if include_context: result_table = result_table.append_column( constants.CONTEXT_KEY, @@ -410,18 +465,12 @@ def as_table( # Select appropriate meta columns if isinstance(include_meta_columns, Collection): # Filter meta columns by prefix matching - matched_cols = [ - col - for col in self._meta_data.keys() - if any(col.startswith(prefix) for prefix in include_meta_columns) - ] - if matched_cols: - meta_table = meta_table.select(matched_cols) - else: - meta_table = None + meta_table = arrow_utils.select_table_columns_with_prefixes( + meta_table, include_meta_columns + ) # Combine tables if we have meta columns to add - if meta_table is not None: + if meta_table: result_table = arrow_utils.hstack_tables(result_table, meta_table) return result_table @@ -595,18 +644,18 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: new_name = column_mapping.get(old_name, old_name) new_data[new_name] = value - # Handle typespec updates for renamed columns - new_typespec = None + # Handle python_schema updates for renamed columns + new_python_schema = None if self._data_python_schema: - existing_typespec = dict(self._data_python_schema) + existing_python_schema = dict(self._data_python_schema) # Rename types according to column mapping - renamed_typespec = {} - for old_name, old_type in existing_typespec.items(): + renamed_python_schema = {} + for old_name, old_type in existing_python_schema.items(): new_name = column_mapping.get(old_name, old_name) - renamed_typespec[new_name] = old_type + renamed_python_schema[new_name] = old_type - new_typespec = renamed_typespec + new_python_schema = renamed_python_schema # Reconstruct full data dict for new instance full_data = new_data # Renamed user data @@ -614,7 +663,7 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: return self.__class__( data=full_data, - 
typespec=new_typespec, + python_schema=new_python_schema, data_context=self._data_context, ) @@ -651,6 +700,7 @@ def update(self, **updates: DataValue) -> Self: full_data = new_data # Updated user data full_data.update(self._meta_data) # Keep existing meta data + # TODO: transfer over python schema return self.__class__( data=full_data, data_context=self._data_context, @@ -693,48 +743,21 @@ def with_columns( new_data = dict(self._data) new_data.update(updates) - # Create updated typespec - handle None values by defaulting to str - typespec = self.types() + # Create updated python schema - handle None values by defaulting to str + python_schema = self.types() if column_types is not None: - typespec.update(column_types) - - new_typespec = tsutils.get_typespec_from_dict( - new_data, - typespec=typespec, - ) - - # Reconstruct full data dict for new instance - full_data = new_data # Updated user data - full_data.update(self._meta_data) # Keep existing meta data - - return self.__class__( - data=full_data, - typespec=new_typespec, - # semantic converter needs to be rebuilt for new columns - data_context=self._data_context, - ) + python_schema.update(column_types) - # 7. Context Operations - def with_context_key(self, new_context_key: str) -> Self: - """ - Create a new DictDatagram with a different data context key. - Maintains immutability by returning a new instance. - - Args: - new_context_key: New data context key string + new_python_schema = infer_schema_from_pylist_data([new_data]) + new_python_schema = { + k: python_schema.get(k, v) for k, v in new_python_schema.items() + } - Returns: - New DictDatagram instance with new context - """ - # Reconstruct full data dict for new instance - full_data = dict(self._data) # User data - full_data.update(self._meta_data) # Meta data + new_datagram = self.copy(include_cache=False) + new_datagram._data = new_data + new_datagram._data_python_schema = new_python_schema - return self.__class__( - data=full_data, - data_context=new_context_key, # New context - # Note: semantic_converter will be rebuilt for new context - ) + return new_datagram # 8. Utility Operations def copy(self, include_cache: bool = True) -> Self: @@ -792,13 +815,16 @@ def __repr__(self) -> str: Returns: Detailed representation with type and metadata information. 
""" - meta_count = len(self.meta_columns) - context_key = self.data_context_key + if DEBUG: + meta_count = len(self.meta_columns) + context_key = self.data_context_key - return ( - f"{self.__class__.__name__}(" - f"data={self._data}, " - f"meta_columns={meta_count}, " - f"context='{context_key}'" - f")" - ) + return ( + f"{self.__class__.__name__}(" + f"data={self._data}, " + f"meta_columns={meta_count}, " + f"context='{context_key}'" + f")" + ) + else: + return str(self._data) diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index f61fefe..775945a 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -47,7 +47,7 @@ def __init__( data: Mapping[str, DataValue], meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, - typespec: TypeSpec | None = None, + python_schema: dict[str, type] | None = None, data_context: str | contexts.DataContext | None = None, ) -> None: # normalize the data content and remove any source info keys @@ -62,7 +62,7 @@ def __init__( super().__init__( data_only, - typespec=typespec, + python_schema=python_schema, meta_info=meta_info, data_context=data_context, ) @@ -72,16 +72,21 @@ def __init__( self._cached_source_info_schema: pa.Schema | None = None @property - def _source_info_schema(self) -> pa.Schema: + def _source_info_arrow_schema(self) -> pa.Schema: if self._cached_source_info_schema is None: - self._cached_source_info_schema = pa.schema( - { - f"{constants.SOURCE_PREFIX}{k}": pa.large_string() - for k in self.keys() - } + self._cached_source_info_schema = ( + self._converter.python_schema_to_arrow_schema( + self._source_info_python_schema + ) ) + return self._cached_source_info_schema + @property + def _source_info_python_schema(self) -> dict[str, type]: + """Return the Python schema for source info.""" + return {f"{constants.SOURCE_PREFIX}{k}": str for k in self.keys()} + def as_table( self, include_all_info: bool = False, @@ -102,7 +107,7 @@ def as_table( for k, v in self.source_info().items() } self._cached_source_info_table = pa.Table.from_pylist( - [source_info_data], schema=self._source_info_schema + [source_info_data], schema=self._source_info_arrow_schema ) assert self._cached_source_info_table is not None, ( "Cached source info table should not be None" @@ -202,7 +207,9 @@ def arrow_schema( include_context=include_context, ) if include_all_info or include_source: - return arrow_utils.join_arrow_schemas(schema, self._source_info_schema) + return arrow_utils.join_arrow_schemas( + schema, self._source_info_arrow_schema + ) return schema def as_datagram( @@ -226,14 +233,14 @@ def as_datagram( include_meta_columns=include_meta_columns, include_source=include_source, ) - typespec = self.types( + python_schema = self.types( include_all_info=include_all_info, include_meta_columns=include_meta_columns, include_source=include_source, ) return DictDatagram( data, - typespec=typespec, + python_schema=python_schema, data_context=self._data_context, ) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index dfe1680..920769c 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -50,13 +50,7 @@ def kernel_id(self) -> tuple[str, ...]: Returns a unique identifier for the kernel. This is used to identify the kernel in the computational graph. """ - if self._kernel_hash is None: - # If the kernel hash is not set, compute it based on the class name and label. 
- # This is a simple way to ensure that each kernel has a unique identifier. - self._kernel_hash = self.data_context.object_hasher.hash_to_hex( - self.identity_structure(), prefix_hasher_id=True - ) - return (f"{self.__class__.__name__}", self._kernel_hash) + return (f"{self.__class__.__name__}", self.content_hash()) @property def data_context(self) -> contexts.DataContext: @@ -125,6 +119,11 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An # equivalence of the two by returning the same identity structure for both invocations. # This can be achieved, for example, by returning a set over the streams instead of a tuple. if streams is not None: + if len(streams) == 0: + # If no streams are provided, then this is a source kernel + # and we simply return None as the identity structure. + print(f"Kernel {self} is acting as a source!") + return None streams = self.pre_kernel_processing(*streams) self.validate_inputs(*streams) return self.kernel_identity_structure(streams) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 4b4a2ef..5092cda 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -1,9 +1,11 @@ import logging import sys from abc import abstractmethod -from collections.abc import Callable, Collection, Iterable, Sequence +from collections.abc import Callable, Collection, Iterable, Mapping, Sequence from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Literal + +from numpy import record from orcapod import contexts from orcapod.data.datagrams import ( ArrowPacket, @@ -20,6 +22,25 @@ from orcapod.types import typespec_utils as tsutils from orcapod.utils.lazy_module import LazyModule from orcapod.hashing.hash_utils import get_function_signature, get_function_components +import hashlib + + +def combine_hashes( + *hashes: str, order: bool = False, prefix_hasher_id: bool = False +) -> str: + """Combine hashes into a single hash string.""" + + # Sort for deterministic order regardless of input order + if order: + prepared_hashes = sorted(hashes) + else: + prepared_hashes = list(hashes) + combined = "".join(prepared_hashes) + combined_hash = hashlib.sha256(combined.encode()).hexdigest() + if prefix_hasher_id: + return "sha256@" + combined_hash + return combined_hash + if TYPE_CHECKING: import pyarrow as pa @@ -217,8 +238,8 @@ def __init__( output_keys: str | Collection[str] | None = None, function_name=None, version: str = "v0.0", - input_typespec: TypeSpec | None = None, - output_typespec: TypeSpec | Sequence[type] | None = None, + input_python_schema: Mapping[str, type] | None = None, + output_python_schema: Mapping[str, type] | Sequence[type] | None = None, label: str | None = None, function_info_extractor: hp.FunctionInfoExtractor | None = None, **kwargs, @@ -238,20 +259,36 @@ def __init__( "function_name must be provided if function has no __name__ attribute" ) self.function_name = function_name + # extract the first full index (potentially with leading 0) in the version string + if not isinstance(version, str): + raise TypeError(f"Version must be a string, got {type(version)}") + import re + + match = re.match(r"\D.*(\d+)", version) + major_version = 0 + if match: + major_version = int(match.group(1)) + else: + raise ValueError( + f"Version string {version} does not contain a valid version number" + ) + self.version = version + self.major_version = major_version + super().__init__(label=label or self.function_name, **kwargs) # extract input and output types from the function signature 
input_packet_types, output_packet_types = tsutils.extract_function_typespecs( self.function, self.output_keys, - input_typespec=input_typespec, - output_typespec=output_typespec, + input_typespec=input_python_schema, + output_typespec=output_python_schema, ) - self._input_packet_schema = input_packet_types - self._output_packet_schema = output_packet_types + self._input_packet_schema = dict(input_packet_types) + self._output_packet_schema = dict(output_packet_types) # TODO: add output packet converter for speed up - + self._function_info_extractor = function_info_extractor object_hasher = self.data_context.object_hasher self._function_signature_hash = object_hasher.hash_to_hex( @@ -261,21 +298,35 @@ def __init__( get_function_components(self.function), prefix_hasher_id=True ) + self._output_packet_type_hash = object_hasher.hash_to_hex( + self.output_packet_types(), prefix_hasher_id=True + ) + + self._total_pod_id_hash = object_hasher.hash_to_hex( + self.tiered_pod_id, prefix_hasher_id=True + ) + @property def tiered_pod_id(self) -> dict[str, str]: return { + "version": self.version, "signature": self._function_signature_hash, "content": self._function_content_hash, } @property def kernel_id(self) -> tuple[str, ...]: - return (self.function_name, self.version) + return ( + self.function_name, + self._output_packet_type_hash, + "v" + str(self.major_version), + ) def get_record_id(self, packet: dp.Packet) -> str: - content = (packet.content_hash(), self.tiered_pod_id) - return self.data_context.object_hasher.hash_to_hex( - content, prefix_hasher_id=True + return combine_hashes( + packet.content_hash(), + self._total_pod_id_hash, + prefix_hasher_id=True, ) def input_packet_types(self) -> dict[str, type]: @@ -304,7 +355,9 @@ def __str__(self) -> str: ) return f"FunctionPod:{func_sig}" - def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | None]: + def call( + self, tag: dp.Tag, packet: dp.Packet, record_id: str | None = None + ) -> tuple[dp.Tag, DictPacket | None]: if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -333,7 +386,9 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non ) output_data = {k: v for k, v in zip(self.output_keys, output_values)} - record_id = self.get_record_id(packet) + if record_id is None: + # if record_id is not provided, generate it from the packet + record_id = self.get_record_id(packet) source_info = { k: ":".join(self.kernel_id + (record_id, k)) for k in output_data } @@ -341,7 +396,7 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non output_packet = DictPacket( {k: v for k, v in zip(self.output_keys, output_values)}, source_info=source_info, - typespec=self.output_packet_types(), + python_schema=self.output_packet_types(), data_context=self._data_context, ) return tag, output_packet @@ -349,28 +404,7 @@ def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, DictPacket | Non def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None ) -> Any: - # construct identity structure for the function - - # if function_info_extractor is available, use that but substitute the function_name - if self._function_info_extractor is not None: - function_info = self._function_info_extractor.extract_function_info( - self.function, - function_name=self.function_name, - input_typespec=self.input_packet_types(), - output_typespec=self.output_packet_types(), - ) - else: - # use basic information only - 
function_info = { - "name": self.function_name, - "input_packet_types": self.input_packet_types(), - "output_packet_types": self.output_packet_types(), - } - - id_struct = ( - self.__class__.__name__, - function_info, - ) + id_struct = (self.__class__.__name__,) + self.kernel_id # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure if streams is not None and len(streams) != 0: @@ -440,8 +474,10 @@ def output_packet_types(self) -> TypeSpec: def validate_inputs(self, *streams: dp.Stream) -> None: self.pod.validate_inputs(*streams) - def call(self, tag: dp.Tag, packet: dp.Packet) -> tuple[dp.Tag, dp.Packet | None]: - return self.pod.call(tag, packet) + def call( + self, tag: dp.Tag, packet: dp.Packet, record_id: str | None = None + ) -> tuple[dp.Tag, dp.Packet | None]: + return self.pod.call(tag, packet, record_id=record_id) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -492,17 +528,20 @@ def call( self, tag: dp.Tag, packet: dp.Packet, + record_id: str | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: # TODO: consider logic for overwriting existing records + if record_id is None: + record_id = self.get_record_id(packet) output_packet = None if not skip_cache_lookup: output_packet = self.get_recorded_output_packet(packet) if output_packet is None: - tag, output_packet = super().call(tag, packet) + tag, output_packet = super().call(tag, packet, record_id=record_id) if output_packet is not None and not skip_cache_insert: - self.record_packet(packet, output_packet) + self.record_packet(packet, output_packet, record_id=record_id) return tag, output_packet @@ -514,6 +553,7 @@ def record_packet( self, input_packet: dp.Packet, output_packet: dp.Packet, + record_id: str | None = None, skip_duplicates: bool = False, ) -> dp.Packet: """ @@ -535,10 +575,12 @@ def record_packet( constants.INPUT_PACKET_HASH, pa.array([input_packet.content_hash()], type=pa.large_string()), ) + if record_id is None: + record_id = self.get_record_id(input_packet) self.result_store.add_record( self.record_path, - self.pod.get_record_id(input_packet), + record_id, data_table, skip_duplicates=skip_duplicates, ) diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index f5754aa..0d3dda7 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -297,7 +297,7 @@ class ManualDeltaTableSource(SourceBase): def __init__( self, table_path: str | Path, - schema: TypeSpec | None = None, + python_schema: dict[str, type] | None = None, tag_columns: Collection[str] | None = None, **kwargs, ) -> None: @@ -311,7 +311,7 @@ def __init__( self.load_delta_table() if self._delta_table is None: - if schema is None: + if python_schema is None: raise ValueError( "Delta table not found and no schema provided. " "Please provide a valid Delta table path or a schema to create a new table." @@ -320,9 +320,12 @@ def __init__( raise ValueError( "At least one tag column must be provided when creating a new Delta table." 
) - python_schema = schema - arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema(python_schema) - + arrow_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + python_schema + ) + ) + fields = [] for field in arrow_schema: if field.name in tag_columns: @@ -332,8 +335,12 @@ def __init__( else: arrow_schema = pa.schema(self._delta_table.schema().to_arrow()) - python_schema = self._data_context.type_converter.arrow_schema_to_python_schema(arrow_schema) - + python_schema = ( + self._data_context.type_converter.arrow_schema_to_python_schema( + arrow_schema + ) + ) + inferred_tag_columns = [] for field in arrow_schema: if ( @@ -660,9 +667,17 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: This is called by forward() and creates a fresh snapshot each time. """ - tag_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema(self.tag_typespec) - packet_arrow_schema = self._data_context.type_converter.python_schema_to_arrow_schema(self.packet_typespec) - + tag_arrow_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + self.tag_typespec + ) + ) + packet_arrow_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + self.packet_typespec + ) + ) + joined_data = [ {**tag, **packet} for tag, packet in zip(self.tags, self.packets) ] diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 55c8bfe..26123f0 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -6,8 +6,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, cast -from orcapod.data.base import LabeledContentIdentifiableBase from orcapod import contexts +from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.datagrams import ( ArrowPacket, ArrowTag, @@ -108,11 +108,7 @@ def substream_identities(self) -> tuple[str, ...]: Returns the identities of the substreams that this stream is composed of. This is used to identify the substreams in the computational graph. """ - return ( - self._data_context.object_hasher.hash_to_hex( - self.identity_structure(), compressed=True, prefix_hasher_id=True - ), - ) + return (self.content_hash(),) def get_substream(self, substream_id: str) -> dp.Stream: """ @@ -248,7 +244,35 @@ def identity_structure(self) -> Any: return super().identity_structure() -class ImmutableTableStream(StreamBase): +class ImmutableStream(StreamBase): + """ + A class of stream that is constructed from immutable/constant data and does not change over time. + Consequently, the identity of an unsourced stream should be based on the content of the stream itself. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._data_content_identity = None + + @abstractmethod + def data_content_identity_structure(self) -> Any: + """ + Returns a hash of the content of the stream. + This is used to identify the content of the stream. + """ + ... + + def identity_structure(self) -> Any: + if self.source is not None: + # if the stream is generated by an operation, use the identity structure from the invocation + return self.source.identity_structure(self.upstreams) + # otherwise, use the content of the stream as the identity structure + if self._data_content_identity is None: + self._data_content_identity = self.data_content_identity_structure() + return self._data_content_identity + + +class ImmutableTableStream(ImmutableStream): """ An immutable stream based on a PyArrow Table. 
This stream is designed to be used with data that is already in a tabular format, @@ -332,6 +356,20 @@ def __init__( self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None self._set_modified_time() # set modified time to now + def data_content_identity_structure(self) -> Any: + """ + Returns a hash of the content of the stream. + This is used to identify the content of the stream. + """ + table_hash = self._data_context.arrow_hasher.hash_table( + self.as_table(include_data_context=True, include_source=True), + ) + return ( + self.__class__.__name__, + table_hash, + self._tag_columns, + ) + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. @@ -348,7 +386,7 @@ def types(self) -> tuple[dict[str, type], dict[str, type]]: converter = self._data_context.type_converter return ( converter.arrow_schema_to_python_schema(self._tag_schema), - converter.arrow_schema_to_python_schema(self._packet_schema) + converter.arrow_schema_to_python_schema(self._packet_schema), ) def as_table( @@ -582,6 +620,7 @@ def __init__(self, pod: dp.Pod, prepared_stream: dp.Stream, **kwargs): # Packet-level caching (from your PodStream) self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet | None]] = {} self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: if self._prepared_stream_iterator is not None: @@ -717,100 +756,113 @@ def __init__(self, pod: dp.CachedPod, input_stream: dp.Stream, **kwargs): self._prepared_stream_iterator = input_stream.iter_packets() # Packet-level caching (from your PodStream) - self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet | None]] = {} + self._cached_output_packets: list[tuple[dp.Tag, dp.Packet | None]] | None = None self._cached_output_table: pa.Table | None = None - def process_inputs(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: """ Processes the input stream and prepares the output stream. This is typically called before iterating over the packets. 
""" - # identify all entries in the input stream for which we still don't have computed packets - target_entries = self.input_stream.as_table( - include_content_hash=constants.INPUT_PACKET_HASH - ) - existing_entries = self.pod.get_all_records(include_system_columns=True) - if existing_entries is None or existing_entries.num_rows == 0: - missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) - existing = None - else: - # missing = target_entries.join( - # existing_entries, - # keys=[constants.INPUT_PACKET_HASH], - # join_type="left anti", - # ) - # Single join that gives you both missing and existing - # More efficient - only bring the key column from existing_entries - # .select([constants.INPUT_PACKET_HASH]).append_column( - # "_exists", pa.array([True] * len(existing_entries)) - # ), - all_results = target_entries.join( - existing_entries.append_column( - "_exists", pa.array([True] * len(existing_entries)) - ), - keys=[constants.INPUT_PACKET_HASH], - join_type="left outer", - right_suffix="_right", - ) - # grab all columns from target_entries first - missing = ( - all_results.filter(pc.is_null(pc.field("_exists"))) - .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH, "_exists"]) - ) + if self._cached_output_packets is None: + cached_results = [] - existing = all_results.filter(pc.is_valid(pc.field("_exists"))).drop( - target_entries.column_names + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_content_hash=constants.INPUT_PACKET_HASH ) - renamed = [ - c.removesuffix("_right") if c.endswith("_right") else c - for c in existing.column_names - ] - existing = existing.rename_columns(renamed) - - tag_keys = self.input_stream.keys()[0] - - if existing is not None and existing.num_rows > 0: - # If there are existing entries, we can cache them - existing_stream = ImmutableTableStream(existing, tag_columns=tag_keys) - yield from existing_stream.iter_packets() + existing_entries = self.pod.get_all_records(include_system_columns=True) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + # missing = target_entries.join( + # existing_entries, + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left anti", + # ) + # Single join that gives you both missing and existing + # More efficient - only bring the key column from existing_entries + # .select([constants.INPUT_PACKET_HASH]).append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + all_results = target_entries.join( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ), + keys=[constants.INPUT_PACKET_HASH], + join_type="left outer", + right_suffix="_right", + ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) - if missing is not None and missing.num_rows > 0: - for tag, packet in ImmutableTableStream(missing, tag_columns=tag_keys): - # Since these packets are known to be missing, skip the cache lookup - tag, packet = self.pod.call(tag, packet, skip_cache_lookup=True) - if packet is not None: + existing = ( + all_results.filter(pc.is_valid(pc.field("_exists"))) + .drop_columns(target_entries.column_names) + .drop_columns(["_exists"]) + ) + renamed = [ + c.removesuffix("_right") if 
c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = ImmutableTableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) yield tag, packet - self._set_modified_time() - - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - if self._prepared_stream_iterator is not None: - for i, (tag, packet) in enumerate(self._prepared_stream_iterator): - if i in self._cached_output_packets: - # Use cached result - tag, packet = self._cached_output_packets[i] + if missing is not None and missing.num_rows > 0: + for tag, packet in ImmutableTableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + tag, packet = self.pod.call(tag, packet, skip_cache_lookup=True) + cached_results.append((tag, packet)) if packet is not None: yield tag, packet - else: - # Process packet - processed = self.pod.call(tag, packet) - if processed is not None: - # Update shared cache for future iterators (optimization) - self._cached_output_packets[i] = processed - tag, packet = processed - if packet is not None: - yield tag, packet - # Mark completion by releasing the iterator - self._prepared_stream_iterator = None + self._cached_output_packets = cached_results + self._set_modified_time() else: - # Yield from snapshot of complete cache - for i in range(len(self._cached_output_packets)): - tag, packet = self._cached_output_packets[i] + for tag, packet in self._cached_output_packets: if packet is not None: yield tag, packet + # def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + # if self._prepared_stream_iterator is not None: + # for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + # if i in self._cached_output_packets: + # # Use cached result + # tag, packet = self._cached_output_packets[i] + # if packet is not None: + # yield tag, packet + # else: + # # Process packet + # processed = self.pod.call(tag, packet) + # if processed is not None: + # # Update shared cache for future iterators (optimization) + # self._cached_output_packets[i] = processed + # tag, packet = processed + # if packet is not None: + # yield tag, packet + + # # Mark completion by releasing the iterator + # self._prepared_stream_iterator = None + # else: + # # Yield from snapshot of complete cache + # for i in range(len(self._cached_output_packets)): + # tag, packet = self._cached_output_packets[i] + # if packet is not None: + # yield tag, packet + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. 
diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 0b7ce33..5c8d64c 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,7 +1,6 @@ from orcapod import contexts from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.protocols import data_protocols as dp, hashing_protocols as hp -from orcapod.hashing.defaults import get_default_object_hasher from collections import defaultdict from collections.abc import Generator, Collection from abc import ABC, abstractmethod @@ -137,6 +136,7 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) +# TODO: rename this to stub source or simply use StreamSource class StubKernel: def __init__(self, stream: dp.Stream, label: str | None = None) -> None: """ @@ -169,8 +169,13 @@ def __call__(self, *args: Any, **kwargs: Any) -> dp.Stream: return self.forward(*args, **kwargs) def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - # FIXME: using label as a stop-gap for identity structure - return self.label + if streams is not None: + # when checked for invocation id, act as a source + # and just return the output packet types + _, packet_types = self.stream.types() + return packet_types + # otherwise, return the identity structure of the stream + return self.stream.identity_structure() def __hash__(self) -> int: # TODO: resolve the logic around identity structure on a stream / stub kernel diff --git a/src/orcapod/hashing/__init__.py b/src/orcapod/hashing/__init__.py index eb94afe..b90f228 100644 --- a/src/orcapod/hashing/__init__.py +++ b/src/orcapod/hashing/__init__.py @@ -1,7 +1,7 @@ -from .defaults import ( - get_default_object_hasher, - get_default_arrow_hasher, -) +# from .defaults import ( +# get_default_object_hasher, +# get_default_arrow_hasher, +# ) __all__ = [ diff --git a/src/orcapod/hashing/content_identifiable.py b/src/orcapod/hashing/content_identifiable.py index d363146..ece579c 100644 --- a/src/orcapod/hashing/content_identifiable.py +++ b/src/orcapod/hashing/content_identifiable.py @@ -1,6 +1,12 @@ -from orcapod.hashing.types import ObjectHasher -from orcapod.hashing.defaults import get_default_object_hasher +from collections.abc import Collection, Mapping +from pathlib import Path from typing import Any +from uuid import UUID +from orcapod import contexts +import logging +from orcapod.protocols import hashing_protocols as hp + +logger = logging.getLogger(__name__) class ContentIdentifiableBase: @@ -17,12 +23,14 @@ class ContentIdentifiableBase: def __init__( self, label: str | None = None, + data_context: Any = None, # Placeholder for ObjectHasher or similar context ) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. """ - + self._data_context = contexts.resolve_context(data_context) self._label = label + self._cached_hash: bytes | None = None @property def has_assigned_label(self) -> bool: @@ -74,6 +82,26 @@ def identity_structure(self) -> Any: """ return None + def content_hash(self) -> bytes: + """ + Compute a hash based on the content of this object. + + This method uses the identity structure to compute a hash value. + If no identity structure is provided, it will return None. + + Returns: + int: A hash value based on the content of this object, or None if no identity structure is provided. 
+ """ + if self._cached_hash is None: + structure = self.identity_structure() + + processed_structure = process_structure(structure) + + self._cached_hash = self._data_context.object_hasher.hash( + processed_structure + ) + return self._cached_hash + def __hash__(self) -> int: """ Hash implementation that uses the identity structure if provided, @@ -88,7 +116,7 @@ def __hash__(self) -> int: # If no identity structure is provided, use the default hash return super().__hash__() - return hash(structure) + return self._data_context.object_hasher.hash_to_int(structure) def __eq__(self, other: object) -> bool: """ @@ -104,3 +132,200 @@ def __eq__(self, other: object) -> bool: return NotImplemented return self.identity_structure() == other.identity_structure() + + +def process_structure( + obj: Any, + visited: set[int] | None = None, + force_hash: bool = True, + function_info_extractor: hp.FunctionInfoExtractor | None = None, +) -> Any: + """ + Recursively process a structure to prepare it for hashing. + + Args: + obj: The object or structure to process + visited: Set of object ids already visited (to handle circular references) + function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation + + Returns: + A processed version of the structure suitable for stable hashing + """ + # Initialize the visited set if this is the top-level call + if visited is None: + visited = set() + else: + visited = visited.copy() # Copy to avoid modifying the original set + + # Check for circular references - use object's memory address + # NOTE: While id() is not stable across sessions, we only use it within a session + # to detect circular references, not as part of the final hash + obj_id = id(obj) + if obj_id in visited: + logger.debug( + f"Detected circular reference for object of type {type(obj).__name__}" + ) + return "CircularRef" # Don't include the actual id in hash output + + # For objects that could contain circular references, add to visited + if isinstance(obj, (dict, list, tuple, set)) or not isinstance( + obj, (str, int, float, bool, type(None)) + ): + visited.add(obj_id) + + # Handle None + if obj is None: + return None + + # TODO: currently using runtime_checkable on ContentIdentifiable protocol + # Re-evaluate this strategy to see if a faster / more robust check could be used + if isinstance(obj, hp.ContentIdentifiable): + logger.debug( + f"Processing ContentHashableBase instance of type {type(obj).__name__}" + ) + return obj.content_hash() + + # Handle basic types + if isinstance(obj, (str, int, float, bool)): + return obj + + # Handle bytes and bytearray + if isinstance(obj, (bytes, bytearray)): + logger.debug( + f"Converting bytes/bytearray of length {len(obj)} to hex representation" + ) + return obj.hex() + + # Handle Path objects + if isinstance(obj, Path): + logger.debug(f"Converting Path object to string: {obj}") + raise NotImplementedError( + "Path objects are not supported in this hasher. Please convert to string." + ) + return str(obj) + + # Handle UUID objects + if isinstance(obj, UUID): + logger.debug(f"Converting UUID to string: {obj}") + raise NotImplementedError( + "UUID objects are not supported in this hasher. Please convert to string." 
+ ) + return str(obj) + + # Handle named tuples (which are subclasses of tuple) + if hasattr(obj, "_fields") and isinstance(obj, tuple): + logger.debug(f"Processing named tuple of type {type(obj).__name__}") + # For namedtuples, convert to dict and then process + d = {field: getattr(obj, field) for field in obj._fields} # type: ignore + return process_structure(d, visited) + + # Handle mappings (dict-like objects) + if isinstance(obj, Mapping): + # Process both keys and values + processed_items = [ + ( + process_structure(k, visited), + process_structure(v, visited), + ) + for k, v in obj.items() + ] + + # Sort by the processed keys for deterministic order + processed_items.sort(key=lambda x: str(x[0])) + + # Create a new dictionary with string keys based on processed keys + # TODO: consider checking for possibly problematic values in processed_k + # and issue a warning + return { + str(processed_k): processed_v + for processed_k, processed_v in processed_items + } + + # Handle sets and frozensets + if isinstance(obj, (set, frozenset)): + logger.debug( + f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" + ) + # Process each item first, then sort the processed results + processed_items = [process_structure(item, visited) for item in obj] + return sorted(processed_items, key=str) + + # Handle collections (list-like objects) + if isinstance(obj, Collection): + logger.debug( + f"Processing collection of type {type(obj).__name__} with {len(obj)} items" + ) + return [process_structure(item, visited) for item in obj] + + # For functions, use the function_content_hash + if callable(obj) and hasattr(obj, "__code__"): + logger.debug(f"Processing function: {getattr(obj, '__name__')}") + if function_info_extractor is not None: + # Use the extractor to get a stable representation + function_info = function_info_extractor.extract_function_info(obj) + logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") + + # simply return the function info as a stable representation + return function_info + else: + raise ValueError( + f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" + ) + + # handle data types + if isinstance(obj, type): + logger.debug(f"Processing class/type: {obj.__name__}") + return f"type:{obj.__name__}" + + # For other objects, attempt to create deterministic representation only if force_hash=True + class_name = obj.__class__.__name__ + module_name = obj.__class__.__module__ + if force_hash: + try: + import re + + logger.debug( + f"Processing generic object of type {module_name}.{class_name}" + ) + + # Try to get a stable dict representation if possible + if hasattr(obj, "__dict__"): + # Sort attributes to ensure stable order + attrs = sorted( + (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") + ) + # Limit to first 10 attributes to avoid extremely long representations + if len(attrs) > 10: + logger.debug( + f"Object has {len(attrs)} attributes, limiting to first 10" + ) + attrs = attrs[:10] + attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] + obj_repr = f"{{{', '.join(attr_strs)}}}" + else: + # Get basic repr but remove memory addresses + logger.debug( + "Object has no __dict__, using repr() with memory address removal" + ) + obj_repr = repr(obj) + if len(obj_repr) > 1000: + logger.debug( + f"Object repr is {len(obj_repr)} chars, truncating to 1000" + ) + obj_repr = obj_repr[:1000] + "..." 
+ # Remove memory addresses which look like '0x7f9a1c2b3d4e' + obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) + + return f"{module_name}.{class_name}:{obj_repr}" + except Exception as e: + # Last resort - use class name only + logger.warning(f"Failed to process object representation: {e}") + try: + return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" + except AttributeError: + logger.error("Could not determine object class, using UnknownObject") + return "UnknownObject" + else: + raise ValueError( + f"Processing of {obj} of type {module_name}.{class_name} is not supported" + ) diff --git a/src/orcapod/hashing/defaults.py b/src/orcapod/hashing/defaults.py index c9e404b..2006761 100644 --- a/src/orcapod/hashing/defaults.py +++ b/src/orcapod/hashing/defaults.py @@ -3,7 +3,8 @@ from orcapod.protocols import hashing_protocols as hp from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.hashing.object_hashers import LegacyObjectHasher + +# from orcapod.hashing.object_hashers import LegacyObjectHasher from orcapod.hashing.function_info_extractors import FunctionInfoExtractorFactory from orcapod.hashing.versioned_hashers import ( get_versioned_semantic_arrow_hasher, @@ -36,13 +37,13 @@ def get_default_object_hasher() -> hp.ObjectHasher: return object_hasher -def get_legacy_object_hasher() -> hp.ObjectHasher: - function_info_extractor = ( - FunctionInfoExtractorFactory.create_function_info_extractor( - strategy="signature" - ) - ) - return LegacyObjectHasher(function_info_extractor=function_info_extractor) +# def get_legacy_object_hasher() -> hp.ObjectHasher: +# function_info_extractor = ( +# FunctionInfoExtractorFactory.create_function_info_extractor( +# strategy="signature" +# ) +# ) +# return LegacyObjectHasher(function_info_extractor=function_info_extractor) # def get_default_composite_file_hasher(with_cache=True) -> LegacyCompositeFileHasher: diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index d5fc761..fd3cd81 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,10 +1,9 @@ -from orcapod.hashing import legacy_core from orcapod.hashing.hash_utils import hash_file from orcapod.protocols.hashing_protocols import ( FileContentHasher, StringCacher, ) -from orcapod.types import PathLike, PathSet, PacketLike +from orcapod.types import PathLike class BasicFileHasher: diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 904d55e..4fb0c13 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -1,6 +1,9 @@ from typing import Any -from .function_info_extractors import FunctionInfoExtractor -from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.protocols.hashing_protocols import ( + ContentIdentifiable, + ObjectHasher, + FunctionInfoExtractor, +) import logging import json from uuid import UUID @@ -33,8 +36,10 @@ def serialize_through_json(processed_obj) -> bytes: def process_structure( obj: Any, visited: set[int] | None = None, + object_hasher: ObjectHasher | None = None, function_info_extractor: FunctionInfoExtractor | None = None, - force_hash: bool = False, + compressed: bool = False, + force_hash: bool = True, ) -> Any: """ Recursively process a structure to prepare it for hashing. 
@@ -79,10 +84,22 @@ def process_structure( logger.debug( f"Processing ContentHashableBase instance of type {type(obj).__name__}" ) - # replace the object with expanded identity structure and re-process - return process_structure( - obj.identity_structure(), visited, function_info_extractor - ) + if compressed: + # if compressed, the content identifiable object is immediately replaced with + # its hashed string identity + if object_hasher is None: + raise ValueError( + "ObjectHasher must be provided to hash ContentIdentifiable objects with compressed=True" + ) + return object_hasher.hash_to_hex(obj.identity_structure(), compressed=True) + else: + # if not compressed, replace the object with expanded identity structure and re-process + return process_structure( + obj.identity_structure(), + visited, + object_hasher=object_hasher, + function_info_extractor=function_info_extractor, + ) # Handle basic types if isinstance(obj, (str, int, float, bool)): @@ -110,15 +127,33 @@ def process_structure( logger.debug(f"Processing named tuple of type {type(obj).__name__}") # For namedtuples, convert to dict and then process d = {field: getattr(obj, field) for field in obj._fields} # type: ignore - return process_structure(d, visited, function_info_extractor) + return process_structure( + d, + visited, + object_hasher=object_hasher, + function_info_extractor=function_info_extractor, + compressed=compressed, + ) # Handle mappings (dict-like objects) if isinstance(obj, Mapping): # Process both keys and values processed_items = [ ( - process_structure(k, visited, function_info_extractor), - process_structure(v, visited, function_info_extractor), + process_structure( + k, + visited, + object_hasher=object_hasher, + function_info_extractor=function_info_extractor, + compressed=compressed, + ), + process_structure( + v, + visited, + object_hasher=object_hasher, + function_info_extractor=function_info_extractor, + compressed=compressed, + ), ) for k, v in obj.items() ] @@ -141,7 +176,14 @@ def process_structure( ) # Process each item first, then sort the processed results processed_items = [ - process_structure(item, visited, function_info_extractor) for item in obj + process_structure( + item, + visited, + object_hasher=object_hasher, + function_info_extractor=function_info_extractor, + compressed=compressed, + ) + for item in obj ] return sorted(processed_items, key=str) @@ -151,7 +193,14 @@ def process_structure( f"Processing collection of type {type(obj).__name__} with {len(obj)} items" ) return [ - process_structure(item, visited, function_info_extractor) for item in obj + process_structure( + item, + visited, + object_hasher=object_hasher, + function_info_extractor=function_info_extractor, + compressed=compressed, + ) + for item in obj ] # For functions, use the function_content_hash @@ -231,9 +280,12 @@ def process_structure( def hash_object( obj: Any, function_info_extractor: FunctionInfoExtractor | None = None, + compressed: bool = False, ) -> bytes: # Process the object to handle nested structures and HashableMixin instances - processed = process_structure(obj, function_info_extractor=function_info_extractor) + processed = process_structure( + obj, function_info_extractor=function_info_extractor, compressed=compressed + ) # Serialize the processed structure json_str = json.dumps(processed, sort_keys=True, separators=(",", ":")).encode( diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 2a92f69..956fd11 100644 --- a/src/orcapod/hashing/object_hashers.py +++ 
b/src/orcapod/hashing/object_hashers.py @@ -1,17 +1,24 @@ -from orcapod.protocols.hashing_protocols import FunctionInfoExtractor -from orcapod.hashing import legacy_core -from orcapod.hashing import hash_utils +from collections.abc import Collection, Mapping +import hashlib +import json +from orcapod.protocols import hashing_protocols as hp from typing import Any import uuid from abc import ABC, abstractmethod +import logging +from pathlib import Path +from uuid import UUID + +logger = logging.getLogger(__name__) class ObjectHasherBase(ABC): @abstractmethod def hash(self, obj: object) -> bytes: ... + @property @abstractmethod - def get_hasher_id(self) -> str: ... + def hasher_id(self) -> str: ... def hash_to_hex( self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False @@ -27,7 +34,7 @@ def hash_to_hex( ) hex_str = hex_str[:char_count] if prefix_hasher_id: - hex_str = self.get_hasher_id() + "@" + hex_str + hex_str = self.hasher_id + "@" + hex_str return hex_str def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: @@ -45,7 +52,9 @@ def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: return int(hex_hash, 16) def hash_to_uuid( - self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID + self, + obj: Any, + namespace: uuid.UUID = uuid.NAMESPACE_OID, ) -> uuid.UUID: """Convert hash to proper UUID5.""" return uuid.uuid5(namespace, self.hash(obj)) @@ -59,66 +68,233 @@ class BasicObjectHasher(ObjectHasherBase): def __init__( self, hasher_id: str, - function_info_extractor: FunctionInfoExtractor | None = None, + function_info_extractor: hp.FunctionInfoExtractor | None = None, ): self._hasher_id = hasher_id self.function_info_extractor = function_info_extractor - def get_hasher_id(self) -> str: + @property + def hasher_id(self) -> str: return self._hasher_id - def hash(self, obj: object) -> bytes: + def process_structure( + self, + obj: Any, + visited: set[int] | None = None, + force_hash: bool = True, + ) -> Any: """ - Hash an object to a byte representation. + Recursively process a structure to prepare it for hashing. Args: - obj (object): The object to hash. + obj: The object or structure to process + visited: Set of object ids already visited (to handle circular references) + function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation Returns: - bytes: The byte representation of the hash. + A processed version of the structure suitable for stable hashing """ - return hash_utils.hash_object( - obj, function_info_extractor=self.function_info_extractor - ) + # Initialize the visited set if this is the top-level call + if visited is None: + visited = set() + else: + visited = visited.copy() # Copy to avoid modifying the original set + # Check for circular references - use object's memory address + # NOTE: While id() is not stable across sessions, we only use it within a session + # to detect circular references, not as part of the final hash + obj_id = id(obj) + if obj_id in visited: + logger.debug( + f"Detected circular reference for object of type {type(obj).__name__}" + ) + return "CircularRef" # Don't include the actual id in hash output -class LegacyObjectHasher(ObjectHasherBase): - """ - Legacy object hasher that returns the string representation of the object. 
+ # For objects that could contain circular references, add to visited + if isinstance(obj, (dict, list, tuple, set)) or not isinstance( + obj, (str, int, float, bool, type(None)) + ): + visited.add(obj_id) - Note that this is "legacy" in the sense that it is not recommended for use in new code. - It is provided for compatibility with existing code that relies on this behavior. - Namely, this algorithm makes use of the - """ + # Handle None + if obj is None: + return None - def __init__( - self, - function_info_extractor: FunctionInfoExtractor | None = None, - ): - """ - Initializes the hasher with an optional function info extractor. + # TODO: currently using runtime_checkable on ContentIdentifiable protocol + # Re-evaluate this strategy to see if a faster / more robust check could be used + if isinstance(obj, hp.ContentIdentifiable): + logger.debug( + f"Processing ContentHashableBase instance of type {type(obj).__name__}" + ) + return self._hash_object(obj.identity_structure(), visited=visited).hex() - Args: - function_info_extractor (FunctionInfoExtractor | None): Optional extractor for function information. This must be provided if an object containing function information is to be hashed. - """ - self.function_info_extractor = function_info_extractor + # Handle basic types + if isinstance(obj, (str, int, float, bool)): + return obj - def get_hasher_id(self) -> str: - """ - Returns a unique identifier/name assigned to the hasher - """ - return "legacy_object_hasher" + # Handle bytes and bytearray + if isinstance(obj, (bytes, bytearray)): + logger.debug( + f"Converting bytes/bytearray of length {len(obj)} to hex representation" + ) + return obj.hex() - def hash(self, obj: object) -> bytes: - """ - Hash an object to a byte representation. + # Handle Path objects + if isinstance(obj, Path): + logger.debug(f"Converting Path object to string: {obj}") + raise NotImplementedError( + "Path objects are not supported in this hasher. Please convert to string." + ) + return str(obj) - Args: - obj (object): The object to hash. + # Handle UUID objects + if isinstance(obj, UUID): + logger.debug(f"Converting UUID to string: {obj}") + raise NotImplementedError( + "UUID objects are not supported in this hasher. Please convert to string." + ) + return str(obj) - Returns: - bytes: The byte representation of the hash. 
- """ - return legacy_core.legacy_hash( - obj, function_info_extractor=self.function_info_extractor + # Handle named tuples (which are subclasses of tuple) + if hasattr(obj, "_fields") and isinstance(obj, tuple): + logger.debug(f"Processing named tuple of type {type(obj).__name__}") + # For namedtuples, convert to dict and then process + d = {field: getattr(obj, field) for field in obj._fields} # type: ignore + return self.process_structure(d, visited) + + # Handle mappings (dict-like objects) + if isinstance(obj, Mapping): + # Process both keys and values + processed_items = [ + ( + self.process_structure(k, visited), + self.process_structure(v, visited), + ) + for k, v in obj.items() + ] + + # Sort by the processed keys for deterministic order + processed_items.sort(key=lambda x: str(x[0])) + + # Create a new dictionary with string keys based on processed keys + # TODO: consider checking for possibly problematic values in processed_k + # and issue a warning + return { + str(processed_k): processed_v + for processed_k, processed_v in processed_items + } + + # Handle sets and frozensets + if isinstance(obj, (set, frozenset)): + logger.debug( + f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" + ) + # Process each item first, then sort the processed results + processed_items = [self.process_structure(item, visited) for item in obj] + return sorted(processed_items, key=str) + + # Handle collections (list-like objects) + if isinstance(obj, Collection): + logger.debug( + f"Processing collection of type {type(obj).__name__} with {len(obj)} items" + ) + return [self.process_structure(item, visited) for item in obj] + + # For functions, use the function_content_hash + if callable(obj) and hasattr(obj, "__code__"): + logger.debug(f"Processing function: {getattr(obj, '__name__')}") + if self.function_info_extractor is not None: + # Use the extractor to get a stable representation + function_info = self.function_info_extractor.extract_function_info(obj) + logger.debug( + f"Extracted function info: {function_info} for {obj.__name__}" + ) + + # simply return the function info as a stable representation + return function_info + else: + raise ValueError( + f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" + ) + + # handle data types + if isinstance(obj, type): + logger.debug(f"Processing class/type: {obj.__name__}") + return f"type:{obj.__name__}" + + # For other objects, attempt to create deterministic representation only if force_hash=True + class_name = obj.__class__.__name__ + module_name = obj.__class__.__module__ + if force_hash: + try: + import re + + logger.debug( + f"Processing generic object of type {module_name}.{class_name}" + ) + + # Try to get a stable dict representation if possible + if hasattr(obj, "__dict__"): + # Sort attributes to ensure stable order + attrs = sorted( + (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") + ) + # Limit to first 10 attributes to avoid extremely long representations + if len(attrs) > 10: + logger.debug( + f"Object has {len(attrs)} attributes, limiting to first 10" + ) + attrs = attrs[:10] + attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] + obj_repr = f"{{{', '.join(attr_strs)}}}" + else: + # Get basic repr but remove memory addresses + logger.debug( + "Object has no __dict__, using repr() with memory address removal" + ) + obj_repr = repr(obj) + if len(obj_repr) > 1000: + logger.debug( + f"Object repr is {len(obj_repr)} chars, truncating to 1000" + ) + obj_repr = 
obj_repr[:1000] + "..." + # Remove memory addresses which look like '0x7f9a1c2b3d4e' + obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) + + return f"{module_name}.{class_name}:{obj_repr}" + except Exception as e: + # Last resort - use class name only + logger.warning(f"Failed to process object representation: {e}") + try: + return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" + except AttributeError: + logger.error( + "Could not determine object class, using UnknownObject" + ) + return "UnknownObject" + else: + raise ValueError( + f"Processing of {obj} of type {module_name}.{class_name} is not supported" + ) + + def _hash_object( + self, + obj: Any, + visited: set[int] | None = None, + ) -> bytes: + # Process the object to handle nested structures and HashableMixin instances + processed = self.process_structure(obj, visited=visited) + + # Serialize the processed structure + json_str = json.dumps(processed, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + logger.debug( + f"Successfully serialized {type(obj).__name__} using custom serializer" ) + + # Create the hash + return hashlib.sha256(json_str).digest() + + def hash(self, obj: object) -> bytes: + return self._hash_object(obj) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index fdf03b7..c587380 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -271,12 +271,17 @@ def call( self, tag: dp.Tag, packet: dp.Packet, + record_id: str | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: + if record_id is None: + record_id = self.get_record_id(packet) + tag, output_packet = super().call( tag, packet, + record_id=record_id, skip_cache_lookup=skip_cache_lookup, skip_cache_insert=skip_cache_insert, ) @@ -286,11 +291,23 @@ def call( output_packet.get_meta_value(self.DATA_RETRIEVED_FLAG) is not None ) # add pipeline record if the output packet is not None - self.add_pipeline_record(tag, packet, retrieved=retrieved) + # TODO: verify cache lookup logic + self.add_pipeline_record( + tag, + packet, + record_id, + retrieved=retrieved, + skip_cache_lookup=skip_cache_lookup, + ) return tag, output_packet def add_pipeline_record( - self, tag: dp.Tag, input_packet: dp.Packet, retrieved: bool | None = None + self, + tag: dp.Tag, + input_packet: dp.Packet, + packet_record_id: str, + retrieved: bool | None = None, + skip_cache_lookup: bool = False, ) -> None: # combine dp.Tag with packet content hash to compute entry hash tag_with_hash = tag.as_table().append_column( @@ -302,10 +319,12 @@ def add_pipeline_record( tag_with_hash, prefix_hasher_id=True ) - existing_record = self.pipeline_store.get_record_by_id( - self.pipeline_path, - entry_id, - ) + existing_record = None + if not skip_cache_lookup: + existing_record = self.pipeline_store.get_record_by_id( + self.pipeline_path, + entry_id, + ) if existing_record is not None: # if the record already exists, then skip @@ -317,9 +336,7 @@ def add_pipeline_record( ) .append_column( constants.PACKET_RECORD_ID, - pa.array( - [self.pod.get_record_id(input_packet)], type=pa.large_string() - ), + pa.array([packet_record_id], type=pa.large_string()), ) .append_column( f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", @@ -362,7 +379,11 @@ def get_all_records( # hack - use polars for join as it can deal with complex data type # TODO: convert the entire load logic to use polars with lazy evaluation - joined_info = 
pl.DataFrame(taginfo).join(pl.DataFrame(results), on=constants.PACKET_RECORD_ID, how="inner").to_arrow() + joined_info = ( + pl.DataFrame(taginfo) + .join(pl.DataFrame(results), on=constants.PACKET_RECORD_ID, how="inner") + .to_arrow() + ) # joined_info = taginfo.join( # results, diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index b8a4e0d..e3fc5b9 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1575,7 +1575,9 @@ def output_packet_types(self) -> TypeSpec: """ ... - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: + def call( + self, tag: Tag, packet: Packet, record_id: str | None = None + ) -> tuple[Tag, Packet | None]: """ Process a single packet with its associated tag. @@ -1613,6 +1615,7 @@ def call( self, tag: Tag, packet: Packet, + record_id: str | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[Tag, Packet | None]: diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 27c1e78..ca19512 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -25,6 +25,16 @@ def identity_structure(self) -> Any: """ ... + def content_hash(self) -> bytes: + """ + Compute a hash based on the content of this object. + + Returns: + bytes: A byte representation of the hash based on the content. + If no identity structure is provided, return None. + """ + ... + def __eq__(self, other: object) -> bool: """ Equality check that compares the identity structures of two objects. @@ -52,7 +62,7 @@ class ObjectHasher(Protocol): """Protocol for general object hashing.""" # TODO: consider more explicitly stating types of objects accepted - def hash(self, obj: Any, compressed: bool = False) -> bytes: + def hash(self, obj: Any) -> bytes: """ Hash an object to a byte representation. Object hasher must be able to handle ContentIdentifiable objects to hash them based on their @@ -68,7 +78,8 @@ def hash(self, obj: Any, compressed: bool = False) -> bytes: """ ... - def get_hasher_id(self) -> str: + @property + def hasher_id(self) -> str: """ Returns a unique identifier/name assigned to the hasher """ @@ -78,13 +89,10 @@ def hash_to_hex( self, obj: Any, char_count: int | None = None, - compressed: bool = False, prefix_hasher_id: bool = True, ) -> str: ... - def hash_to_int( - self, obj: Any, hexdigits: int = 16, compressed: bool = False - ) -> int: + def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: """ Hash an object to an integer. @@ -101,7 +109,6 @@ def hash_to_uuid( self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID, - comrpressed: bool = False, ) -> uuid.UUID: ... 
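Note (added for clarity, not part of the patch): the hunk above tightens the ObjectHasher protocol — the `compressed` flags are dropped from hash/hash_to_hex/hash_to_int/hash_to_uuid (taking the misspelled `comrpressed` parameter with them), and `get_hasher_id()` becomes a read-only `hasher_id` property. Below is a minimal, self-contained sketch of a hasher with that shape; the class name and the JSON/SHA-256 serialization are illustrative assumptions, only the method signatures mirror the protocol in this diff.

import hashlib
import json
import uuid
from typing import Any


class SketchObjectHasher:
    """Illustrative ObjectHasher-style implementation (assumed names, not orcapod code)."""

    def __init__(self, hasher_id: str = "sketch_sha256_v0") -> None:
        self._hasher_id = hasher_id

    @property
    def hasher_id(self) -> str:
        # Exposed as a property after this patch, not via get_hasher_id().
        return self._hasher_id

    def hash(self, obj: Any) -> bytes:
        # Deterministic JSON serialization followed by SHA-256; the real implementation
        # first pre-processes nested structures and ContentIdentifiable objects.
        payload = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8")
        return hashlib.sha256(payload).digest()

    def hash_to_hex(
        self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = True
    ) -> str:
        hex_str = self.hash(obj).hex()
        if char_count is not None:
            hex_str = hex_str[:char_count]
        # Prefixing with hasher_id yields strings like "sketch_sha256_v0@3de5f8a7...".
        return f"{self.hasher_id}@{hex_str}" if prefix_hasher_id else hex_str

    def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int:
        return int(self.hash(obj).hex()[:hexdigits], 16)

    def hash_to_uuid(self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID) -> uuid.UUID:
        return uuid.uuid5(namespace, self.hash(obj).hex())


hasher = SketchObjectHasher()
print(hasher.hash_to_hex({"value": 42}, char_count=16))  # e.g. "sketch_sha256_v0@<16 hex chars>"
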
diff --git a/src/orcapod/semantic_types/__init__.py b/src/orcapod/semantic_types/__init__.py index 31c664e..3d09b1b 100644 --- a/src/orcapod/semantic_types/__init__.py +++ b/src/orcapod/semantic_types/__init__.py @@ -1,7 +1,9 @@ from .semantic_registry import SemanticTypeRegistry from .universal_converter import UniversalTypeConverter +from .type_inference import infer_schema_from_pylist_data __all__ = [ "SemanticTypeRegistry", "UniversalTypeConverter", + "infer_schema_from_pylist_data", ] diff --git a/src/orcapod/semantic_types/type_inference.py b/src/orcapod/semantic_types/type_inference.py index e8519f4..bc27a8d 100644 --- a/src/orcapod/semantic_types/type_inference.py +++ b/src/orcapod/semantic_types/type_inference.py @@ -2,7 +2,8 @@ def infer_schema_from_pylist_data( - data: list[dict], default_type=str + data: list[dict], + default_type: type = str, ) -> dict[str, type]: """ Infer schema from sample data (best effort). @@ -146,10 +147,10 @@ def _infer_set_type(sets: list, set_type: type) -> type: all_elements.extend(s) if not all_elements: - return set_type[Any] + return set_type[Any] # type: ignore[return-value] element_type = _infer_type_from_values(all_elements) - return set_type[element_type] + return set_type[element_type] # type: ignore[return-value] def _infer_dict_type(dicts: list[dict]) -> type: diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index 8362e8e..1ec711c 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -141,21 +141,6 @@ def python_schema_to_arrow_schema( return pa.schema(fields) - def infer_python_schema_from_data(self, python_data: Any) -> type: - """ - Infer Python schema from data, returning a TypedDict type. - - This is useful for dynamic data structures where the schema is not known in advance. - """ - if not isinstance(python_data, dict): - raise ValueError("Expected a dictionary to infer schema") - - field_specs = {} - for key, value in python_data.items(): - field_specs[key] = type(value) - - return TypedDict("DynamicSchema", field_specs) # type: ignore[call-arg] - def arrow_type_to_python_type(self, arrow_type: pa.DataType) -> type: """ Convert Arrow type to Python type hint with caching. @@ -186,11 +171,11 @@ def arrow_schema_to_python_schema(self, arrow_schema: pa.Schema) -> dict[str, ty return python_schema - def python_dicts_to_arrow_table( + def python_dict_to_struct_dict( self, python_dicts: list[dict[str, Any]], python_schema: dict[str, type] | None = None, - ) -> pa.Table: + ) -> list[dict[str, Any]]: """ Convert a list of Python dictionaries to an Arrow table. @@ -214,10 +199,57 @@ def python_dicts_to_arrow_table( converted_record[field_name] = None converted_data.append(converted_record) + return converted_data + + def struct_dict_to_python_dict( + self, + struct_dict: list[dict[str, Any]], + arrow_schema: pa.Schema, + ) -> list[dict[str, Any]]: + """ + Convert a list of Arrow structs to Python dictionaries. + + This uses the main conversion logic and caches results for performance. 
+ """ + + converters = { + field.name: self.get_arrow_to_python_converter(field.type) + for field in arrow_schema + } + + converted_data = [] + for record in struct_dict: + converted_record = {} + for field_name, converter in converters.items(): + if field_name in record: + converted_record[field_name] = converter(record[field_name]) + else: + converted_record[field_name] = None + converted_data.append(converted_record) + + return converted_data + + def python_dicts_to_arrow_table( + self, + python_dicts: list[dict[str, Any]], + python_schema: dict[str, type] | None = None, + ) -> pa.Table: + """ + Convert a list of Python dictionaries to an Arrow table. + + This uses the main conversion logic and caches results for performance. + """ + if python_schema is None: + python_schema = infer_schema_from_pylist_data(python_dicts) + + struct_dict = self.python_dict_to_struct_dict( + python_dicts, python_schema=python_schema + ) + # Convert to Arrow schema arrow_schema = self.python_schema_to_arrow_schema(python_schema) - return pa.Table.from_pylist(converted_data, schema=arrow_schema) + return pa.Table.from_pylist(struct_dict, schema=arrow_schema) def arrow_table_to_python_dicts( self, arrow_table: pa.Table diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index e897367..41c09b5 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -263,7 +263,7 @@ def _get_existing_ids(self, record_path: tuple[str, ...]) -> set[str]: record_key = self._get_record_key(record_path) if ( self._cache_dirty.get(record_key) - or self._delta_table_cache.get(record_key) is None + or record_key not in self._delta_table_cache ): self._refresh_existing_ids_cache(record_path) return self._existing_ids_cache.get(record_key) or set() @@ -346,7 +346,7 @@ def add_records( return None else: # Check for conflicts - insert never allows duplicates when skip_duplicates=False - self._check_all_conflicts(record_path, deduplicated_table) + # self._check_all_conflicts(record_path, deduplicated_table) filtered_table = deduplicated_table # Step 4: Handle schema compatibility diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 74e5c71..c728338 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -87,6 +87,63 @@ def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: return pa.schema(merged_fields) +def select_table_columns_with_prefixes( + table: pa.Table, prefix: str | Collection[str] +) -> pa.Table: + """ + Select columns from a PyArrow table that start with a specific prefix. + + Args: + table (pa.Table): The original table. + prefix (str): The prefix to filter column names. + + Returns: + pa.Table: New table containing only the columns with the specified prefix. + """ + if isinstance(prefix, str): + prefix = [prefix] + selected_columns = [ + col for col in table.column_names if any(col.startswith(p) for p in prefix) + ] + return table.select(selected_columns) + + +def select_schema_columns_with_prefixes( + schema: pa.Schema, prefix: str | Collection[str] +) -> pa.Schema: + """ + Select columns from an Arrow schema that start with a specific prefix. + + Args: + schema (pa.Schema): The original schema. + prefix (str): The prefix to filter column names. + + Returns: + pa.Schema: New schema containing only the columns with the specified prefix. 
+ """ + if isinstance(prefix, str): + prefix = [prefix] + selected_fields = [ + field for field in schema if any(field.name.startswith(p) for p in prefix) + ] + return pa.schema(selected_fields) + + +def select_arrow_schema(schema: pa.Schema, columns: Collection[str]) -> pa.Schema: + """ + Select specific columns from an Arrow schema. + + Args: + schema (pa.Schema): The original schema. + columns (Collection[str]): List of column names to select. + + Returns: + pa.Schema: New schema containing only the specified columns. + """ + selected_fields = [field for field in schema if field.name in columns] + return pa.schema(selected_fields) + + def hstack_tables(*tables: pa.Table) -> pa.Table: """ Horizontally stack multiple PyArrow tables by concatenating their columns. From 2914d88e0df831f297ea46eb6d8b19f4721281bb Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 4 Aug 2025 06:52:38 +0000 Subject: [PATCH 169/224] doc: update tutorial notebook with explanation on operator methods and complex data types --- .../01_quick_dive_into_orcapod.ipynb | 182 +++++++++++++++++- 1 file changed, 181 insertions(+), 1 deletion(-) diff --git a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb index b09f745..5b44850 100644 --- a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb +++ b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb @@ -10,6 +10,186 @@ "import orcapod as op" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "df42576d", + "metadata": {}, + "outputs": [], + "source": [ + "from orcapod.data.datagrams import DictDatagram, ArrowDatagram, DictPacket, ArrowPacket" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c1dfc4d1", + "metadata": {}, + "outputs": [], + "source": [ + "data = {\n", + " \"name\": \"orcapod\",\n", + " \"__something\": \"there\",\n", + " \"_another_kind\": 5,\n", + " \"value\": 42,\n", + " \"_source_value\": \"Japan\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "45e2f6e2", + "metadata": {}, + "outputs": [], + "source": [ + "dict_datagram = DictDatagram(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "01b060b6", + "metadata": {}, + "outputs": [], + "source": [ + "table = dict_datagram.as_table(include_all_info=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5e8a867e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'orcapod'}\n", + "{'_another_kind': 5, 'value': 42}\n" + ] + } + ], + "source": [ + "stream = op.streams.ImmutableTableStream(table, tag_columns=[\"name\"])\n", + "\n", + "for t, p in stream:\n", + " print(t)\n", + " print(p)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "10f029b5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_another_kind': None, 'value': 'Japan'}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p.source_info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0f9edd9", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'stream' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, 
line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mstream\u001b[49m.as_table(include_source=\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "\u001b[31mNameError\u001b[39m: name 'stream' is not defined" + ] + } + ], + "source": [ + "stream.as_table(include_source=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "9bc4346b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_another_kind': 5,\n", + " 'value': 42,\n", + " '_context_key': 'std:v0.1.0:default',\n", + " '__something': 'there',\n", + " '_source__another_kind': None,\n", + " '_source_value': None}" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p.as_dict(include_all_info=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "ffd88de9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ArrowPacket(data={'_another_kind': 5, 'value': 42}, meta_columns=1, context='std:v0.1.0:default')" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "93b7638a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_another_kind': None, 'value': None}" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p.source_info()" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -854,7 +1034,7 @@ ], "metadata": { "kernelspec": { - "display_name": "orcapod (3.13.3)", + "display_name": ".venv", "language": "python", "name": "python3" }, From 44ead2b06e76ac0d8b34928de8611871999a1539 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 4 Aug 2025 07:07:36 +0000 Subject: [PATCH 170/224] doc: add as_df to streams --- .../01_orcapod_quick_exploration.ipynb | 1082 +++++++++++++++-- src/orcapod/data/streams.py | 34 +- src/orcapod/protocols/data_protocols.py | 11 + 3 files changed, 1017 insertions(+), 110 deletions(-) diff --git a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb index d0eb591..16169a9 100644 --- a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb +++ b/notebooks/tutorials/01_orcapod_quick_exploration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "id": "27cdd37d", "metadata": {}, "outputs": [], @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "id": "e6a9e8b6", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "id": "420477e8", "metadata": {}, "outputs": [], @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "id": "dab6bf9c", "metadata": {}, "outputs": [], @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 75, "id": "cd0394d8", "metadata": {}, "outputs": [], @@ -124,10 +124,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "id": "2d4a0812", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'a': 1, 'b': 'x'}, Packet: {'c': True, 'd': 1.1}\n", + "Tag: {'a': 2, 'b': 'y'}, Packet: {'c': False, 'd': 2.2}\n", + "Tag: {'a': 3, 'b': 'z'}, Packet: {'c': True, 'd': 3.3}\n" + ] + } + ], "source": [ "for tag, packet in stream:\n", " 
print(f\"Tag: {tag}, Packet: {packet}\")" @@ -143,10 +153,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "id": "79e67bfc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[({'a': 1, 'b': 'x'}, {'c': True, 'd': 1.1}),\n", + " ({'a': 2, 'b': 'y'}, {'c': False, 'd': 2.2}),\n", + " ({'a': 3, 'b': 'z'}, {'c': True, 'd': 3.3})]" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stream.flow()" ] @@ -156,17 +179,47 @@ "id": "20fa500e", "metadata": {}, "source": [ - "Every stream can be converted into a table with `as_table()` method" + "Every stream can be converted into a Polars dataframe with `as_df()` method" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "id": "52baee9c", "metadata": {}, - "outputs": [], - "source": [ - "stream.as_table()" + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 4)
abcd
i64strboolf64
1"x"true1.1
2"y"false2.2
3"z"true3.3
" + ], + "text/plain": [ + "shape: (3, 4)\n", + "┌─────┬─────┬───────┬─────┐\n", + "│ a ┆ b ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ bool ┆ f64 │\n", + "╞═════╪═════╪═══════╪═════╡\n", + "│ 1 ┆ x ┆ true ┆ 1.1 │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 │\n", + "└─────┴─────┴───────┴─────┘" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_df()" ] }, { @@ -174,7 +227,7 @@ "id": "a7b29786", "metadata": {}, "source": [ - "Optionally, you can pass in arguments to `as_table` to have system columns included in the table" + "Optionally, you can pass in arguments to `as_df` to have system columns included in the table" ] }, { @@ -187,12 +240,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "id": "4648fbe9", "metadata": {}, - "outputs": [], - "source": [ - "stream.as_table(include_source=True)" + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 6)
abcd_source_c_source_d
i64strboolf64strstr
1"x"true1.1nullnull
2"y"false2.2nullnull
3"z"true3.3nullnull
" + ], + "text/plain": [ + "shape: (3, 6)\n", + "┌─────┬─────┬───────┬─────┬───────────┬───────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ _source_c ┆ _source_d │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ bool ┆ f64 ┆ str ┆ str │\n", + "╞═════╪═════╪═══════╪═════╪═══════════╪═══════════╡\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ null ┆ null │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ null ┆ null │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ null ┆ null │\n", + "└─────┴─────┴───────┴─────┴───────────┴───────────┘" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_df(include_source=True)" ] }, { @@ -205,12 +288,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "id": "001b2a9c", "metadata": {}, - "outputs": [], - "source": [ - "stream.as_table(include_content_hash=True)" + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 5)
abcd_content_hash
i64strboolf64str
1"x"true1.1"arrow_v0.1@3de5f8a7b9a2fe5e6cc…
2"y"false2.2"arrow_v0.1@cc022b33fc80a6639d2…
3"z"true3.3"arrow_v0.1@b0bb7434c813b4d5d7c…
" + ], + "text/plain": [ + "shape: (3, 5)\n", + "┌─────┬─────┬───────┬─────┬─────────────────────────────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ _content_hash │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", + "╞═════╪═════╪═══════╪═════╪═════════════════════════════════╡\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@3de5f8a7b9a2fe5e6cc… │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@cc022b33fc80a6639d2… │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@b0bb7434c813b4d5d7c… │\n", + "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_df(include_content_hash=True)" ] }, { @@ -223,12 +336,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "id": "d3b9e394", "metadata": {}, - "outputs": [], - "source": [ - "stream.as_table(include_content_hash=\"my_hash_values\")" + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 5)
abcdmy_hash_values
i64strboolf64str
1"x"true1.1"arrow_v0.1@3de5f8a7b9a2fe5e6cc…
2"y"false2.2"arrow_v0.1@cc022b33fc80a6639d2…
3"z"true3.3"arrow_v0.1@b0bb7434c813b4d5d7c…
" + ], + "text/plain": [ + "shape: (3, 5)\n", + "┌─────┬─────┬───────┬─────┬─────────────────────────────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ my_hash_values │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", + "╞═════╪═════╪═══════╪═════╪═════════════════════════════════╡\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@3de5f8a7b9a2fe5e6cc… │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@cc022b33fc80a6639d2… │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@b0bb7434c813b4d5d7c… │\n", + "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_df(include_content_hash=\"my_hash_values\")" ] }, { @@ -241,10 +384,80 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "id": "92cbfa50", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 5)
abcd_context_key
i64strboolf64str
1"x"true1.1null
2"y"false2.2null
3"z"true3.3null
" + ], + "text/plain": [ + "shape: (3, 5)\n", + "┌─────┬─────┬───────┬─────┬──────────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ _context_key │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", + "╞═════╪═════╪═══════╪═════╪══════════════╡\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ null │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ null │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ null │\n", + "└─────┴─────┴───────┴─────┴──────────────┘" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_df(include_data_context=True)" + ] + }, + { + "cell_type": "markdown", + "id": "574a2031", + "metadata": {}, + "source": [ + "If preferred you can view any stream as Arrow table by calling `as_table` method. It takes the same set of arguments as `as_df`" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "bf6acd59", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "a: int64\n", + "b: string\n", + "c: bool\n", + "d: double\n", + "_context_key: large_string\n", + "----\n", + "a: [[1,2,3]]\n", + "b: [[\"x\",\"y\",\"z\"]]\n", + "c: [[true,false,true]]\n", + "d: [[1.1,2.2,3.3]]\n", + "_context_key: [[null,null,null]]" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stream.as_table(include_data_context=True)" ] @@ -267,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "c78096a7", "metadata": {}, "outputs": [], @@ -277,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "6f8a2f0b", "metadata": {}, "outputs": [], @@ -287,20 +500,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "e1ac13b1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'a': 1, 'b': 'x'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tag" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "263fa1c5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'c': True, 'd': 1.1}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packet" ] @@ -315,40 +550,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "42158816", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tag[\"a\"]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "6a792175", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'x'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tag[\"b\"]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "a28f2051", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packet[\"c\"]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "981e6c44", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.1" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" 
+ } + ], "source": [ "packet[\"d\"]" ] @@ -363,10 +642,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "56423d2c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'c': bool, 'd': float}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Returns typespec (dictionary of key to type)\n", "packet.types()" @@ -374,10 +664,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "d5e02f81", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "('c', 'd')" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# entry names as strings\n", "packet.keys()" @@ -393,10 +694,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "b1b18ee4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "c: bool\n", + "d: double\n", + "----\n", + "c: [[true]]\n", + "d: [[1.1]]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packet.as_table()" ] @@ -411,10 +728,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "3aa4020e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "c: bool\n", + "d: double" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packet.arrow_schema()" ] @@ -429,10 +758,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "bea6c771", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'a': 1, 'b': 'x'}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tag.as_dict()" ] @@ -447,10 +787,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "92f00feb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'c': None, 'd': None}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packet.source_info()" ] @@ -465,20 +816,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "bba2bc5c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'c': True, 'd': 1.1, '_source_c': None, '_source_d': None}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packet.as_dict(include_source=True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "bd09d9d1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "c: bool\n", + "d: double\n", + "_source_c: large_string\n", + "_source_d: large_string\n", + "----\n", + "c: [[true]]\n", + "d: [[1.1]]\n", + "_source_c: [[null]]\n", + "_source_d: [[null]]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "packet.as_table(include_source=True)" ] @@ -493,10 +875,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "03219fd3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'arrow_v0.1@6e1143896d73d370757811b52ceeea8d1d456cd30206416fbf81754e1cea5568'" + ] + }, + 
"execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tag.content_hash()" ] @@ -527,7 +920,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "11ee5130", "metadata": {}, "outputs": [], @@ -562,7 +955,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "8299d4b1", "metadata": {}, "outputs": [], @@ -572,7 +965,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "dfc7ee9f", "metadata": {}, "outputs": [], @@ -589,32 +982,224 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "092abff5", + "cell_type": "markdown", + "id": "095856e3", "metadata": {}, - "outputs": [], "source": [ - "for tag, packet in joined_stream:\n", - " print(f\"Tag: {tag}, Packet: {packet}\")" + "The output of the computation is automatically cached so that as long as you access the same output stream, you won't be triggering unnecessary recomputation!" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "48ef0a8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 5)
idabcd
i64i64strboolf64
01"x"true1.1
12"y"false2.2
" + ], + "text/plain": [ + "shape: (2, 5)\n", + "┌─────┬─────┬─────┬───────┬─────┐\n", + "│ id ┆ a ┆ b ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ str ┆ bool ┆ f64 │\n", + "╞═════╪═════╪═════╪═══════╪═════╡\n", + "│ 0 ┆ 1 ┆ x ┆ true ┆ 1.1 │\n", + "│ 1 ┆ 2 ┆ y ┆ false ┆ 2.2 │\n", + "└─────┴─────┴─────┴───────┴─────┘" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "joined_stream.as_df()" ] }, { "cell_type": "markdown", - "id": "095856e3", + "id": "b979bc35", "metadata": {}, "source": [ - "The output of the computation is automatically cached so that as long as you access the same output stream, you won't be triggering unnecessary recomputation!" + "### [NEW] Convenience methods" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "48ef0a8f", + "cell_type": "markdown", + "id": "67c8c1e0", "metadata": {}, - "outputs": [], "source": [ - "joined_stream.as_table()" + "In fact, streams comes with methods to conveniently perform common operators" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "fbc58246", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 5)
idabcd
i64i64strboolf64
01"x"true1.1
12"y"false2.2
" + ], + "text/plain": [ + "shape: (2, 5)\n", + "┌─────┬─────┬─────┬───────┬─────┐\n", + "│ id ┆ a ┆ b ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ str ┆ bool ┆ f64 │\n", + "╞═════╪═════╪═════╪═══════╪═════╡\n", + "│ 0 ┆ 1 ┆ x ┆ true ┆ 1.1 │\n", + "│ 1 ┆ 2 ┆ y ┆ false ┆ 2.2 │\n", + "└─────┴─────┴─────┴───────┴─────┘" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream1.join(stream2).as_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "c6b0b571", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (2, 3)
idab
i64i64str
01"x"
12"y"
" + ], + "text/plain": [ + "shape: (2, 3)\n", + "┌─────┬─────┬─────┐\n", + "│ id ┆ a ┆ b │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ str │\n", + "╞═════╪═════╪═════╡\n", + "│ 0 ┆ 1 ┆ x │\n", + "│ 1 ┆ 2 ┆ y │\n", + "└─────┴─────┴─────┘" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream1.semi_join(stream2).as_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "5be42490", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 3)
ida_mappedb
i64i64str
01"x"
12"y"
43"z"
" + ], + "text/plain": [ + "shape: (3, 3)\n", + "┌─────┬──────────┬─────┐\n", + "│ id ┆ a_mapped ┆ b │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ str │\n", + "╞═════╪══════════╪═════╡\n", + "│ 0 ┆ 1 ┆ x │\n", + "│ 1 ┆ 2 ┆ y │\n", + "│ 4 ┆ 3 ┆ z │\n", + "└─────┴──────────┴─────┘" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream1.map_packets({\"a\": \"a_mapped\"}).as_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "c9c98304", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 3)
nameab
i64i64str
01"x"
12"y"
43"z"
" + ], + "text/plain": [ + "shape: (3, 3)\n", + "┌──────┬─────┬─────┐\n", + "│ name ┆ a ┆ b │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ str │\n", + "╞══════╪═════╪═════╡\n", + "│ 0 ┆ 1 ┆ x │\n", + "│ 1 ┆ 2 ┆ y │\n", + "│ 4 ┆ 3 ┆ z │\n", + "└──────┴─────┴─────┘" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream1.map_tags({\"id\": \"name\"}).as_df()" ] }, { @@ -635,7 +1220,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 94, "id": "35423d9a", "metadata": {}, "outputs": [], @@ -664,7 +1249,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 95, "id": "119d33a3", "metadata": {}, "outputs": [], @@ -682,7 +1267,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 96, "id": "2b3b42ff", "metadata": {}, "outputs": [], @@ -701,20 +1286,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 97, "id": "ff05a8fc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "KernelStream(kernel=FunctionPod:add_numbers(a: int, b: int)-> , upstreams=(ImmutableTableStream(table=['id', 'a', 'b'], tag_columns=('id',)),))" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "output_stream" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 98, "id": "6431180f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tag: {'id': 0}, Packet: {'sum': 11}\n", + "Tag: {'id': 1}, Packet: {'sum': 22}\n", + "Tag: {'id': 2}, Packet: {'sum': 33}\n", + "Tag: {'id': 3}, Packet: {'sum': 44}\n", + "Tag: {'id': 4}, Packet: {'sum': 55}\n" + ] + } + ], "source": [ "for t, p in output_stream:\n", " print(f\"Tag: {t}, Packet: {p}\")" @@ -728,6 +1336,107 @@ "Simple, right?" ] }, + { + "cell_type": "markdown", + "id": "4d8d7f23", + "metadata": {}, + "source": [ + "### [NEW] Pods with structured inputs and outputs" + ] + }, + { + "cell_type": "markdown", + "id": "5f7d21cc", + "metadata": {}, + "source": [ + "You can now use more complex structured data types like lists and dictionaries in your input and output for a pod" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "9b7fcbbf", + "metadata": {}, + "outputs": [], + "source": [ + "@op.function_pod(output_keys=[\"stats\"])\n", + "def compute_stats(a: int, b: int) -> dict[str, int]:\n", + " \"\"\"Compute various statistics.\"\"\"\n", + " return {\"sum\": a + b, \"difference\": a - b, \"product\": a * b}\n", + "\n", + "\n", + "@op.function_pod(output_keys=[\"message\"])\n", + "def build_message(stats: dict[str, int]) -> str:\n", + " \"\"\"Build a message from the stats.\"\"\"\n", + " return f\"Hi! 
The sum was {stats['sum']}, the difference was {stats['difference']}, and the product was {stats['product']}.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "3e496c59", + "metadata": {}, + "outputs": [], + "source": [ + "stats_output = compute_stats(input_stream)\n", + "messages = build_message(stats_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "23c0fa92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[({'id': 0}, {'stats': {'sum': 11, 'difference': -9, 'product': 10}}),\n", + " ({'id': 1}, {'stats': {'sum': 22, 'difference': -18, 'product': 40}}),\n", + " ({'id': 2}, {'stats': {'sum': 33, 'difference': -27, 'product': 90}}),\n", + " ({'id': 3}, {'stats': {'sum': 44, 'difference': -36, 'product': 160}}),\n", + " ({'id': 4}, {'stats': {'sum': 55, 'difference': -45, 'product': 250}})]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats_output.flow()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "bba7f8d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[({'id': 0},\n", + " {'message': 'Hi! The sum was 11, the difference was -9, and the product was 10.'}),\n", + " ({'id': 1},\n", + " {'message': 'Hi! The sum was 22, the difference was -18, and the product was 40.'}),\n", + " ({'id': 2},\n", + " {'message': 'Hi! The sum was 33, the difference was -27, and the product was 90.'}),\n", + " ({'id': 3},\n", + " {'message': 'Hi! The sum was 44, the difference was -36, and the product was 160.'}),\n", + " ({'id': 4},\n", + " {'message': 'Hi! The sum was 55, the difference was -45, and the product was 250.'})]" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "messages.flow()" + ] + }, { "cell_type": "markdown", "id": "04b0a24e", @@ -746,7 +1455,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 103, "id": "cb4bc91a", "metadata": {}, "outputs": [], @@ -766,7 +1475,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 104, "id": "f371822b", "metadata": {}, "outputs": [], @@ -791,10 +1500,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 105, "id": "e132fc93", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:combine_results(sum: int, product: int)-> is acting as a source!\n" + ] + } + ], "source": [ "# now defien the pipeline\n", "with pipeline:\n", @@ -813,10 +1536,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 106, "id": "cca9e0d0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PodNode(pod=FunctionPod:add_numbers)" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.add_numbers" ] @@ -839,7 +1573,7 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 107, "id": "21086f72", "metadata": {}, "outputs": [], @@ -857,7 +1591,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "id": "1e741659", "metadata": {}, "outputs": [], @@ -883,10 +1617,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 109, "id": "c77154ec", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
idsum
i64i64
011
122
233
344
455
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.add_numbers.df" ] @@ -919,7 +1685,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 110, "id": "37e65e33", "metadata": {}, "outputs": [], @@ -931,10 +1697,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "id": "3bad8332", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:combine_results(sum: int, product: int)-> is acting as a source!\n" + ] + } + ], "source": [ "# now defien the pipeline\n", "with pipeline2:\n", @@ -947,30 +1727,126 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 115, "id": "8f146ae7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
idsum
i64i64
011
122
233
344
455
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline2.my_summation.df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 116, "id": "8fd7bf4e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
idproduct
i64i64
010
140
290
3160
4250
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────────┐\n", + "│ id ┆ product │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════════╡\n", + "│ 0 ┆ 10 │\n", + "│ 1 ┆ 40 │\n", + "│ 2 ┆ 90 │\n", + "│ 3 ┆ 160 │\n", + "│ 4 ┆ 250 │\n", + "└─────┴─────────┘" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline2.my_product.df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 117, "id": "2a918db1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
idresult
i64str
0"Sum: 11, Product: 10"
1"Sum: 22, Product: 40"
2"Sum: 33, Product: 90"
3"Sum: 44, Product: 160"
4"Sum: 55, Product: 250"
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬───────────────────────┐\n", + "│ id ┆ result │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═════╪═══════════════════════╡\n", + "│ 0 ┆ Sum: 11, Product: 10 │\n", + "│ 1 ┆ Sum: 22, Product: 40 │\n", + "│ 2 ┆ Sum: 33, Product: 90 │\n", + "│ 3 ┆ Sum: 44, Product: 160 │\n", + "│ 4 ┆ Sum: 55, Product: 250 │\n", + "└─────┴───────────────────────┘" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline2.my_final_result.df" ] diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 26123f0..85e99f4 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -22,9 +22,12 @@ if TYPE_CHECKING: import pyarrow as pa import pyarrow.compute as pc + import polars as pl else: pa = LazyModule("pyarrow") pc = LazyModule("pyarrow.compute") + pl = LazyModule("polars") + # TODO: consider using this instead of making copy of dicts # from types import MappingProxyType @@ -222,7 +225,24 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: ... + ) -> "pa.Table": ... + + def as_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> "pl.DataFrame": + """ + Convert the entire stream to a Polars DataFrame. + """ + return pl.DataFrame( + self.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_content_hash=include_content_hash, + ) + ) def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: """ @@ -286,7 +306,7 @@ class ImmutableTableStream(ImmutableStream): def __init__( self, - table: pa.Table, + table: "pa.Table", tag_columns: Collection[str] = (), source_info: dict[str, str | None] | None = None, source: dp.Kernel | None = None, @@ -394,7 +414,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: + ) -> "pa.Table": """ Returns the underlying table representation of the stream. This is useful for converting the stream to a table format. @@ -581,7 +601,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: + ) -> "pa.Table": self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." @@ -670,7 +690,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: + ) -> "pa.Table": if self._cached_output_table is None: all_tags = [] all_packets = [] @@ -884,7 +904,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: + ) -> "pa.Table": if self._cached_output_table is None: all_tags = [] all_packets = [] @@ -984,7 +1004,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, - ) -> pa.Table: + ) -> "pa.Table": """ Returns the underlying table representation of the stream. This is useful for converting the stream to a table format. 
diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index e3fc5b9..ddcdccb 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1153,6 +1153,17 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: """ ... + def as_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_content_hash: bool | str = False, + ) -> "pl.DataFrame": + """ + Convert the entire stream to a Polars DataFrame. + """ + ... + def as_table( self, include_data_context: bool = False, From fb993b73dc1121d75dcce8150fb6cec8500511b0 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 4 Aug 2025 16:53:37 +0000 Subject: [PATCH 171/224] feat: add support for system tags and clean up constants --- src/orcapod/data/__init__.py | 2 +- src/orcapod/data/datagrams/arrow_datagram.py | 2 +- .../data/datagrams/arrow_tag_packet.py | 166 ++++++++++++- src/orcapod/data/datagrams/dict_datagram.py | 13 +- src/orcapod/data/datagrams/dict_tag_packet.py | 221 ++++++++++++++++- src/orcapod/data/operators/join.py | 11 +- src/orcapod/data/operators/mappers.py | 2 +- src/orcapod/data/pods.py | 2 +- src/orcapod/data/sources.py | 2 +- src/orcapod/data/streams.py | 66 ++++- src/orcapod/data/system_constants.py | 7 +- src/orcapod/pipeline/nodes.py | 2 +- src/orcapod/protocols/data_protocols.py | 226 +++++++++++++++++- 13 files changed, 689 insertions(+), 33 deletions(-) diff --git a/src/orcapod/data/__init__.py b/src/orcapod/data/__init__.py index eb005c1..24f5aab 100644 --- a/src/orcapod/data/__init__.py +++ b/src/orcapod/data/__init__.py @@ -1,5 +1,5 @@ from .trackers import DEFAULT_TRACKER_MANAGER -from .system_constants import orcapod_constants as constants +from .system_constants import constants __all__ = [ "DEFAULT_TRACKER_MANAGER", diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index a3a4ac8..1f94e2a 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -6,7 +6,7 @@ from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod.types import TypeSpec, typespec_utils from orcapod.types.core import DataValue from orcapod.utils import arrow_utils diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index c3a4f0a..adf7f44 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -5,8 +5,9 @@ import pyarrow as pa -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod import contexts +from orcapod.semantic_types import infer_schema_from_pylist_data from orcapod.types import TypeSpec from orcapod.types.core import DataValue @@ -48,11 +49,166 @@ def __init__( table=table, data_context=data_context, ) - extracted_system_tags = [ - c for c in self._data_table.column_names if c.startswith("_tag_") + extracted_system_tag_columns = [ + c + for c in self._data_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) ] - self._system_tag_table = self._data_table.select(extracted_system_tags) - self._data_table = self._data_table.drop_columns(extracted_system_tags) + self._system_tags_dict = ( + 
self._data_context.type_converter.arrow_table_to_python_dicts( + self._data_table.select(extracted_system_tag_columns) + )[0] + ) + self._system_tags_python_schema = infer_schema_from_pylist_data( + [self._system_tags_dict] + ) + self._system_tags_dict.update(system_tags or {}) + self._system_tags_table = ( + self._data_context.type_converter.python_dicts_to_arrow_table( + [self._system_tags_dict], python_schema=self._system_tags_python_schema + ) + ) + + self._data_table = self._data_table.drop_columns(extracted_system_tag_columns) + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> tuple[str, ...]: + keys = super().keys( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + keys += tuple(self._system_tags_dict.keys()) + return keys + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> dict[str, type]: + """Return copy of the Python schema.""" + schema = super().types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + schema.update(self._system_tags_python_schema) + return schema + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + return arrow_utils.join_arrow_schemas( + schema, self._system_tags_table.schema + ) + return schema + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> dict[str, DataValue]: + """ + Convert to dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + return_dict = super().as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + return_dict.update(self._system_tags_dict) + return return_dict + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> pa.Table: + table = super().as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + # add system_tags only for existing data columns + table = arrow_utils.hstack_tables(table, self._system_tags_table) + return table + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_system_tags: bool = False, + ) -> ArrowDatagram: + table = self.as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_system_tags=include_system_tags, + ) + return ArrowDatagram( + table, + data_context=self._data_context, + ) + + def system_tags(self) -> dict[str, str | None]: + """ + Return system tags for all keys. + + Returns: + Copy of the dictionary mapping field names to their source info + """ + return self._system_tags_dict.copy() + + # 8. Utility Operations + def copy(self, include_cache: bool = True) -> Self: + """Return a copy of the datagram.""" + new_tag = super().copy(include_cache=include_cache) + + new_tag._system_tags_dict = self._system_tags_dict.copy() + new_tag._system_tags_python_schema = self._system_tags_python_schema.copy() + new_tag._system_tags_table = self._system_tags_table + + return new_tag class ArrowPacket(ArrowDatagram): diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index f9f9cf0..30f0903 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -4,7 +4,7 @@ import pyarrow as pa -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.semantic_types import infer_schema_from_pylist_data @@ -126,6 +126,17 @@ def __init__( self._cached_data_arrow_schema: pa.Schema | None = None self._cached_meta_arrow_schema: pa.Schema | None = None + def _get_total_dict(self) -> dict[str, DataValue]: + """ + Return the total dictionary representation including meta and context. + + This is used for content hashing and exporting to Arrow. + """ + total_dict = dict(self._data) + total_dict.update(self._meta_data) + total_dict[constants.CONTEXT_KEY] = self._data_context + return total_dict + # 1. 
Core Properties (Identity & Structure) @property def meta_columns(self) -> tuple[str, ...]: diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index 775945a..eaef415 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -4,12 +4,12 @@ import pyarrow as pa -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod import contexts from orcapod.data.datagrams.dict_datagram import DictDatagram -from orcapod.types import TypeSpec from orcapod.types.core import DataValue from orcapod.utils import arrow_utils +from orcapod.semantic_types import infer_schema_from_pylist_data logger = logging.getLogger(__name__) @@ -22,6 +22,223 @@ class DictTag(DictDatagram): to different representations like Arrow tables. """ + def __init__( + self, + data: Mapping[str, DataValue], + system_tags: Mapping[str, DataValue] | None = None, + meta_info: Mapping[str, DataValue] | None = None, + python_schema: dict[str, type] | None = None, + data_context: str | contexts.DataContext | None = None, + ) -> None: + """ + Initialize the tag with data. + + Args: + data: Dictionary containing tag data + """ + # normalize the data content and remove any source info keys + data_only = { + k: v + for k, v in data.items() + if not k.startswith(constants.SYSTEM_TAG_PREFIX) + } + extracted_system_tags = { + k: v for k, v in data.items() if k.startswith(constants.SYSTEM_TAG_PREFIX) + } + + super().__init__( + data_only, + python_schema=python_schema, + meta_info=meta_info, + data_context=data_context, + ) + + self._system_tags = {**extracted_system_tags, **(system_tags or {})} + self._system_tags_python_schema: dict[str, type] = ( + infer_schema_from_pylist_data([self._system_tags]) + ) + self._cached_system_tags_table: pa.Table | None = None + self._cached_system_tags_schema: pa.Schema | None = None + + def _get_total_dict(self) -> dict[str, DataValue]: + """Return the total dictionary representation including system tags.""" + total_dict = super()._get_total_dict() + total_dict.update(self._system_tags) + return total_dict + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> pa.Table: + """Convert the packet to an Arrow table.""" + table = super().as_table( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + + if include_all_info or include_system_tags: + if self._cached_system_tags_table is None: + self._cached_system_tags_table = ( + self._data_context.type_converter.python_dicts_to_arrow_table( + [self._system_tags], + python_schema=self._system_tags_python_schema, + ) + ) + table = arrow_utils.hstack_tables(table, self._cached_system_tags_table) + return table + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> dict[str, DataValue]: + """ + Return dictionary representation. 
+ + Args: + include_source: Whether to include source info fields + + Returns: + Dictionary representation of the packet + """ + dict_copy = super().as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + dict_copy.update(self._system_tags) + return dict_copy + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> tuple[str, ...]: + """Return keys of the Python schema.""" + keys = super().keys( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + keys += tuple(self._system_tags.keys()) + return keys + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> dict[str, type]: + """Return copy of the Python schema.""" + schema = super().types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + schema.update(self._system_tags_python_schema) + return schema + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> pa.Schema: + """ + Return the PyArrow schema for this datagram. + + Args: + include_data_context: Whether to include data context column in the schema + include_source: Whether to include source info columns in the schema + + Returns: + PyArrow schema representing the datagram's structure + """ + schema = super().arrow_schema( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_context=include_context, + ) + if include_all_info or include_system_tags: + if self._cached_system_tags_schema is None: + self._cached_system_tags_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + self._system_tags_python_schema + ) + ) + return arrow_utils.join_arrow_schemas( + schema, self._cached_system_tags_schema + ) + return schema + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_system_tags: bool = False, + ) -> DictDatagram: + """ + Convert the packet to a DictDatagram. + + Args: + include_source: Whether to include source info fields + + Returns: + DictDatagram representation of the packet + """ + + data = self.as_dict( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_system_tags=include_system_tags, + ) + python_schema = self.types( + include_all_info=include_all_info, + include_meta_columns=include_meta_columns, + include_system_tags=include_system_tags, + ) + return DictDatagram( + data, + python_schema=python_schema, + data_context=self._data_context, + ) + + def system_tags(self) -> dict[str, DataValue]: + """ + Return source information for all keys. 
+ + Returns: + Dictionary mapping field names to their source info + """ + return dict(self._system_tags) + + def copy(self, include_cache: bool = True) -> Self: + """Return a shallow copy of the packet.""" + instance = super().copy(include_cache=include_cache) + instance._system_tags = self._system_tags.copy() + if include_cache: + instance._cached_system_tags_table = self._cached_system_tags_table + instance._cached_system_tags_schema = self._cached_system_tags_schema + + else: + instance._cached_system_tags_table = None + instance._cached_system_tags_schema = None + + return instance + class DictPacket(DictDatagram): """ diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py index a8ab492..a101f3d 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/data/operators/join.py @@ -1,14 +1,11 @@ -from orcapod.data.kernels import TrackedKernelBase from orcapod.protocols import data_protocols as dp from orcapod.data.streams import ImmutableTableStream from orcapod.types import TypeSpec from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs -from abc import abstractmethod from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from collections.abc import Collection, Mapping +from collections.abc import Collection from orcapod.errors import InputValidationError -from orcapod.data.system_constants import orcapod_constants as constants from orcapod.data.operators.base import NonZeroInputOperator if TYPE_CHECKING: @@ -66,13 +63,15 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream: stream = streams[0] tag_keys, _ = [set(k) for k in stream.keys()] - table = stream.as_table(include_source=True) + table = stream.as_table(include_source=True, include_system_tags=True) # trick to get cartesian product table = table.add_column(0, COMMON_JOIN_KEY, pa.array([0] * len(table))) for next_stream in streams[1:]: next_tag_keys, _ = next_stream.keys() - next_table = next_stream.as_table(include_source=True) + next_table = next_stream.as_table( + include_source=True, include_system_tags=True + ) next_table = next_table.add_column( 0, COMMON_JOIN_KEY, pa.array([0] * len(next_table)) ) diff --git a/src/orcapod/data/operators/mappers.py b/src/orcapod/data/operators/mappers.py index 0c8603e..f27cca8 100644 --- a/src/orcapod/data/operators/mappers.py +++ b/src/orcapod/data/operators/mappers.py @@ -5,7 +5,7 @@ from orcapod.utils.lazy_module import LazyModule from collections.abc import Mapping from orcapod.errors import InputValidationError -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod.data.operators.base import UnaryOperator if TYPE_CHECKING: diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 5092cda..794258b 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -14,7 +14,7 @@ from orcapod.data.kernels import KernelStream, TrackedKernelBase from orcapod.data.operators import Join from orcapod.data.streams import LazyPodResultStream, EfficientPodResultStream -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index 0d3dda7..c31971d 100644 --- a/src/orcapod/data/sources.py +++ 
b/src/orcapod/data/sources.py @@ -18,7 +18,7 @@ from orcapod.types import DataValue, TypeSpec, typespec_utils from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants if TYPE_CHECKING: import pandas as pd diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 85e99f4..8c342fb 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -13,7 +13,7 @@ ArrowTag, DictTag, ) -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod.protocols import data_protocols as dp from orcapod.types import TypeSpec from orcapod.utils import arrow_utils @@ -224,6 +224,7 @@ def as_table( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, ) -> "pa.Table": ... @@ -231,6 +232,7 @@ def as_df( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, ) -> "pl.DataFrame": """ @@ -240,6 +242,7 @@ def as_df( self.as_table( include_data_context=include_data_context, include_source=include_source, + include_system_tags=include_system_tags, include_content_hash=include_content_hash, ) ) @@ -333,16 +336,22 @@ def __init__( # determine tag columns first and then exclude any source info self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) + self._system_tag_columns = tuple( + c for c in table.column_names if c.startswith(constants.SYSTEM_TAG_PREFIX) + ) + self._all_tag_columns = self._tag_columns + self._system_tag_columns if delta := set(tag_columns) - set(self._tag_columns): raise ValueError( f"Specified tag columns {delta} are not present in the table." ) table, prefix_tables = arrow_utils.prepare_prefixed_columns( - table, prefix_info, exclude_columns=self._tag_columns + table, + prefix_info, + exclude_columns=self._all_tag_columns, ) # now table should only contain tag columns and packet columns self._packet_columns = tuple( - c for c in table.column_names if c not in tag_columns + c for c in table.column_names if c not in self._all_tag_columns ) self._table = table self._source_info_table = prefix_tables[constants.SOURCE_PREFIX] @@ -356,11 +365,17 @@ def __init__( tag_schema = pa.schema( f for f in self._table.schema if f.name in self._tag_columns ) + system_tag_schema = pa.schema( + f for f in self._table.schema if f.name in self._system_tag_columns + ) + all_tag_schema = arrow_utils.join_arrow_schemas(tag_schema, system_tag_schema) packet_schema = pa.schema( f for f in self._table.schema if f.name in self._packet_columns ) self._tag_schema = tag_schema + self._system_tag_schema = system_tag_schema + self._all_tag_schema = all_tag_schema self._packet_schema = packet_schema # self._tag_converter = SemanticConverter.from_semantic_schema( # schemas.SemanticSchema.from_arrow_schema( @@ -382,7 +397,9 @@ def data_content_identity_structure(self) -> Any: This is used to identify the content of the stream. 
""" table_hash = self._data_context.arrow_hasher.hash_table( - self.as_table(include_data_context=True, include_source=True), + self.as_table( + include_data_context=True, include_source=True, include_system_tags=True + ), ) return ( self.__class__.__name__, @@ -413,6 +430,7 @@ def as_table( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, ) -> "pa.Table": """ @@ -432,11 +450,14 @@ def as_table( output_table = output_table.append_column( hash_column_name, pa.array(content_hashes, type=pa.large_string()) ) + if not include_system_tags: + output_table = output_table.drop_columns(self._system_tag_columns) table_stack = (output_table,) if include_data_context: table_stack += (self._data_context_table,) if include_source: table_stack += (self._source_info_table,) + return arrow_utils.hstack_tables(*table_stack) def clear_cache(self) -> None: @@ -454,9 +475,9 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: # TODO: make it work with table batch stream if self._cached_elements is None: self._cached_elements = [] - tag_present = len(self._tag_columns) > 0 + tag_present = len(self._all_tag_columns) > 0 if tag_present: - tags = self._table.select(self._tag_columns) + tags = self._table.select(self._all_tag_columns) tag_batches = tags.to_batches() else: tag_batches = repeat(DictTag({})) @@ -600,6 +621,7 @@ def as_table( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, ) -> "pa.Table": self.refresh() @@ -609,6 +631,7 @@ def as_table( return self._cached_stream.as_table( include_data_context=include_data_context, include_source=include_source, + include_system_tags=include_system_tags, include_content_hash=include_content_hash, ) @@ -689,6 +712,7 @@ def as_table( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, ) -> "pa.Table": if self._cached_output_table is None: @@ -697,13 +721,13 @@ def as_table( tag_schema, packet_schema = None, None for tag, packet in self.iter_packets(): if tag_schema is None: - tag_schema = tag.arrow_schema() + tag_schema = tag.arrow_schema(include_system_tags=True) if packet_schema is None: packet_schema = packet.arrow_schema( include_context=True, include_source=True, ) - all_tags.append(tag.as_dict()) + all_tags.append(tag.as_dict(include_system_tags=True)) # FIXME: using in the pinch conversion to str from path # replace with an appropriate semantic converter-based approach! 
dict_patcket = packet.as_dict(include_context=True, include_source=True) @@ -729,6 +753,15 @@ def as_table( ) drop_columns = [] + if not include_system_tags: + # TODO: get system tags more effiicently + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) if not include_source: drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) if not include_data_context: @@ -740,6 +773,7 @@ def as_table( if include_content_hash: if self._cached_content_hash_column is None: content_hashes = [] + # TODO: verify that order will be preserved for tag, packet in self.iter_packets(): content_hashes.append(packet.content_hash()) self._cached_content_hash_column = pa.array( @@ -903,6 +937,7 @@ def as_table( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, ) -> "pa.Table": if self._cached_output_table is None: @@ -911,13 +946,13 @@ def as_table( tag_schema, packet_schema = None, None for tag, packet in self.iter_packets(): if tag_schema is None: - tag_schema = tag.arrow_schema() + tag_schema = tag.arrow_schema(include_system_tags=True) if packet_schema is None: packet_schema = packet.arrow_schema( include_context=True, include_source=True, ) - all_tags.append(tag.as_dict()) + all_tags.append(tag.as_dict(include_system_tags=True)) # FIXME: using in the pinch conversion to str from path # replace with an appropriate semantic converter-based approach! dict_patcket = packet.as_dict(include_context=True, include_source=True) @@ -947,8 +982,17 @@ def as_table( drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) if not include_data_context: drop_columns.append(constants.CONTEXT_KEY) + if not include_system_tags: + # TODO: come up with a more efficient approach + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) - output_table = self._cached_output_table.drop(drop_columns) + output_table = self._cached_output_table.drop_columns(drop_columns) # lazily prepare content hash column if requested if include_content_hash: diff --git a/src/orcapod/data/system_constants.py b/src/orcapod/data/system_constants.py index 325bf83..86a9461 100644 --- a/src/orcapod/data/system_constants.py +++ b/src/orcapod/data/system_constants.py @@ -6,6 +6,7 @@ DATA_CONTEXT_KEY = "context_key" INPUT_PACKET_HASH = "input_packet_hash" PACKET_RECORD_ID = "packet_id" +SYSTEM_TAG_PREFIX = "system_tag_" class SystemConstant: @@ -40,5 +41,9 @@ def INPUT_PACKET_HASH(self) -> str: def PACKET_RECORD_ID(self) -> str: return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{PACKET_RECORD_ID}" + @property + def SYSTEM_TAG_PREFIX(self) -> str: + return f"{self._global_prefix}{DATAGRAM_PREFIX}{SYSTEM_TAG_PREFIX}" + -orcapod_constants = SystemConstant() +constants = SystemConstant() diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index c587380..22abe80 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -5,7 +5,7 @@ from orcapod.types import TypeSpec from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING, Any -from orcapod.data.system_constants import orcapod_constants as constants +from orcapod.data.system_constants import constants from orcapod.utils import arrow_utils from collections.abc import Collection diff --git a/src/orcapod/protocols/data_protocols.py 
b/src/orcapod/protocols/data_protocols.py index ddcdccb..94f40a8 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -634,7 +634,229 @@ class Tag(Datagram, Protocol): - Quality indicators or confidence scores """ - pass + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> tuple[str, ...]: + """ + Return tuple of column names. + + Provides access to column names with filtering options for different + column types. Default returns only data column names. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + include_source: Whether to include source info fields. + + + Returns: + Tuple of column names based on inclusion criteria. + + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') + """ + ... + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> TypeSpec: + """ + Return type specification mapping field names to Python types. + + The TypeSpec enables type checking and validation throughout the system. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. + include_source: Whether to include source info fields. + + Returns: + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } + """ + ... + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> "pa.Schema": + """ + Return PyArrow schema representation. + + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
+ include_context: Whether to include context column. + include_source: Whether to include source info fields. + + + Returns: + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] + """ + ... + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> dict[str, DataValue]: + """ + Convert datagram to dictionary format. + + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context key. + include_source: Whether to include source info fields. + + + Returns: + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) + """ + ... + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> "pa.Table": + """ + Convert datagram to PyArrow Table format. + + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_source: Whether to include source info columns in the schema. + + Returns: + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" + """ + ... + + # TODO: add this back + # def as_arrow_compatible_dict( + # self, + # include_all_info: bool = False, + # include_meta_columns: bool | Collection[str] = False, + # include_context: bool = False, + # include_source: bool = False, + # ) -> dict[str, Any]: + # """Extended version with source info support.""" + # ... + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_system_tags: bool = False, + ) -> Datagram: + """ + Convert the packet to a Datagram. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
+ + Returns: + Datagram: Datagram representation of packet data + """ + ... + + def system_tags(self) -> dict[str, DataValue]: + """ + Return metadata about the packet's source/origin. + + Provides debugging and lineage information about where the packet + originated. May include information like: + - File paths for file-based sources + - Database connection strings + - API endpoints + - Processing pipeline information + + Returns: + dict[str, str | None]: Source information for each data column as key-value pairs. + """ + ... class Packet(Datagram, Protocol): @@ -1158,6 +1380,7 @@ def as_df( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, + include_system_tags: bool = False, ) -> "pl.DataFrame": """ Convert the entire stream to a Polars DataFrame. @@ -1169,6 +1392,7 @@ def as_table( include_data_context: bool = False, include_source: bool = False, include_content_hash: bool | str = False, + include_system_tags: bool = False, ) -> "pa.Table": """ Convert the entire stream to a PyArrow Table. From cf535f92bbb30c0555f2048a2ba658b2e941281d Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 5 Aug 2025 00:17:30 +0000 Subject: [PATCH 172/224] fix: missing cache attribute in pod stream --- src/orcapod/data/base.py | 8 ++++---- src/orcapod/data/pods.py | 1 - src/orcapod/data/streams.py | 5 +++-- src/orcapod/protocols/data_protocols.py | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index e0e254e..efdb49b 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -35,7 +35,7 @@ def __init__( """ self._label = label self._data_context = contexts.resolve_context(data_context) - self._content_hash: str | None = None + self._content_hash: bytes | None = None self._int_hash: int | None = None @property @@ -89,7 +89,7 @@ def identity_structure(self) -> Any: # TODO: come up with a way to signify non-determinate identity structure return None - def content_hash(self) -> str: + def content_hash(self) -> bytes: """ Compute a hash based on the content of this object. @@ -100,8 +100,8 @@ def content_hash(self) -> str: if self._content_hash is None: structure = self.identity_structure() processed_structure = process_structure(structure) - self._content_hash = self._data_context.object_hasher.hash_to_hex( - processed_structure, prefix_hasher_id=True + self._content_hash = self._data_context.object_hasher.hash( + processed_structure ) return self._content_hash diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 794258b..3a653ed 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -5,7 +5,6 @@ from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Literal -from numpy import record from orcapod import contexts from orcapod.data.datagrams import ( ArrowPacket, diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 8c342fb..7bbc926 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -111,7 +111,7 @@ def substream_identities(self) -> tuple[str, ...]: Returns the identities of the substreams that this stream is composed of. This is used to identify the substreams in the computational graph. 
""" - return (self.content_hash(),) + return (self.content_hash().hex(),) def get_substream(self, substream_id: str) -> dp.Stream: """ @@ -234,7 +234,7 @@ def as_df( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, - ) -> "pl.DataFrame": + ) -> "pl.DataFrame | None": """ Convert the entire stream to a Polars DataFrame. """ @@ -812,6 +812,7 @@ def __init__(self, pod: dp.CachedPod, input_stream: dp.Stream, **kwargs): # Packet-level caching (from your PodStream) self._cached_output_packets: list[tuple[dp.Tag, dp.Packet | None]] | None = None self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: """ diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 94f40a8..c64e817 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1379,9 +1379,9 @@ def as_df( self, include_data_context: bool = False, include_source: bool = False, - include_content_hash: bool | str = False, include_system_tags: bool = False, - ) -> "pl.DataFrame": + include_content_hash: bool | str = False, + ) -> "pl.DataFrame | None": """ Convert the entire stream to a Polars DataFrame. """ @@ -1391,8 +1391,8 @@ def as_table( self, include_data_context: bool = False, include_source: bool = False, - include_content_hash: bool | str = False, include_system_tags: bool = False, + include_content_hash: bool | str = False, ) -> "pa.Table": """ Convert the entire stream to a PyArrow Table. From 7ff856e05a155d1911f635a3f6453ec64187198c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 5 Aug 2025 07:27:39 +0000 Subject: [PATCH 173/224] feat: add execution engine capability --- ...ipynb => 01_introduction_to_orcapod.ipynb} | 419 +++++-- .../01_quick_dive_into_orcapod.ipynb | 1056 ----------------- .../02_parallel_execution_on_ray.ipynb | 129 ++ .../data/datagrams/arrow_tag_packet.py | 4 +- src/orcapod/data/kernels.py | 2 +- src/orcapod/data/pods.py | 168 ++- src/orcapod/data/streams.py | 305 ++++- src/orcapod/execution_engines/__init__.py | 1 + .../execution_engines/ray_execution_engine.py | 119 ++ src/orcapod/hashing/content_identifiable.py | 662 +++++------ src/orcapod/hashing/types.py | 256 ++-- src/orcapod/pipeline/nodes.py | 38 + src/orcapod/protocols/data_protocols.py | 110 +- 13 files changed, 1596 insertions(+), 1673 deletions(-) rename notebooks/tutorials/{01_orcapod_quick_exploration.ipynb => 01_introduction_to_orcapod.ipynb} (85%) delete mode 100644 notebooks/tutorials/01_quick_dive_into_orcapod.ipynb create mode 100644 notebooks/tutorials/02_parallel_execution_on_ray.ipynb create mode 100644 src/orcapod/execution_engines/__init__.py create mode 100644 src/orcapod/execution_engines/ray_execution_engine.py diff --git a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb b/notebooks/tutorials/01_introduction_to_orcapod.ipynb similarity index 85% rename from notebooks/tutorials/01_orcapod_quick_exploration.ipynb rename to notebooks/tutorials/01_introduction_to_orcapod.ipynb index 16169a9..1938f8f 100644 --- a/notebooks/tutorials/01_orcapod_quick_exploration.ipynb +++ b/notebooks/tutorials/01_introduction_to_orcapod.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 71, + "execution_count": 1, "id": "27cdd37d", "metadata": {}, "outputs": [], @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 2, 
"id": "e6a9e8b6", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 3, "id": "420477e8", "metadata": {}, "outputs": [], @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 4, "id": "dab6bf9c", "metadata": {}, "outputs": [], @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 5, "id": "cd0394d8", "metadata": {}, "outputs": [], @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 6, "id": "2d4a0812", "metadata": {}, "outputs": [ @@ -153,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 7, "id": "79e67bfc", "metadata": {}, "outputs": [ @@ -165,7 +165,7 @@ " ({'a': 3, 'b': 'z'}, {'c': True, 'd': 3.3})]" ] }, - "execution_count": 77, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -184,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 8, "id": "52baee9c", "metadata": {}, "outputs": [ @@ -213,7 +213,7 @@ "└─────┴─────┴───────┴─────┘" ] }, - "execution_count": 78, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 9, "id": "4648fbe9", "metadata": {}, "outputs": [ @@ -269,7 +269,7 @@ "└─────┴─────┴───────┴─────┴───────────┴───────────┘" ] }, - "execution_count": 79, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -288,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 10, "id": "001b2a9c", "metadata": {}, "outputs": [ @@ -317,7 +317,7 @@ "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" ] }, - "execution_count": 80, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -336,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 11, "id": "d3b9e394", "metadata": {}, "outputs": [ @@ -365,7 +365,7 @@ "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" ] }, - "execution_count": 81, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -384,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 12, "id": "92cbfa50", "metadata": {}, "outputs": [ @@ -413,7 +413,7 @@ "└─────┴─────┴───────┴─────┴──────────────┘" ] }, - "execution_count": 83, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -432,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 13, "id": "bf6acd59", "metadata": {}, "outputs": [ @@ -453,7 +453,7 @@ "_context_key: [[null,null,null]]" ] }, - "execution_count": 84, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -478,9 +478,57 @@ "The tags and packets returned by the streams can be thought of as special dictionary." ] }, + { + "cell_type": "markdown", + "id": "995161fc", + "metadata": {}, + "source": [ + "Let's work again with the stream we created earlier with the following content" + ] + }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, + "id": "68bff9fb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 4)
abcd
i64strboolf64
1"x"true1.1
2"y"false2.2
3"z"true3.3
" + ], + "text/plain": [ + "shape: (3, 4)\n", + "┌─────┬─────┬───────┬─────┐\n", + "│ a ┆ b ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ bool ┆ f64 │\n", + "╞═════╪═════╪═══════╪═════╡\n", + "│ 1 ┆ x ┆ true ┆ 1.1 │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 │\n", + "└─────┴─────┴───────┴─────┘" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream.as_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "id": "c78096a7", "metadata": {}, "outputs": [], @@ -490,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "id": "6f8a2f0b", "metadata": {}, "outputs": [], @@ -500,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "id": "e1ac13b1", "metadata": {}, "outputs": [ @@ -510,7 +558,7 @@ "{'a': 1, 'b': 'x'}" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -521,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "id": "263fa1c5", "metadata": {}, "outputs": [ @@ -531,7 +579,7 @@ "{'c': True, 'd': 1.1}" ] }, - "execution_count": 16, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -550,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "id": "42158816", "metadata": {}, "outputs": [ @@ -560,7 +608,7 @@ "1" ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -571,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "id": "6a792175", "metadata": {}, "outputs": [ @@ -581,7 +629,7 @@ "'x'" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -592,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "id": "a28f2051", "metadata": {}, "outputs": [ @@ -602,7 +650,7 @@ "True" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -613,7 +661,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "id": "981e6c44", "metadata": {}, "outputs": [ @@ -623,7 +671,7 @@ "1.1" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -642,7 +690,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "id": "56423d2c", "metadata": {}, "outputs": [ @@ -652,7 +700,7 @@ "{'c': bool, 'd': float}" ] }, - "execution_count": 21, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -664,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "id": "d5e02f81", "metadata": {}, "outputs": [ @@ -674,7 +722,7 @@ "('c', 'd')" ] }, - "execution_count": 22, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -694,7 +742,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "id": "b1b18ee4", "metadata": {}, "outputs": [ @@ -709,7 +757,7 @@ "d: [[1.1]]" ] }, - "execution_count": 23, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "id": "3aa4020e", "metadata": {}, "outputs": [ @@ -739,7 +787,7 @@ "d: double" ] }, - "execution_count": 24, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -758,7 +806,7 @@ }, { 
"cell_type": "code", - "execution_count": 25, + "execution_count": 27, "id": "bea6c771", "metadata": {}, "outputs": [ @@ -768,7 +816,7 @@ "{'a': 1, 'b': 'x'}" ] }, - "execution_count": 25, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -787,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "id": "92f00feb", "metadata": {}, "outputs": [ @@ -797,7 +845,7 @@ "{'c': None, 'd': None}" ] }, - "execution_count": 26, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -816,7 +864,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "id": "bba2bc5c", "metadata": {}, "outputs": [ @@ -826,7 +874,7 @@ "{'c': True, 'd': 1.1, '_source_c': None, '_source_d': None}" ] }, - "execution_count": 27, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -837,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "id": "bd09d9d1", "metadata": {}, "outputs": [ @@ -856,7 +904,7 @@ "_source_d: [[null]]" ] }, - "execution_count": 28, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -875,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "id": "03219fd3", "metadata": {}, "outputs": [ @@ -885,7 +933,7 @@ "'arrow_v0.1@6e1143896d73d370757811b52ceeea8d1d456cd30206416fbf81754e1cea5568'" ] }, - "execution_count": 29, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -920,7 +968,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "id": "11ee5130", "metadata": {}, "outputs": [], @@ -945,6 +993,86 @@ "stream2 = op.streams.ImmutableTableStream(table2, tag_columns=[\"id\"])" ] }, + { + "cell_type": "code", + "execution_count": 33, + "id": "73b75816", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "[polars HTML table output elided; identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (3, 3)\n", + "┌─────┬─────┬─────┐\n", + "│ id ┆ a ┆ b │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ str │\n", + "╞═════╪═════╪═════╡\n", + "│ 0 ┆ 1 ┆ x │\n", + "│ 1 ┆ 2 ┆ y │\n", + "│ 4 ┆ 3 ┆ z │\n", + "└─────┴─────┴─────┘" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream1.as_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "519754a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "[polars HTML table output elided; identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (3, 3)\n", + "┌─────┬───────┬─────┐\n", + "│ id ┆ c ┆ d │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ bool ┆ f64 │\n", + "╞═════╪═══════╪═════╡\n", + "│ 0 ┆ true ┆ 1.1 │\n", + "│ 1 ┆ false ┆ 2.2 │\n", + "│ 2 ┆ true ┆ 3.3 │\n", + "└─────┴───────┴─────┘" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stream2.as_df()" + ] + }, { "cell_type": "markdown", "id": "6f87fcf3", @@ -955,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 35, "id": "8299d4b1", "metadata": {}, "outputs": [], @@ -965,7 +1093,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 36, "id": "dfc7ee9f", "metadata": {}, "outputs": [], @@ -991,7 +1119,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 37, "id": "48ef0a8f", "metadata": {}, "outputs": [ @@ -1019,7 +1147,7 @@ "└─────┴─────┴─────┴───────┴─────┘" ] }, - "execution_count": 85, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1046,7 +1174,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 38, "id": "fbc58246", "metadata": {}, "outputs": [ @@ -1074,7 +1202,7 @@ "└─────┴─────┴─────┴───────┴─────┘" ] }, - "execution_count": 90, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1085,7 +1213,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 39, "id": "c6b0b571", "metadata": {}, "outputs": [ @@ -1113,7 +1241,7 @@ "└─────┴─────┴─────┘" ] }, - "execution_count": 91, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1124,7 +1252,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 40, "id": "5be42490", "metadata": {}, "outputs": [ @@ -1153,7 +1281,7 @@ "└─────┴──────────┴─────┘" ] }, - "execution_count": 92, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1164,7 +1292,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 41, "id": "c9c98304", "metadata": {}, "outputs": [ @@ -1193,7 +1321,7 @@ "└──────┴─────┴─────┘" ] }, - "execution_count": 93, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1220,7 +1348,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 42, "id": "35423d9a", "metadata": {}, "outputs": [], @@ -1249,7 +1377,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 43, "id": "119d33a3", "metadata": {}, "outputs": [], @@ -1267,7 +1395,49 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 44, + "id": "e3b60eca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "[polars HTML table output elided; identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────┬─────┬─────┐\n", + "│ id ┆ a ┆ b │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 │\n", + "╞═════╪═════╪═════╡\n", + "│ 0 ┆ 1 ┆ 10 │\n", + "│ 1 ┆ 2 ┆ 20 │\n", + "│ 2 ┆ 3 ┆ 30 │\n", + "│ 3 ┆ 4 ┆ 40 │\n", + "│ 4 ┆ 5 ┆ 50 │\n", + "└─────┴─────┴─────┘" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_stream.as_df()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "id": "2b3b42ff", "metadata": {}, "outputs": [], @@ -1286,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 46, "id": "ff05a8fc", "metadata": {}, "outputs": [ @@ -1296,7 +1466,7 @@ "KernelStream(kernel=FunctionPod:add_numbers(a: int, b: int)-> , upstreams=(ImmutableTableStream(table=['id', 'a', 'b'], tag_columns=('id',)),))" ] }, - "execution_count": 97, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -1307,33 +1477,62 @@ }, { "cell_type": "code", - "execution_count": 98, - "id": "6431180f", + "execution_count": 47, + "id": "35107c18", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tag: {'id': 0}, Packet: {'sum': 11}\n", - "Tag: {'id': 1}, Packet: {'sum': 22}\n", - "Tag: {'id': 2}, Packet: {'sum': 33}\n", - "Tag: {'id': 3}, Packet: {'sum': 44}\n", - "Tag: {'id': 4}, Packet: {'sum': 55}\n" - ] + "data": { + "text/html": [ + "
\n", + "[polars HTML table output elided; identical to the text/plain rendering below]
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "for t, p in output_stream:\n", - " print(f\"Tag: {t}, Packet: {p}\")" + "output_stream.as_df() # this triggers the computation!" ] }, { "cell_type": "markdown", - "id": "6ff00efa", + "id": "ab5d3ef0", "metadata": {}, "source": [ - "Simple, right?" + "If you prefer, you can explicitly trigger the comptuation by calling `run` method. In a future tutorial, we will see how we can combine `run` with execution engine to achieve asynchronous computations!" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "6431180f", + "metadata": {}, + "outputs": [], + "source": [ + "output_stream.run()" ] }, { @@ -1354,7 +1553,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 49, "id": "9b7fcbbf", "metadata": {}, "outputs": [], @@ -1373,7 +1572,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 50, "id": "3e496c59", "metadata": {}, "outputs": [], @@ -1384,7 +1583,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 51, "id": "23c0fa92", "metadata": {}, "outputs": [ @@ -1398,7 +1597,7 @@ " ({'id': 4}, {'stats': {'sum': 55, 'difference': -45, 'product': 250}})]" ] }, - "execution_count": 101, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -1409,7 +1608,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 52, "id": "bba7f8d3", "metadata": {}, "outputs": [ @@ -1428,7 +1627,7 @@ " {'message': 'Hi! 
The sum was 55, the difference was -45, and the product was 250.'})]" ] }, - "execution_count": 102, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1455,7 +1654,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 53, "id": "cb4bc91a", "metadata": {}, "outputs": [], @@ -1475,7 +1674,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 54, "id": "f371822b", "metadata": {}, "outputs": [], @@ -1500,7 +1699,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 55, "id": "e132fc93", "metadata": {}, "outputs": [ @@ -1512,8 +1711,8 @@ "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", + "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", "Kernel PodNode:FunctionPod:combine_results(sum: int, product: int)-> is acting as a source!\n" ] } @@ -1536,7 +1735,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 56, "id": "cca9e0d0", "metadata": {}, "outputs": [ @@ -1546,7 +1745,7 @@ "PodNode(pod=FunctionPod:add_numbers)" ] }, - "execution_count": 106, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1573,7 +1772,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 57, "id": "21086f72", "metadata": {}, "outputs": [], @@ -1591,7 +1790,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 58, "id": "1e741659", "metadata": {}, "outputs": [], @@ -1617,7 +1816,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 59, "id": "c77154ec", "metadata": {}, "outputs": [ @@ -1648,7 +1847,7 @@ "└─────┴─────┘" ] }, - "execution_count": 109, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -1685,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 60, "id": "37e65e33", "metadata": {}, "outputs": [], @@ -1697,7 +1896,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 61, "id": "3bad8332", "metadata": {}, "outputs": [ @@ -1727,7 +1926,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 62, "id": "8f146ae7", "metadata": {}, "outputs": [ @@ -1758,7 +1957,7 @@ "└─────┴─────┘" ] }, - "execution_count": 115, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -1769,7 +1968,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 63, "id": "8fd7bf4e", "metadata": {}, "outputs": [ @@ -1800,7 +1999,7 @@ "└─────┴─────────┘" ] }, - "execution_count": 116, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -1811,7 +2010,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 64, "id": "2a918db1", "metadata": {}, "outputs": [ @@ -1842,7 +2041,7 @@ "└─────┴───────────────────────┘" ] }, - "execution_count": 117, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -1876,7 +2075,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.3" + "version": "3.11.12" } }, "nbformat": 4, diff --git 
a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb b/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb deleted file mode 100644 index 5b44850..0000000 --- a/notebooks/tutorials/01_quick_dive_into_orcapod.ipynb +++ /dev/null @@ -1,1056 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "27cdd37d", - "metadata": {}, - "outputs": [], - "source": [ - "import orcapod as op" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "df42576d", - "metadata": {}, - "outputs": [], - "source": [ - "from orcapod.data.datagrams import DictDatagram, ArrowDatagram, DictPacket, ArrowPacket" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c1dfc4d1", - "metadata": {}, - "outputs": [], - "source": [ - "data = {\n", - " \"name\": \"orcapod\",\n", - " \"__something\": \"there\",\n", - " \"_another_kind\": 5,\n", - " \"value\": 42,\n", - " \"_source_value\": \"Japan\",\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "45e2f6e2", - "metadata": {}, - "outputs": [], - "source": [ - "dict_datagram = DictDatagram(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "01b060b6", - "metadata": {}, - "outputs": [], - "source": [ - "table = dict_datagram.as_table(include_all_info=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5e8a867e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'name': 'orcapod'}\n", - "{'_another_kind': 5, 'value': 42}\n" - ] - } - ], - "source": [ - "stream = op.streams.ImmutableTableStream(table, tag_columns=[\"name\"])\n", - "\n", - "for t, p in stream:\n", - " print(t)\n", - " print(p)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "10f029b5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_another_kind': None, 'value': 'Japan'}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p.source_info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0f9edd9", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'stream' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mstream\u001b[49m.as_table(include_source=\u001b[38;5;28;01mTrue\u001b[39;00m)\n", - "\u001b[31mNameError\u001b[39m: name 'stream' is not defined" - ] - } - ], - "source": [ - "stream.as_table(include_source=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "9bc4346b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_another_kind': 5,\n", - " 'value': 42,\n", - " '_context_key': 'std:v0.1.0:default',\n", - " '__something': 'there',\n", - " '_source__another_kind': None,\n", - " '_source_value': None}" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p.as_dict(include_all_info=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "ffd88de9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ArrowPacket(data={'_another_kind': 5, 'value': 42}, meta_columns=1, context='std:v0.1.0:default')" - ] - }, 
- "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "93b7638a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_another_kind': None, 'value': None}" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p.source_info()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9cd4692c", - "metadata": {}, - "outputs": [], - "source": [ - "N = 10\n", - "stream = op.SyncStreamFromLists(\n", - " tags=[{\"id\": i} for i in range(N)],\n", - " packets=[{\"x\": i, \"y\": i + 1} for i in range(N)],\n", - " tag_typespec={\"id\": int},\n", - " packet_typespec={\"x\": int, \"y\": int},\n", - " label=\"MySource\",\n", - ")\n", - "\n", - "word_stream = op.SyncStreamFromLists(\n", - " tags=[{\"id\": i} for i in range(N)],\n", - " packets=[{\"word1\": f\"hello {i}\", \"word2\": f\"world {i}\"} for i in range(N)],\n", - " tag_typespec={\"id\": int},\n", - " packet_typespec={\"word1\": str, \"word2\": str},\n", - " label=\"HelloWorld\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78ab941b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'id': 0} {'x': 0, 'y': 1}\n", - "{'id': 1} {'x': 1, 'y': 2}\n", - "{'id': 2} {'x': 2, 'y': 3}\n", - "{'id': 3} {'x': 3, 'y': 4}\n", - "{'id': 4} {'x': 4, 'y': 5}\n", - "{'id': 5} {'x': 5, 'y': 6}\n", - "{'id': 6} {'x': 6, 'y': 7}\n", - "{'id': 7} {'x': 7, 'y': 8}\n", - "{'id': 8} {'x': 8, 'y': 9}\n", - "{'id': 9} {'x': 9, 'y': 10}\n" - ] - } - ], - "source": [ - "for tag, packet in stream:\n", - " print(tag, packet)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ef13511e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'id': 0} {'word1': 'hello 0', 'word2': 'world 0'}\n", - "{'id': 1} {'word1': 'hello 1', 'word2': 'world 1'}\n", - "{'id': 2} {'word1': 'hello 2', 'word2': 'world 2'}\n", - "{'id': 3} {'word1': 'hello 3', 'word2': 'world 3'}\n", - "{'id': 4} {'word1': 'hello 4', 'word2': 'world 4'}\n", - "{'id': 5} {'word1': 'hello 5', 'word2': 'world 5'}\n", - "{'id': 6} {'word1': 'hello 6', 'word2': 'world 6'}\n", - "{'id': 7} {'word1': 'hello 7', 'word2': 'world 7'}\n", - "{'id': 8} {'word1': 'hello 8', 'word2': 'world 8'}\n", - "{'id': 9} {'word1': 'hello 9', 'word2': 'world 9'}\n" - ] - } - ], - "source": [ - "for tag, packet in word_stream:\n", - " print(tag, packet)" - ] - }, - { - "cell_type": "markdown", - "id": "ea7eb5ed", - "metadata": {}, - "source": [ - "## Defining function pods" - ] - }, - { - "cell_type": "markdown", - "id": "891bbadf", - "metadata": {}, - "source": [ - "Now we define our own function pods to perform simple computation. \n", - "Defining a function pod is quite simple, you simply \n", - "1. define a regular function with type annotations\n", - "2. 
decorate with `op.function_pod`, passing in the name ('key') for the output value(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f8781072", - "metadata": {}, - "outputs": [], - "source": [ - "@op.function_pod(\"total\")\n", - "def total(x: int, y: int) -> int:\n", - " return x + y\n", - "\n", - "\n", - "@op.function_pod(\"delta\")\n", - "def delta(x: int, y: int) -> int:\n", - " return 2 * y - x\n", - "\n", - "\n", - "@op.function_pod(\"mult\")\n", - "def mult(x: int, y: int) -> int:\n", - " return x * y\n", - "\n", - "\n", - "@op.function_pod(\"concat_string\")\n", - "def concat(x: str, y: str) -> str:\n", - " return x + y\n" - ] - }, - { - "cell_type": "markdown", - "id": "bd843166", - "metadata": {}, - "source": [ - "Wrapped functions are now `FunctionPod` and expects to be called with streams as inputs. You can still access the original function through its `function` attribute." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7b8f8056", - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "Expected SyncStream, got int for stream 5", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# this won't work, because it's expecting a stream as input\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m5\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:60\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 58\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m streams:\n\u001b[32m 59\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, SyncStream):\n\u001b[32m---> \u001b[39m\u001b[32m60\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 61\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mExpected SyncStream, got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(stream).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m for stream \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstream\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 62\u001b[39m )\n\u001b[32m 63\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source):\n\u001b[32m 64\u001b[39m \u001b[38;5;66;03m# if the stream is a Source, instantiate it\u001b[39;00m\n\u001b[32m 65\u001b[39m stream = stream()\n", - "\u001b[31mTypeError\u001b[39m: Expected SyncStream, got int for stream 5" - ] - } - ], - "source": [ - "# this won't work, because it's expecting a stream as input\n", - "total(5, 6)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fba23537", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# but you can access original function this way\n", - "total.function(5, 6)" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "id": "e56ffa7d", - "metadata": {}, - "outputs": [], - "source": [ - "# Passing a stream into a pod does NOT immediately trigger execution, but rather returns another stream\n", - "\n", - "total_stream = total(stream)" - ] - }, - { - "cell_type": "markdown", - "id": "0af7a165", - "metadata": {}, - "source": [ - "Iterating through the stream or calling `flow` triggers the computation" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "4c9017c9", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'total_stream' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtotal_stream\u001b[49m:\n\u001b[32m 2\u001b[39m \u001b[38;5;28mprint\u001b[39m(tag, packet)\n", - "\u001b[31mNameError\u001b[39m: name 'total_stream' is not defined" - ] - } - ], - "source": [ - "for tag, packet in total_stream:\n", - " print(tag, packet)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "59104716", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'total_stream' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mtotal_stream\u001b[49m.flow()\n", - "\u001b[31mNameError\u001b[39m: name 'total_stream' is not defined" - ] - } - ], - "source": [ - "total_stream.flow()" - ] - }, - { - "cell_type": "markdown", - "id": "d1013dd1", - "metadata": {}, - "source": [ - "If you try to pass in an incompatible stream (stream whose packets don't match the expected inputs of the function), you will immediately get an error." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "77547b4d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Key 'word1' not found in parameter types.\n" - ] - }, - { - "ename": "TypeError", - "evalue": "Input packet types {'word1': , 'word2': } is not compatible with the function's expected input types {'x': , 'y': }", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[8]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m total_stream = \u001b[43mtotal\u001b[49m\u001b[43m(\u001b[49m\u001b[43mword_stream\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:75\u001b[39m, in \u001b[36mKernel.__call__\u001b[39m\u001b[34m(self, label, *streams, **kwargs)\u001b[39m\n\u001b[32m 69\u001b[39m normalized_streams = [\n\u001b[32m 70\u001b[39m stream() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(stream, Source) \u001b[38;5;28;01melse\u001b[39;00m stream\n\u001b[32m 71\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m stream \u001b[38;5;129;01min\u001b[39;00m verified_streams\n\u001b[32m 72\u001b[39m ]\n\u001b[32m 74\u001b[39m pre_processed_streams = \u001b[38;5;28mself\u001b[39m.pre_forward_hook(*normalized_streams, **kwargs)\n\u001b[32m---> \u001b[39m\u001b[32m75\u001b[39m output_stream = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43mpre_processed_streams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 76\u001b[39m post_processed_stream = \u001b[38;5;28mself\u001b[39m.post_forward_hook(output_stream, **kwargs)\n\u001b[32m 77\u001b[39m \u001b[38;5;66;03m# create an invocation instance\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:236\u001b[39m, in \u001b[36mFunctionPod.forward\u001b[39m\u001b[34m(self, *streams, **kwargs)\u001b[39m\n\u001b[32m 232\u001b[39m _, packet_typespec = stream.types(trigger_run=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 233\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m packet_typespec \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m check_typespec_compatibility(\n\u001b[32m 234\u001b[39m packet_typespec, \u001b[38;5;28mself\u001b[39m.function_input_typespec\n\u001b[32m 235\u001b[39m ):\n\u001b[32m--> \u001b[39m\u001b[32m236\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[32m 237\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInput packet types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m is not compatible with the function\u001b[39m\u001b[33m'\u001b[39m\u001b[33ms expected input types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.function_input_typespec\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 238\u001b[39m )\n\u001b[32m 239\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28msuper\u001b[39m().forward(*streams, **kwargs)\n", - "\u001b[31mTypeError\u001b[39m: Input packet types {'word1': , 'word2': } is not compatible with the function's expected input types {'x': , 'y': }" - ] - } - ], - "source": [ - "total_stream = total(word_stream)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "4c9c030a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "({'id': int}, {'x': int, 'y': int})" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# you can check the tag and packet types of the stream\n", - "stream.types()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "34338baf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "({'id': int}, {'x': int, 'y': int})" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# you can check the tag and packet types of the stream\n", - "stream.types()" - ] - }, - { - "cell_type": "markdown", - "id": "3ba299b2", - "metadata": {}, - "source": [ - "## Defining pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "1e1dd036", - "metadata": {}, - "source": [ - "We will now piece together multiple function pods into a pipeline. We do this by instantiating a `Pipeline` object. We will store the results into a simple data store." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "8083f54a", - "metadata": {}, - "outputs": [], - "source": [ - "# Use simple data store, saving data to Parquet files\n", - "from orcapod.stores.delta_table_arrow_data_store import DeltaTableArrowDataStore\n", - "\n", - "pipeline_store = DeltaTableArrowDataStore(\"./delta_store\", batch_size=100)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a475308c", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = op.Pipeline(\"test_pipeline\", pipeline_store)\n" - ] - }, - { - "cell_type": "markdown", - "id": "a42158b9", - "metadata": {}, - "source": [ - "Now we have a pipeline object, we can use it to define our pipeline by simply \"chaining\" together function pod calls." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f923ecf1", - "metadata": {}, - "outputs": [], - "source": [ - "with pipeline:\n", - " total_stream = total(stream)\n", - " delta_stream = delta(stream)\n", - " mult_stream = mult(\n", - " total_stream.map({\"total\": \"x\"}), delta_stream.map({\"delta\": \"y\"})\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "b67e9413", - "metadata": {}, - "source": [ - "And that's it! Now the elements of the pipeline is available as properties on the pipeline." - ] - }, - { - "cell_type": "markdown", - "id": "7ee41a20", - "metadata": {}, - "source": [ - "By default, the function pods are made available under the function's name in the pipeline." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "64746ada", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "Memoizing single packet return 2 packets!", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[14]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/pipeline.py:217\u001b[39m, in \u001b[36mPipeline.run\u001b[39m\u001b[34m(self, full_sync)\u001b[39m\n\u001b[32m 215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m full_sync:\n\u001b[32m 216\u001b[39m node.reset_cache()\n\u001b[32m--> \u001b[39m\u001b[32m217\u001b[39m \u001b[43mnode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mflow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 219\u001b[39m \u001b[38;5;28mself\u001b[39m.flush()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:336\u001b[39m, in \u001b[36mStream.flow\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 331\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mflow\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Collection[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 332\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 333\u001b[39m \u001b[33;03m Flow everything through the stream, returning the entire collection of\u001b[39;00m\n\u001b[32m 334\u001b[39m \u001b[33;03m (Tag, Packet) as a collection. 
This will tigger any upstream computation of the stream.\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:590\u001b[39m, in \u001b[36mSource.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 586\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 587\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 588\u001b[39m \u001b[33;03m Simple iter method that allows for Source object to act as a stream.\u001b[39;00m\n\u001b[32m 589\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m590\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/streams.py:99\u001b[39m, in \u001b[36mSyncStreamFromGenerator.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.check_consistency:\n\u001b[32m---> \u001b[39m\u001b[32m99\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.generator_factory()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:107\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 105\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 108\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mwarn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 109\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:94\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m stream:\n\u001b[32m 93\u001b[39m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m tag, output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpacket\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 96\u001b[39m logger.debug(\n\u001b[32m 97\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCall returned None as output for tag \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtag\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. Skipping...\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 98\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:629\u001b[39m, in \u001b[36mCachedFunctionPodWrapper.call\u001b[39m\u001b[34m(self, tag, packet)\u001b[39m\n\u001b[32m 627\u001b[39m output_packet = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 628\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.skip_memoization_lookup:\n\u001b[32m--> \u001b[39m\u001b[32m629\u001b[39m output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_retrieve_memoized_with_packet_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacket_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 631\u001b[39m logger.debug(\n\u001b[32m 632\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoized output for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_key\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found, skipping computation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 633\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:573\u001b[39m, in \u001b[36mCachedFunctionPodWrapper._retrieve_memoized_with_packet_key\u001b[39m\u001b[34m(self, packet_key)\u001b[39m\n\u001b[32m 571\u001b[39m packets = \u001b[38;5;28mself\u001b[39m.output_converter.from_arrow_table_to_python_packets(arrow_table)\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# since memoizing single packet, it should only contain one packet\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m573\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(packets) == \u001b[32m1\u001b[39m, (\n\u001b[32m 574\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoizing single packet return \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(packets)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m packets!\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 575\u001b[39m )\n\u001b[32m 576\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m packets[\u001b[32m0\u001b[39m]\n", - "\u001b[31mAssertionError\u001b[39m: Memoizing single packet return 2 packets!" 
- ] - } - ], - "source": [ - "pipeline.run()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "66230603", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "FunctionPodNode>" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.total" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "6587f2f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "FunctionPodNode>" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.mult" - ] - }, - { - "cell_type": "markdown", - "id": "16d0dba3", - "metadata": {}, - "source": [ - "Other implicitly created nodes such as joining of two streams are made available under the corresponding operator class (e.g. `Join`)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "bd0dfba2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "KernelNode" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.Join" - ] - }, - { - "cell_type": "markdown", - "id": "71dba5c5", - "metadata": {}, - "source": [ - "You can list out all nodes through `nodes` property" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "e22758ab", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'MySource': KernelNode,\n", - " 'total': FunctionPodNode>,\n", - " 'delta': FunctionPodNode>,\n", - " 'MapPackets_0': KernelNode,\n", - " 'MapPackets_1': KernelNode,\n", - " 'Join': KernelNode,\n", - " 'mult': FunctionPodNode>}" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.nodes" - ] - }, - { - "cell_type": "markdown", - "id": "039b617f", - "metadata": {}, - "source": [ - "You can easily rename any node using the pipeline's `rename` method" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "0d1a470e", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline.rename(\"MapPackets_0\", \"total_map\")\n", - "pipeline.rename(\"MapPackets_1\", \"mult_map\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "3a43984d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'MySource': KernelNode,\n", - " 'total': FunctionPodNode>,\n", - " 'delta': FunctionPodNode>,\n", - " 'Join': KernelNode,\n", - " 'mult': FunctionPodNode>,\n", - " 'total_map': KernelNode,\n", - " 'mult_map': KernelNode}" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.nodes" - ] - }, - { - "cell_type": "markdown", - "id": "c438f111", - "metadata": {}, - "source": [ - "Renaming does NOT change the structure of the pipeline in anyway -- it simply changes how it's labeld for your convenience." - ] - }, - { - "cell_type": "markdown", - "id": "befa6107", - "metadata": {}, - "source": [ - "### Running pipeline and accessing results" - ] - }, - { - "cell_type": "markdown", - "id": "4d4412b1", - "metadata": {}, - "source": [ - "Since we just created the pipeline, there are no results associated with any node. You can get [Polars](https://pola.rs) DataFrame viewing into the results through the node's `df` attribute." 
- ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "96106e09", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Flushing triggered!!\n" - ] - } - ], - "source": [ - "pipeline.total.df" - ] - }, - { - "cell_type": "markdown", - "id": "62b7e59a", - "metadata": {}, - "source": [ - "Before we run, the source nodes is also not \"recorded\" and thus will appear empty." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "33b449b6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Flushing triggered!!\n" - ] - } - ], - "source": [ - "pipeline.MySource.df" - ] - }, - { - "cell_type": "markdown", - "id": "408e8012", - "metadata": {}, - "source": [ - "We can trigger the entire pipeline to run and record all results by simply calling the `run` method." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "189f943f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n", - "Flushing triggered!!\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Error processing packet {'x': 8, 'y': 9}: Memoizing single packet return 2 packets!\n" - ] - }, - { - "ename": "AssertionError", - "evalue": "Memoizing single packet return 2 packets!", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[18]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/pipeline.py:217\u001b[39m, in \u001b[36mPipeline.run\u001b[39m\u001b[34m(self, full_sync)\u001b[39m\n\u001b[32m 215\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m full_sync:\n\u001b[32m 216\u001b[39m node.reset_cache()\n\u001b[32m--> \u001b[39m\u001b[32m217\u001b[39m \u001b[43mnode\u001b[49m\u001b[43m.\u001b[49m\u001b[43mflow\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 219\u001b[39m \u001b[38;5;28mself\u001b[39m.flush()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:336\u001b[39m, in \u001b[36mStream.flow\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 331\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mflow\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Collection[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 332\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 333\u001b[39m \u001b[33;03m Flow everything through the stream, returning the entire collection of\u001b[39;00m\n\u001b[32m 334\u001b[39m \u001b[33;03m (Tag, Packet) as a collection. 
This will tigger any upstream computation of the stream.\u001b[39;00m\n\u001b[32m 335\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m336\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m[\u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43me\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m]\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/base.py:590\u001b[39m, in \u001b[36mSource.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 586\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 587\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 588\u001b[39m \u001b[33;03m Simple iter method that allows for Source object to act as a stream.\u001b[39;00m\n\u001b[32m 589\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m590\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/streams.py:99\u001b[39m, in \u001b[36mSyncStreamFromGenerator.__iter__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__iter__\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[\u001b[38;5;28mtuple\u001b[39m[Tag, Packet]]:\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.check_consistency:\n\u001b[32m---> \u001b[39m\u001b[32m99\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.generator_factory()\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:107\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 105\u001b[39m logger.error(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mraise\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 108\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.error_handling == \u001b[33m\"\u001b[39m\u001b[33mwarn\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 109\u001b[39m warnings.warn(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mError processing packet \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/core/pod.py:94\u001b[39m, in \u001b[36mPod.forward..generator\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m tag, packet \u001b[38;5;129;01min\u001b[39;00m stream:\n\u001b[32m 93\u001b[39m 
\u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m tag, output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpacket\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 96\u001b[39m logger.debug(\n\u001b[32m 97\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCall returned None as output for tag \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtag\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. Skipping...\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 98\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:629\u001b[39m, in \u001b[36mCachedFunctionPodWrapper.call\u001b[39m\u001b[34m(self, tag, packet)\u001b[39m\n\u001b[32m 627\u001b[39m output_packet = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 628\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m.skip_memoization_lookup:\n\u001b[32m--> \u001b[39m\u001b[32m629\u001b[39m output_packet = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_retrieve_memoized_with_packet_key\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpacket_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 630\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m output_packet \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 631\u001b[39m logger.debug(\n\u001b[32m 632\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoized output for \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m with \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpacket_key\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found, skipping computation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 633\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/workspace/orcapod-python/src/orcapod/pipeline/nodes.py:573\u001b[39m, in \u001b[36mCachedFunctionPodWrapper._retrieve_memoized_with_packet_key\u001b[39m\u001b[34m(self, packet_key)\u001b[39m\n\u001b[32m 571\u001b[39m packets = \u001b[38;5;28mself\u001b[39m.output_converter.from_arrow_table_to_python_packets(arrow_table)\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# since memoizing single packet, it should only contain one packet\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m573\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(packets) == \u001b[32m1\u001b[39m, (\n\u001b[32m 574\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mMemoizing single packet return \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(packets)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m packets!\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 575\u001b[39m )\n\u001b[32m 576\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m packets[\u001b[32m0\u001b[39m]\n", - "\u001b[31mAssertionError\u001b[39m: Memoizing single packet return 2 packets!" - ] - } - ], - "source": [ - "pipeline.run()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "1674bec4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "[polars HTML table output elided; identical to the text/plain rendering below]
" - ], - "text/plain": [ - "shape: (10, 3)\n", - "┌─────┬─────┬─────┐\n", - "│ id ┆ x ┆ y │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ i64 ┆ i64 │\n", - "╞═════╪═════╪═════╡\n", - "│ 0 ┆ 0 ┆ 1 │\n", - "│ 1 ┆ 1 ┆ 2 │\n", - "│ 2 ┆ 2 ┆ 3 │\n", - "│ 3 ┆ 3 ┆ 4 │\n", - "│ 4 ┆ 4 ┆ 5 │\n", - "│ 5 ┆ 5 ┆ 6 │\n", - "│ 6 ┆ 6 ┆ 7 │\n", - "│ 7 ┆ 7 ┆ 8 │\n", - "│ 8 ┆ 8 ┆ 9 │\n", - "│ 9 ┆ 9 ┆ 10 │\n", - "└─────┴─────┴─────┘" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.MySource.df" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "2b69d213", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "[polars HTML table output elided; identical to the text/plain rendering below]
" - ], - "text/plain": [ - "shape: (10, 2)\n", - "┌─────┬───────┐\n", - "│ id ┆ total │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═══════╡\n", - "│ 0 ┆ 1 │\n", - "│ 1 ┆ 3 │\n", - "│ 2 ┆ 5 │\n", - "│ 3 ┆ 7 │\n", - "│ 4 ┆ 9 │\n", - "│ 5 ┆ 11 │\n", - "│ 6 ┆ 13 │\n", - "│ 7 ┆ 15 │\n", - "│ 8 ┆ 17 │\n", - "│ 9 ┆ 19 │\n", - "└─────┴───────┘" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.total.df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/tutorials/02_parallel_execution_on_ray.ipynb b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb new file mode 100644 index 0000000..b7067c9 --- /dev/null +++ b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb @@ -0,0 +1,129 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0c2dfaec", + "metadata": {}, + "outputs": [], + "source": [ + "from orcapod.execution_engines import NativeRayAsyncEngine\n", + "import orcapod as op\n", + "import pyarrow as pa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3196df7e", + "metadata": {}, + "outputs": [], + "source": [ + "input_stream = op.streams.ImmutableTableStream(\n", + " pa.Table.from_pylist([{\"id\": i, \"x\": i * 2, \"y\": i * 3} for i in range(30)]),\n", + " tag_columns=[\"id\"],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e1f338b", + "metadata": {}, + "outputs": [], + "source": [ + "ray_engine = NativeRayAsyncEngine(\n", + " \"ray://raycluster-op-test-kuberay-head-svc.ray.svc.cluster.local:10001\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63c692df", + "metadata": {}, + "outputs": [], + "source": [ + "from time import sleep\n", + "\n", + "\n", + "@op.function_pod(\"sum\")\n", + "def add_numbers(x: int, y: int) -> int:\n", + " \"\"\"\n", + " A simple function that adds two numbers.\n", + " \"\"\"\n", + " sleep(0.5)\n", + " return x + y" + ] + }, + { + "cell_type": "markdown", + "id": "0de4762b", + "metadata": {}, + "source": [ + "Run first synchronously" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "506a3a1e", + "metadata": {}, + "outputs": [], + "source": [ + "result_stream1 = add_numbers(input_stream)\n", + "result_stream1.run()\n", + "result_stream1.as_df()" + ] + }, + { + "cell_type": "markdown", + "id": "fcc8c2f8", + "metadata": {}, + "source": [ + "Now let's run it asynchronously using the Ray engine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e83fddac", + "metadata": {}, + "outputs": [], + "source": [ + "result_stream2 = add_numbers(input_stream)\n", + "result_stream2.run(ray_engine)\n", + "result_stream2.as_df()" + ] + }, + { + "cell_type": "markdown", + "id": "23179bdc", + "metadata": {}, + "source": [ + "**NOTE**: Depending on the availability of nodes and how Ray was configured, you may *not* see any improvement in the running speed for the example above (it may even take longer due to overhead!). If you observe that you don't seem to be getting any speed up, please consult your Ray cluster administrator." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index adf7f44..afca7a5 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -54,7 +54,7 @@ def __init__( for c in self._data_table.column_names if c.startswith(constants.SYSTEM_TAG_PREFIX) ] - self._system_tags_dict = ( + self._system_tags_dict: dict[str, DataValue] = ( self._data_context.type_converter.arrow_table_to_python_dicts( self._data_table.select(extracted_system_tag_columns) )[0] @@ -190,7 +190,7 @@ def as_datagram( data_context=self._data_context, ) - def system_tags(self) -> dict[str, str | None]: + def system_tags(self) -> dict[str, DataValue | None]: """ Return system tags for all keys. diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 920769c..ccea0af 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -50,7 +50,7 @@ def kernel_id(self) -> tuple[str, ...]: Returns a unique identifier for the kernel. This is used to identify the kernel in the computational graph. """ - return (f"{self.__class__.__name__}", self.content_hash()) + return (f"{self.__class__.__name__}", self.content_hash().hex()) @property def data_context(self) -> contexts.DataContext: diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 3a653ed..9604da6 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -17,7 +17,7 @@ from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore -from orcapod.types import TypeSpec +from orcapod.types import DataValue, TypeSpec from orcapod.types import typespec_utils as tsutils from orcapod.utils.lazy_module import LazyModule from orcapod.hashing.hash_utils import get_function_signature, get_function_components @@ -170,7 +170,20 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: @abstractmethod def call( - self, tag: dp.Tag, packet: dp.Packet + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, + ) -> tuple[dp.Tag, dp.Packet | None]: ... + + @abstractmethod + async def async_call( + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, ) -> tuple[dp.Tag, dp.Packet | None]: ... 
def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: @@ -355,8 +368,59 @@ def __str__(self) -> str: return f"FunctionPod:{func_sig}" def call( - self, tag: dp.Tag, packet: dp.Packet, record_id: str | None = None + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, ) -> tuple[dp.Tag, DictPacket | None]: + if not self.is_active(): + logger.info( + f"Pod is not active: skipping computation on input packet {packet}" + ) + return tag, None + + # any kernel/pod invocation happening inside the function will NOT be tracked + if not isinstance(packet, dict): + input_dict = packet.as_dict(include_source=False) + else: + input_dict = packet + + with self._tracker_manager.no_tracking(): + if execution_engine is not None: + # use the provided execution engine to run the function + values = execution_engine.submit_sync(self.function, **input_dict) + else: + values = self.function(**input_dict) + + output_data = self.process_function_output(values) + + if record_id is None: + # if record_id is not provided, generate it from the packet + record_id = self.get_record_id(packet) + source_info = { + k: ":".join(self.kernel_id + (record_id, k)) for k in output_data + } + + output_packet = DictPacket( + output_data, + source_info=source_info, + python_schema=self.output_packet_types(), + data_context=self._data_context, + ) + return tag, output_packet + + async def async_call( + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, + ) -> tuple[dp.Tag, dp.Packet | None]: + """ + Asynchronous call to the function pod. This is a placeholder for future implementation. + Currently, it behaves like the synchronous call. 
+ """ if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -366,8 +430,38 @@ def call( # any kernel/pod invocation happening inside the function will NOT be tracked with self._tracker_manager.no_tracking(): - values = self.function(**packet.as_dict(include_source=False)) + # any kernel/pod invocation happening inside the function will NOT be tracked + if not isinstance(packet, dict): + input_dict = packet.as_dict(include_source=False) + else: + input_dict = packet + if execution_engine is not None: + # use the provided execution engine to run the function + values = await execution_engine.submit_async( + self.function, **input_dict + ) + else: + values = self.function(**input_dict) + + output_data = self.process_function_output(values) + + if record_id is None: + # if record_id is not provided, generate it from the packet + record_id = self.get_record_id(packet) + source_info = { + k: ":".join(self.kernel_id + (record_id, k)) for k in output_data + } + output_packet = DictPacket( + output_data, + source_info=source_info, + python_schema=self.output_packet_types(), + data_context=self._data_context, + ) + return tag, output_packet + + def process_function_output(self, values: Any) -> dict[str, DataValue]: + output_values = [] if len(self.output_keys) == 0: output_values = [] elif len(self.output_keys) == 1: @@ -384,21 +478,7 @@ def call( f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" ) - output_data = {k: v for k, v in zip(self.output_keys, output_values)} - if record_id is None: - # if record_id is not provided, generate it from the packet - record_id = self.get_record_id(packet) - source_info = { - k: ":".join(self.kernel_id + (record_id, k)) for k in output_data - } - - output_packet = DictPacket( - {k: v for k, v in zip(self.output_keys, output_values)}, - source_info=source_info, - python_schema=self.output_packet_types(), - data_context=self._data_context, - ) - return tag, output_packet + return {k: v for k, v in zip(self.output_keys, output_values)} def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -474,9 +554,26 @@ def validate_inputs(self, *streams: dp.Stream) -> None: self.pod.validate_inputs(*streams) def call( - self, tag: dp.Tag, packet: dp.Packet, record_id: str | None = None + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, ) -> tuple[dp.Tag, dp.Packet | None]: - return self.pod.call(tag, packet, record_id=record_id) + return self.pod.call( + tag, packet, record_id=record_id, execution_engine=execution_engine + ) + + async def async_call( + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, + ) -> tuple[dp.Tag, dp.Packet | None]: + return await self.pod.async_call( + tag, packet, record_id=record_id, execution_engine=execution_engine + ) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -528,6 +625,31 @@ def call( tag: dp.Tag, packet: dp.Packet, record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[dp.Tag, dp.Packet | None]: + # TODO: consider logic for overwriting existing records + if record_id is None: + record_id = self.get_record_id(packet) + output_packet = None + if not 
skip_cache_lookup: + output_packet = self.get_recorded_output_packet(packet) + if output_packet is None: + tag, output_packet = super().call( + tag, packet, record_id=record_id, execution_engine=execution_engine + ) + if output_packet is not None and not skip_cache_insert: + self.record_packet(packet, output_packet, record_id=record_id) + + return tag, output_packet + + async def async_call( + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: @@ -538,7 +660,9 @@ def call( if not skip_cache_lookup: output_packet = self.get_recorded_output_packet(packet) if output_packet is None: - tag, output_packet = super().call(tag, packet, record_id=record_id) + tag, output_packet = await super().async_call( + tag, packet, record_id=record_id, execution_engine=execution_engine + ) if output_packet is not None and not skip_cache_insert: self.record_packet(packet, output_packet, record_id=record_id) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 7bbc926..1caad9d 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,6 +1,6 @@ import logging from abc import ABC, abstractmethod -from collections.abc import Collection, Iterator, Mapping +from collections.abc import AsyncIterator, Collection, Iterator, Mapping from datetime import datetime, timezone from itertools import repeat from pathlib import Path @@ -19,14 +19,17 @@ from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule + if TYPE_CHECKING: import pyarrow as pa import pyarrow.compute as pc import polars as pl + import asyncio else: pa = LazyModule("pyarrow") pc = LazyModule("pyarrow.compute") pl = LazyModule("polars") + asyncio = LazyModule("asyncio") # TODO: consider using this instead of making copy of dicts @@ -35,6 +38,30 @@ logger = logging.getLogger(__name__) +def synchronous_run(async_func, *args, **kwargs): + """ + Use existing event loop if available. + + Pros: Reuses existing loop, more efficient + Cons: More complex, need to handle loop detection + """ + try: + # Check if we're already in an event loop + loop = asyncio.get_running_loop() + + def run_in_thread(): + return asyncio.run(async_func(*args, **kwargs)) + + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(run_in_thread) + return future.result() + except RuntimeError: + # No event loop running, safe to use asyncio.run() + return asyncio.run(async_func(*args, **kwargs)) + + class OperatorStreamBaseMixin: def join(self, other_stream: dp.Stream) -> dp.Stream: """ @@ -91,6 +118,7 @@ def __init__( source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] = (), data_context: str | contexts.DataContext | None = None, + execution_engine: dp.ExecutionEngine | None = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -104,6 +132,7 @@ def __init__( # if source is provided, use its data context data_context = source.data_context_key self._data_context = contexts.resolve_context(data_context) + self._execution_engine = execution_engine @property def substream_identities(self) -> tuple[str, ...]: @@ -113,6 +142,22 @@ def substream_identities(self) -> tuple[str, ...]: """ return (self.content_hash().hex(),) + @property + def execution_engine(self): + """ + Returns the execution engine that is used to execute this stream. 
+ This is typically used to track the execution context of the stream. + """ + return self._execution_engine + + @execution_engine.setter + def execution_engine(self, engine: dp.ExecutionEngine | None) -> None: + """ + Sets the execution engine for the stream. + This is typically used to track the execution context of the stream. + """ + self._execution_engine = engine + def get_substream(self, substream_id: str) -> dp.Stream: """ Returns the substream with the given substream_id. @@ -217,8 +262,21 @@ def __iter__( @abstractmethod def iter_packets( self, + execution_engine: dp.ExecutionEngine | None = None, ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... + @abstractmethod + def run( + self, + execution_engine: dp.ExecutionEngine | None = None, + ) -> None: ... + + @abstractmethod + async def run_async( + self, + execution_engine: dp.ExecutionEngine | None = None, + ) -> None: ... + @abstractmethod def as_table( self, @@ -226,6 +284,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": ... def as_df( @@ -234,6 +293,7 @@ def as_df( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: dp.ExecutionEngine | None = None, ) -> "pl.DataFrame | None": """ Convert the entire stream to a Polars DataFrame. @@ -244,15 +304,18 @@ def as_df( include_source=include_source, include_system_tags=include_system_tags, include_content_hash=include_content_hash, + execution_engine=execution_engine, ) ) - def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + def flow( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> Collection[tuple[dp.Tag, dp.Packet]]: """ Flow everything through the stream, returning the entire collection of (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. """ - return [e for e in self] + return [e for e in self.iter_packets(execution_engine=execution_engine)] def identity_structure(self) -> Any: """ @@ -432,6 +495,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": """ Returns the underlying table representation of the stream. @@ -451,7 +515,8 @@ def as_table( hash_column_name, pa.array(content_hashes, type=pa.large_string()) ) if not include_system_tags: - output_table = output_table.drop_columns(self._system_tag_columns) + # Check in original implementation + output_table = output_table.drop_columns(list(self._system_tag_columns)) table_stack = (output_table,) if include_data_context: table_stack += (self._data_context_table,) @@ -467,7 +532,9 @@ def clear_cache(self) -> None: """ self._cached_elements = None - def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: + def iter_packets( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> Iterator[tuple[dp.Tag, ArrowPacket]]: """ Iterates over the packets in the stream. Each packet is represented as a tuple of (Tag, Packet). @@ -510,6 +577,24 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, ArrowPacket]]: ) yield from self._cached_elements + def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + """ + Runs the stream, which in this case is a no-op since the stream is immutable. + This is typically used to trigger any upstream computation of the stream. 
+ """ + # No-op for immutable streams + pass + + async def run_async( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> None: + """ + Runs the stream asynchronously, which in this case is a no-op since the stream is immutable. + This is typically used to trigger any upstream computation of the stream. + """ + # No-op for immutable streams + pass + def __repr__(self) -> str: return ( f"{self.__class__.__name__}(table={self._table.column_names}, " @@ -617,12 +702,29 @@ def last_modified(self) -> datetime | None: return None return self._cached_stream.last_modified + def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + self._cached_stream.run(execution_engine=execution_engine) + + async def run_async( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> None: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + await self._cached_stream.run_async(execution_engine=execution_engine) + def as_table( self, include_data_context: bool = False, include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": self.refresh() assert self._cached_stream is not None, ( @@ -633,14 +735,18 @@ def as_table( include_source=include_source, include_system_tags=include_system_tags, include_content_hash=include_content_hash, + execution_engine=execution_engine, ) - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + def iter_packets( + self, + execution_engine: dp.ExecutionEngine | None = None, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." 
) - return self._cached_stream.iter_packets() + return self._cached_stream.iter_packets(execution_engine=execution_engine) def __repr__(self) -> str: return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" @@ -665,7 +771,9 @@ def __init__(self, pod: dp.Pod, prepared_stream: dp.Stream, **kwargs): self._cached_output_table: pa.Table | None = None self._cached_content_hash_column: pa.Array | None = None - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + def iter_packets( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: if self._prepared_stream_iterator is not None: for i, (tag, packet) in enumerate(self._prepared_stream_iterator): if i in self._cached_output_packets: @@ -675,7 +783,9 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: yield tag, packet else: # Process packet - processed = self.pod.call(tag, packet) + processed = self.pod.call( + tag, packet, execution_engine=execution_engine + ) if processed is not None: # Update shared cache for future iterators (optimization) self._cached_output_packets[i] = processed @@ -692,6 +802,47 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: if packet is not None: yield tag, packet + async def run_async( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> None: + if self._prepared_stream_iterator is not None: + pending_call_lut = {} + for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + if i not in self._cached_output_packets: + # Process packet + pending_call_lut[i] = self.pod.async_call( + tag, packet, execution_engine=execution_engine + ) + + indices = list(pending_call_lut.keys()) + pending_calls = [pending_call_lut[i] for i in indices] + + results = await asyncio.gather(*pending_calls) + for i, result in zip(indices, results): + self._cached_output_packets[i] = result + + # Mark completion by releasing the iterator + self._prepared_stream_iterator = None + + def run( + self, + execution_engine: dp.ExecutionEngine | None = None, + try_async_backend: bool = True, + ) -> None: + if try_async_backend: + # Use async run if requested + try: + return synchronous_run( + self.run_async, execution_engine=execution_engine + ) + except RuntimeError as e: + logger.warning( + "Failed to run async stream synchronously, falling back to sync run: %s", + e, + ) + # Fallback to synchronous run + self.flow(execution_engine=execution_engine) + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. 
@@ -714,12 +865,13 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": if self._cached_output_table is None: all_tags = [] all_packets = [] tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets(): + for tag, packet in self.iter_packets(execution_engine=execution_engine): if tag_schema is None: tag_schema = tag.arrow_schema(include_system_tags=True) if packet_schema is None: @@ -814,9 +966,11 @@ def __init__(self, pod: dp.CachedPod, input_stream: dp.Stream, **kwargs): self._cached_output_table: pa.Table | None = None self._cached_content_hash_column: pa.Array | None = None - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + async def run_async( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> None: """ - Processes the input stream and prepares the output stream. + Runs the stream, processing the input stream and preparing the output stream. This is typically called before iterating over the packets. """ if self._cached_output_packets is None: @@ -869,6 +1023,110 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: tag_keys = self.input_stream.keys()[0] + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = ImmutableTableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + + pending_calls = [] + if missing is not None and missing.num_rows > 0: + for tag, packet in ImmutableTableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + pending = self.pod.async_call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) + pending_calls.append(pending) + import asyncio + + completed_calls = await asyncio.gather(*pending_calls) + for results in completed_calls: + for tag, packet in results: + cached_results.append((tag, packet)) + + self._cached_output_packets = cached_results + self._set_modified_time() + + def run( + self, + execution_engine: dp.ExecutionEngine | None = None, + try_async_backend: bool = True, + ) -> None: + if try_async_backend: + # Use async run if requested + try: + return synchronous_run( + self.run_async, execution_engine=execution_engine + ) + except RuntimeError as e: + logger.warning( + "Failed to run async stream synchronously, falling back to sync run: %s", + e, + ) + # Fallback to synchronous run + self.flow(execution_engine=execution_engine) + + def iter_packets( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """ + Processes the input stream and prepares the output stream. + This is typically called before iterating over the packets. 
+ """ + if self._cached_output_packets is None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_content_hash=constants.INPUT_PACKET_HASH, + execution_engine=execution_engine, + ) + existing_entries = self.pod.get_all_records(include_system_columns=True) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + # missing = target_entries.join( + # existing_entries, + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left anti", + # ) + # Single join that gives you both missing and existing + # More efficient - only bring the key column from existing_entries + # .select([constants.INPUT_PACKET_HASH]).append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + all_results = target_entries.join( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ), + keys=[constants.INPUT_PACKET_HASH], + join_type="left outer", + right_suffix="_right", + ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = ( + all_results.filter(pc.is_valid(pc.field("_exists"))) + .drop_columns(target_entries.column_names) + .drop_columns(["_exists"]) + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + if existing is not None and existing.num_rows > 0: # If there are existing entries, we can cache them existing_stream = ImmutableTableStream(existing, tag_columns=tag_keys) @@ -879,7 +1137,12 @@ def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: if missing is not None and missing.num_rows > 0: for tag, packet in ImmutableTableStream(missing, tag_columns=tag_keys): # Since these packets are known to be missing, skip the cache lookup - tag, packet = self.pod.call(tag, packet, skip_cache_lookup=True) + tag, packet = self.pod.call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) cached_results.append((tag, packet)) if packet is not None: yield tag, packet @@ -940,12 +1203,13 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": if self._cached_output_table is None: all_tags = [] all_packets = [] tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets(): + for tag, packet in self.iter_packets(execution_engine=execution_engine): if tag_schema is None: tag_schema = tag.arrow_schema(include_system_tags=True) if packet_schema is None: @@ -999,7 +1263,7 @@ def as_table( if include_content_hash: if self._cached_content_hash_column is None: content_hashes = [] - for tag, packet in self.iter_packets(): + for tag, packet in self.iter_packets(execution_engine=execution_engine): content_hashes.append(packet.content_hash()) self._cached_content_hash_column = pa.array( content_hashes, type=pa.large_string() @@ -1048,7 +1312,9 @@ def as_table( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: dp.ExecutionEngine | None = 
None, ) -> "pa.Table": """ Returns the underlying table representation of the stream. @@ -1057,15 +1323,20 @@ def as_table( return self._stream.as_table( include_data_context=include_data_context, include_source=include_source, + include_system_tags=include_system_tags, include_content_hash=include_content_hash, + execution_engine=execution_engine, ) - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + def iter_packets( + self, + execution_engine: dp.ExecutionEngine | None = None, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: """ Iterates over the packets in the stream. Each packet is represented as a tuple of (Tag, Packet). """ - return self._stream.iter_packets() + return self._stream.iter_packets(execution_engine=execution_engine) def identity_structure(self) -> Any: return self._stream.identity_structure() diff --git a/src/orcapod/execution_engines/__init__.py b/src/orcapod/execution_engines/__init__.py new file mode 100644 index 0000000..9a76995 --- /dev/null +++ b/src/orcapod/execution_engines/__init__.py @@ -0,0 +1 @@ +from .ray_execution_engine import NativeRayAsyncEngine diff --git a/src/orcapod/execution_engines/ray_execution_engine.py b/src/orcapod/execution_engines/ray_execution_engine.py new file mode 100644 index 0000000..47d2032 --- /dev/null +++ b/src/orcapod/execution_engines/ray_execution_engine.py @@ -0,0 +1,119 @@ +from orcapod.utils.lazy_module import LazyModule +from typing import TYPE_CHECKING, Any, TypeVar +import logging +from collections.abc import Callable + +if TYPE_CHECKING: + import asyncio + import ray +else: + asyncio = LazyModule("asyncio") + ray = LazyModule("ray") + +logger = logging.getLogger(__name__) + + +T = TypeVar("T") + + +class NativeRayAsyncEngine: + """ + Ray execution engine using native asyncio support. + + This approach uses Ray's built-in async capabilities: + 1. For tasks: Uses ObjectRef.future() + asyncio.wrap_future() + 2. For batch operations: Uses ray's native async support + 3. No polling needed - Ray handles async integration + """ + + def __init__(self, ray_address: str | None = None, **ray_init_kwargs): + """Initialize Ray with native async support.""" + + if not ray.is_initialized(): + ray.init(address=ray_address, **ray_init_kwargs) + self._ray_initialized_here = True + else: + self._ray_initialized_here = False + + logger.info("Native Ray async engine initialized") + logger.info(f"Cluster resources: {ray.cluster_resources()}") + + def submit_sync(self, func: Callable[..., T], *args, **kwargs) -> T: + """ + Submit a function synchronously using Ray. + + This is a blocking call that waits for the result. + """ + # Create remote function and submit + remote_func = ray.remote(func) + object_ref = remote_func.remote(*args, **kwargs) + + # Wait for the result - this is blocking + result = ray.get(object_ref) + return result + + async def submit_async(self, func: Callable[..., T], *args, **kwargs) -> T: + """ + Submit a function using Ray's native async support. + + Uses ObjectRef.future() which Ray converts to asyncio.Future natively. + """ + # Create remote function and submit + remote_func = ray.remote(func) + object_ref = remote_func.remote(*args, **kwargs) + + # Use Ray's native async support - this is the key insight! 
+ # ObjectRef.future() returns a concurrent.futures.Future that works with asyncio + future = object_ref.future() # type: ignore + asyncio_future = asyncio.wrap_future(future) + + # This is truly non-blocking and integrates with asyncio event loop + result = await asyncio_future + return result + + async def submit_batch_async( + self, + func: Callable[..., T], + args_list: list[tuple], + kwargs_list: list[dict] | None = None, + ) -> list[T]: + """ + Submit batch using Ray's native async support. + + This is much more efficient than individual submissions. + """ + if kwargs_list is None: + kwargs_list = [{}] * len(args_list) + + # Create remote function once + remote_func = ray.remote(func) + + # Submit all tasks and get ObjectRefs + object_refs = [ + remote_func.remote(*args, **kwargs) + for args, kwargs in zip(args_list, kwargs_list) + ] + + # Convert all ObjectRefs to asyncio futures + asyncio_futures = [ + asyncio.wrap_future(obj_ref.future()) # type: ignore + for obj_ref in object_refs + ] + + # Use asyncio.gather for efficient concurrent execution + results = await asyncio.gather(*asyncio_futures) + return results + + def get_cluster_info(self) -> dict[str, Any]: + """Get Ray cluster information.""" + return { + "cluster_resources": ray.cluster_resources(), + "available_resources": ray.available_resources(), + "nodes": ray.nodes(), + } + + def shutdown(self) -> None: + """Shutdown Ray if we initialized it.""" + if self._ray_initialized_here and ray.is_initialized(): + ray.shutdown() + logger.info("Native Ray async engine shut down") diff --git a/src/orcapod/hashing/content_identifiable.py b/src/orcapod/hashing/content_identifiable.py index ece579c..af1ccb0 100644 --- a/src/orcapod/hashing/content_identifiable.py +++ b/src/orcapod/hashing/content_identifiable.py @@ -1,331 +1,331 @@ -from collections.abc import Collection, Mapping -from pathlib import Path -from typing import Any -from uuid import UUID -from orcapod import contexts -import logging -from orcapod.protocols import hashing_protocols as hp - -logger = logging.getLogger(__name__) - - -class ContentIdentifiableBase: - """ - Base class for content-identifiable objects. - This class provides a way to define objects that can be uniquely identified - based on their content rather than their identity in memory. Specifically, the identity of the - object is determined by the structure returned by the `identity_structure` method. - The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, - which defaults to the one returned by `get_default_object_hasher`. - Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. - """ - - def __init__( - self, - label: str | None = None, - data_context: Any = None, # Placeholder for ObjectHasher or similar context - ) -> None: - """ - Initialize the ContentHashable with an optional ObjectHasher. - """ - self._data_context = contexts.resolve_context(data_context) - self._label = label - self._cached_hash: bytes | None = None - - @property - def has_assigned_label(self) -> bool: - """ - Check if the label is explicitly set for this object. - - Returns: - bool: True if the label is explicitly set, False otherwise. - """ - return self._label is not None - - @property - def label(self) -> str: - """ - Get the label of this object. - - Returns: - str | None: The label of the object, or None if not set. 
- """ - return self._label or self.computed_label() or self.__class__.__name__ - - @label.setter - def label(self, label: str | None) -> None: - """ - Set the label of this object. - - Args: - label (str | None): The label to set for this object. - """ - self._label = label - - def computed_label(self) -> str | None: - """ - Compute a label for this object based on its content. If label is not explicitly set for this object - and computed_label returns a valid value, it will be used as label of this object. - """ - return None - - def identity_structure(self) -> Any: - """ - Return a structure that represents the identity of this object. - - Override this method in your subclass to provide a stable representation - of your object's content. The structure should contain all fields that - determine the object's identity. - - Returns: - Any: A structure representing this object's content, or None to use default hash - """ - return None - - def content_hash(self) -> bytes: - """ - Compute a hash based on the content of this object. - - This method uses the identity structure to compute a hash value. - If no identity structure is provided, it will return None. - - Returns: - int: A hash value based on the content of this object, or None if no identity structure is provided. - """ - if self._cached_hash is None: - structure = self.identity_structure() - - processed_structure = process_structure(structure) - - self._cached_hash = self._data_context.object_hasher.hash( - processed_structure - ) - return self._cached_hash - - def __hash__(self) -> int: - """ - Hash implementation that uses the identity structure if provided, - otherwise falls back to the superclass's hash method. - - Returns: - int: A hash value based on either content or identity - """ - # Get the identity structure - structure = self.identity_structure() - if structure is None: - # If no identity structure is provided, use the default hash - return super().__hash__() - - return self._data_context.object_hasher.hash_to_int(structure) - - def __eq__(self, other: object) -> bool: - """ - Equality check that compares the identity structures of two objects. - - Args: - other (object): The object to compare against. - - Returns: - bool: True if both objects have the same identity structure, False otherwise. - """ - if not isinstance(other, ContentIdentifiableBase): - return NotImplemented - - return self.identity_structure() == other.identity_structure() - - -def process_structure( - obj: Any, - visited: set[int] | None = None, - force_hash: bool = True, - function_info_extractor: hp.FunctionInfoExtractor | None = None, -) -> Any: - """ - Recursively process a structure to prepare it for hashing. 
- - Args: - obj: The object or structure to process - visited: Set of object ids already visited (to handle circular references) - function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation - - Returns: - A processed version of the structure suitable for stable hashing - """ - # Initialize the visited set if this is the top-level call - if visited is None: - visited = set() - else: - visited = visited.copy() # Copy to avoid modifying the original set - - # Check for circular references - use object's memory address - # NOTE: While id() is not stable across sessions, we only use it within a session - # to detect circular references, not as part of the final hash - obj_id = id(obj) - if obj_id in visited: - logger.debug( - f"Detected circular reference for object of type {type(obj).__name__}" - ) - return "CircularRef" # Don't include the actual id in hash output - - # For objects that could contain circular references, add to visited - if isinstance(obj, (dict, list, tuple, set)) or not isinstance( - obj, (str, int, float, bool, type(None)) - ): - visited.add(obj_id) - - # Handle None - if obj is None: - return None - - # TODO: currently using runtime_checkable on ContentIdentifiable protocol - # Re-evaluate this strategy to see if a faster / more robust check could be used - if isinstance(obj, hp.ContentIdentifiable): - logger.debug( - f"Processing ContentHashableBase instance of type {type(obj).__name__}" - ) - return obj.content_hash() - - # Handle basic types - if isinstance(obj, (str, int, float, bool)): - return obj - - # Handle bytes and bytearray - if isinstance(obj, (bytes, bytearray)): - logger.debug( - f"Converting bytes/bytearray of length {len(obj)} to hex representation" - ) - return obj.hex() - - # Handle Path objects - if isinstance(obj, Path): - logger.debug(f"Converting Path object to string: {obj}") - raise NotImplementedError( - "Path objects are not supported in this hasher. Please convert to string." - ) - return str(obj) - - # Handle UUID objects - if isinstance(obj, UUID): - logger.debug(f"Converting UUID to string: {obj}") - raise NotImplementedError( - "UUID objects are not supported in this hasher. Please convert to string." 
- ) - return str(obj) - - # Handle named tuples (which are subclasses of tuple) - if hasattr(obj, "_fields") and isinstance(obj, tuple): - logger.debug(f"Processing named tuple of type {type(obj).__name__}") - # For namedtuples, convert to dict and then process - d = {field: getattr(obj, field) for field in obj._fields} # type: ignore - return process_structure(d, visited) - - # Handle mappings (dict-like objects) - if isinstance(obj, Mapping): - # Process both keys and values - processed_items = [ - ( - process_structure(k, visited), - process_structure(v, visited), - ) - for k, v in obj.items() - ] - - # Sort by the processed keys for deterministic order - processed_items.sort(key=lambda x: str(x[0])) - - # Create a new dictionary with string keys based on processed keys - # TODO: consider checking for possibly problematic values in processed_k - # and issue a warning - return { - str(processed_k): processed_v - for processed_k, processed_v in processed_items - } - - # Handle sets and frozensets - if isinstance(obj, (set, frozenset)): - logger.debug( - f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" - ) - # Process each item first, then sort the processed results - processed_items = [process_structure(item, visited) for item in obj] - return sorted(processed_items, key=str) - - # Handle collections (list-like objects) - if isinstance(obj, Collection): - logger.debug( - f"Processing collection of type {type(obj).__name__} with {len(obj)} items" - ) - return [process_structure(item, visited) for item in obj] - - # For functions, use the function_content_hash - if callable(obj) and hasattr(obj, "__code__"): - logger.debug(f"Processing function: {getattr(obj, '__name__')}") - if function_info_extractor is not None: - # Use the extractor to get a stable representation - function_info = function_info_extractor.extract_function_info(obj) - logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") - - # simply return the function info as a stable representation - return function_info - else: - raise ValueError( - f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" - ) - - # handle data types - if isinstance(obj, type): - logger.debug(f"Processing class/type: {obj.__name__}") - return f"type:{obj.__name__}" - - # For other objects, attempt to create deterministic representation only if force_hash=True - class_name = obj.__class__.__name__ - module_name = obj.__class__.__module__ - if force_hash: - try: - import re - - logger.debug( - f"Processing generic object of type {module_name}.{class_name}" - ) - - # Try to get a stable dict representation if possible - if hasattr(obj, "__dict__"): - # Sort attributes to ensure stable order - attrs = sorted( - (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") - ) - # Limit to first 10 attributes to avoid extremely long representations - if len(attrs) > 10: - logger.debug( - f"Object has {len(attrs)} attributes, limiting to first 10" - ) - attrs = attrs[:10] - attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] - obj_repr = f"{{{', '.join(attr_strs)}}}" - else: - # Get basic repr but remove memory addresses - logger.debug( - "Object has no __dict__, using repr() with memory address removal" - ) - obj_repr = repr(obj) - if len(obj_repr) > 1000: - logger.debug( - f"Object repr is {len(obj_repr)} chars, truncating to 1000" - ) - obj_repr = obj_repr[:1000] + "..." 
- # Remove memory addresses which look like '0x7f9a1c2b3d4e' - obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) - - return f"{module_name}.{class_name}:{obj_repr}" - except Exception as e: - # Last resort - use class name only - logger.warning(f"Failed to process object representation: {e}") - try: - return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" - except AttributeError: - logger.error("Could not determine object class, using UnknownObject") - return "UnknownObject" - else: - raise ValueError( - f"Processing of {obj} of type {module_name}.{class_name} is not supported" - ) +# from collections.abc import Collection, Mapping +# from pathlib import Path +# from typing import Any +# from uuid import UUID +# from orcapod import contexts +# import logging +# from orcapod.protocols import hashing_protocols as hp + +# logger = logging.getLogger(__name__) + + +# class ContentIdentifiableBase: +# """ +# Base class for content-identifiable objects. +# This class provides a way to define objects that can be uniquely identified +# based on their content rather than their identity in memory. Specifically, the identity of the +# object is determined by the structure returned by the `identity_structure` method. +# The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, +# which defaults to the one returned by `get_default_object_hasher`. +# Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. +# """ + +# def __init__( +# self, +# label: str | None = None, +# data_context: Any = None, # Placeholder for ObjectHasher or similar context +# ) -> None: +# """ +# Initialize the ContentHashable with an optional ObjectHasher. +# """ +# self._data_context = contexts.resolve_context(data_context) +# self._label = label +# self._cached_hash: bytes | None = None + +# @property +# def has_assigned_label(self) -> bool: +# """ +# Check if the label is explicitly set for this object. + +# Returns: +# bool: True if the label is explicitly set, False otherwise. +# """ +# return self._label is not None + +# @property +# def label(self) -> str: +# """ +# Get the label of this object. + +# Returns: +# str | None: The label of the object, or None if not set. +# """ +# return self._label or self.computed_label() or self.__class__.__name__ + +# @label.setter +# def label(self, label: str | None) -> None: +# """ +# Set the label of this object. + +# Args: +# label (str | None): The label to set for this object. +# """ +# self._label = label + +# def computed_label(self) -> str | None: +# """ +# Compute a label for this object based on its content. If label is not explicitly set for this object +# and computed_label returns a valid value, it will be used as label of this object. +# """ +# return None + +# def identity_structure(self) -> Any: +# """ +# Return a structure that represents the identity of this object. + +# Override this method in your subclass to provide a stable representation +# of your object's content. The structure should contain all fields that +# determine the object's identity. + +# Returns: +# Any: A structure representing this object's content, or None to use default hash +# """ +# return None + +# def content_hash(self) -> bytes: +# """ +# Compute a hash based on the content of this object. + +# This method uses the identity structure to compute a hash value. +# If no identity structure is provided, it will return None. 
+ +# Returns: +# int: A hash value based on the content of this object, or None if no identity structure is provided. +# """ +# if self._cached_hash is None: +# structure = self.identity_structure() + +# processed_structure = process_structure(structure) + +# self._cached_hash = self._data_context.object_hasher.hash( +# processed_structure +# ) +# return self._cached_hash + +# def __hash__(self) -> int: +# """ +# Hash implementation that uses the identity structure if provided, +# otherwise falls back to the superclass's hash method. + +# Returns: +# int: A hash value based on either content or identity +# """ +# # Get the identity structure +# structure = self.identity_structure() +# if structure is None: +# # If no identity structure is provided, use the default hash +# return super().__hash__() + +# return self._data_context.object_hasher.hash_to_int(structure) + +# def __eq__(self, other: object) -> bool: +# """ +# Equality check that compares the identity structures of two objects. + +# Args: +# other (object): The object to compare against. + +# Returns: +# bool: True if both objects have the same identity structure, False otherwise. +# """ +# if not isinstance(other, ContentIdentifiableBase): +# return NotImplemented + +# return self.identity_structure() == other.identity_structure() + + +# def process_structure( +# obj: Any, +# visited: set[int] | None = None, +# force_hash: bool = True, +# function_info_extractor: hp.FunctionInfoExtractor | None = None, +# ) -> Any: +# """ +# Recursively process a structure to prepare it for hashing. + +# Args: +# obj: The object or structure to process +# visited: Set of object ids already visited (to handle circular references) +# function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation + +# Returns: +# A processed version of the structure suitable for stable hashing +# """ +# # Initialize the visited set if this is the top-level call +# if visited is None: +# visited = set() +# else: +# visited = visited.copy() # Copy to avoid modifying the original set + +# # Check for circular references - use object's memory address +# # NOTE: While id() is not stable across sessions, we only use it within a session +# # to detect circular references, not as part of the final hash +# obj_id = id(obj) +# if obj_id in visited: +# logger.debug( +# f"Detected circular reference for object of type {type(obj).__name__}" +# ) +# return "CircularRef" # Don't include the actual id in hash output + +# # For objects that could contain circular references, add to visited +# if isinstance(obj, (dict, list, tuple, set)) or not isinstance( +# obj, (str, int, float, bool, type(None)) +# ): +# visited.add(obj_id) + +# # Handle None +# if obj is None: +# return None + +# # TODO: currently using runtime_checkable on ContentIdentifiable protocol +# # Re-evaluate this strategy to see if a faster / more robust check could be used +# if isinstance(obj, hp.ContentIdentifiable): +# logger.debug( +# f"Processing ContentHashableBase instance of type {type(obj).__name__}" +# ) +# return obj.content_hash() + +# # Handle basic types +# if isinstance(obj, (str, int, float, bool)): +# return obj + +# # Handle bytes and bytearray +# if isinstance(obj, (bytes, bytearray)): +# logger.debug( +# f"Converting bytes/bytearray of length {len(obj)} to hex representation" +# ) +# return obj.hex() + +# # Handle Path objects +# if isinstance(obj, Path): +# logger.debug(f"Converting Path object to string: {obj}") +# raise NotImplementedError( +# "Path 
objects are not supported in this hasher. Please convert to string." +# ) +# return str(obj) + +# # Handle UUID objects +# if isinstance(obj, UUID): +# logger.debug(f"Converting UUID to string: {obj}") +# raise NotImplementedError( +# "UUID objects are not supported in this hasher. Please convert to string." +# ) +# return str(obj) + +# # Handle named tuples (which are subclasses of tuple) +# if hasattr(obj, "_fields") and isinstance(obj, tuple): +# logger.debug(f"Processing named tuple of type {type(obj).__name__}") +# # For namedtuples, convert to dict and then process +# d = {field: getattr(obj, field) for field in obj._fields} # type: ignore +# return process_structure(d, visited) + +# # Handle mappings (dict-like objects) +# if isinstance(obj, Mapping): +# # Process both keys and values +# processed_items = [ +# ( +# process_structure(k, visited), +# process_structure(v, visited), +# ) +# for k, v in obj.items() +# ] + +# # Sort by the processed keys for deterministic order +# processed_items.sort(key=lambda x: str(x[0])) + +# # Create a new dictionary with string keys based on processed keys +# # TODO: consider checking for possibly problematic values in processed_k +# # and issue a warning +# return { +# str(processed_k): processed_v +# for processed_k, processed_v in processed_items +# } + +# # Handle sets and frozensets +# if isinstance(obj, (set, frozenset)): +# logger.debug( +# f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" +# ) +# # Process each item first, then sort the processed results +# processed_items = [process_structure(item, visited) for item in obj] +# return sorted(processed_items, key=str) + +# # Handle collections (list-like objects) +# if isinstance(obj, Collection): +# logger.debug( +# f"Processing collection of type {type(obj).__name__} with {len(obj)} items" +# ) +# return [process_structure(item, visited) for item in obj] + +# # For functions, use the function_content_hash +# if callable(obj) and hasattr(obj, "__code__"): +# logger.debug(f"Processing function: {getattr(obj, '__name__')}") +# if function_info_extractor is not None: +# # Use the extractor to get a stable representation +# function_info = function_info_extractor.extract_function_info(obj) +# logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") + +# # simply return the function info as a stable representation +# return function_info +# else: +# raise ValueError( +# f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" +# ) + +# # handle data types +# if isinstance(obj, type): +# logger.debug(f"Processing class/type: {obj.__name__}") +# return f"type:{obj.__name__}" + +# # For other objects, attempt to create deterministic representation only if force_hash=True +# class_name = obj.__class__.__name__ +# module_name = obj.__class__.__module__ +# if force_hash: +# try: +# import re + +# logger.debug( +# f"Processing generic object of type {module_name}.{class_name}" +# ) + +# # Try to get a stable dict representation if possible +# if hasattr(obj, "__dict__"): +# # Sort attributes to ensure stable order +# attrs = sorted( +# (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") +# ) +# # Limit to first 10 attributes to avoid extremely long representations +# if len(attrs) > 10: +# logger.debug( +# f"Object has {len(attrs)} attributes, limiting to first 10" +# ) +# attrs = attrs[:10] +# attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] +# obj_repr = f"{{{', '.join(attr_strs)}}}" +# else: +# # Get 
basic repr but remove memory addresses +# logger.debug( +# "Object has no __dict__, using repr() with memory address removal" +# ) +# obj_repr = repr(obj) +# if len(obj_repr) > 1000: +# logger.debug( +# f"Object repr is {len(obj_repr)} chars, truncating to 1000" +# ) +# obj_repr = obj_repr[:1000] + "..." +# # Remove memory addresses which look like '0x7f9a1c2b3d4e' +# obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) + +# return f"{module_name}.{class_name}:{obj_repr}" +# except Exception as e: +# # Last resort - use class name only +# logger.warning(f"Failed to process object representation: {e}") +# try: +# return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" +# except AttributeError: +# logger.error("Could not determine object class, using UnknownObject") +# return "UnknownObject" +# else: +# raise ValueError( +# f"Processing of {obj} of type {module_name}.{class_name} is not supported" +# ) diff --git a/src/orcapod/hashing/types.py b/src/orcapod/hashing/types.py index 6306d94..027b9e2 100644 --- a/src/orcapod/hashing/types.py +++ b/src/orcapod/hashing/types.py @@ -1,178 +1,178 @@ -"""Hash strategy protocols for dependency injection.""" +# """Hash strategy protocols for dependency injection.""" -from abc import ABC, abstractmethod -from collections.abc import Callable -from typing import Any, Protocol, runtime_checkable -import uuid +# from abc import ABC, abstractmethod +# from collections.abc import Callable +# from typing import Any, Protocol, runtime_checkable +# import uuid -from orcapod.types import PacketLike, PathLike, PathSet, TypeSpec +# from orcapod.types import PacketLike, PathLike, PathSet, TypeSpec -import pyarrow as pa +# import pyarrow as pa -@runtime_checkable -class Identifiable(Protocol): - """Protocol for objects that can provide an identity structure.""" +# @runtime_checkable +# class Identifiable(Protocol): +# """Protocol for objects that can provide an identity structure.""" - def identity_structure(self) -> Any: - """ - Return a structure that represents the identity of this object. +# def identity_structure(self) -> Any: +# """ +# Return a structure that represents the identity of this object. - Returns: - Any: A structure representing this object's content. - Should be deterministic and include all identity-relevant data. - Return None to indicate no custom identity is available. - """ - pass # pragma: no cover +# Returns: +# Any: A structure representing this object's content. +# Should be deterministic and include all identity-relevant data. +# Return None to indicate no custom identity is available. +# """ +# pass # pragma: no cover -class ObjectHasher(ABC): - """Abstract class for general object hashing.""" +# class ObjectHasher(ABC): +# """Abstract class for general object hashing.""" - # TODO: consider more explicitly stating types of objects accepted - @abstractmethod - def hash(self, obj: Any) -> bytes: - """ - Hash an object to a byte representation. +# # TODO: consider more explicitly stating types of objects accepted +# @abstractmethod +# def hash(self, obj: Any) -> bytes: +# """ +# Hash an object to a byte representation. - Args: - obj (Any): The object to hash. +# Args: +# obj (Any): The object to hash. - Returns: - bytes: The byte representation of the hash. - """ - ... +# Returns: +# bytes: The byte representation of the hash. +# """ +# ... 
- @abstractmethod - def get_hasher_id(self) -> str: - """ - Returns a unique identifier/name assigned to the hasher - """ +# @abstractmethod +# def get_hasher_id(self) -> str: +# """ +# Returns a unique identifier/name assigned to the hasher +# """ - def hash_to_hex( - self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False - ) -> str: - hash_bytes = self.hash(obj) - hex_str = hash_bytes.hex() +# def hash_to_hex( +# self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False +# ) -> str: +# hash_bytes = self.hash(obj) +# hex_str = hash_bytes.hex() - # TODO: clean up this logic, as char_count handling is messy - if char_count is not None: - if char_count > len(hex_str): - raise ValueError( - f"Cannot truncate to {char_count} chars, hash only has {len(hex_str)}" - ) - hex_str = hex_str[:char_count] - if prefix_hasher_id: - hex_str = self.get_hasher_id() + "@" + hex_str - return hex_str +# # TODO: clean up this logic, as char_count handling is messy +# if char_count is not None: +# if char_count > len(hex_str): +# raise ValueError( +# f"Cannot truncate to {char_count} chars, hash only has {len(hex_str)}" +# ) +# hex_str = hex_str[:char_count] +# if prefix_hasher_id: +# hex_str = self.get_hasher_id() + "@" + hex_str +# return hex_str - def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: - """ - Hash an object to an integer. +# def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: +# """ +# Hash an object to an integer. - Args: - obj (Any): The object to hash. - hexdigits (int): Number of hexadecimal digits to use for the hash. +# Args: +# obj (Any): The object to hash. +# hexdigits (int): Number of hexadecimal digits to use for the hash. - Returns: - int: The integer representation of the hash. - """ - hex_hash = self.hash_to_hex(obj, char_count=hexdigits) - return int(hex_hash, 16) +# Returns: +# int: The integer representation of the hash. +# """ +# hex_hash = self.hash_to_hex(obj, char_count=hexdigits) +# return int(hex_hash, 16) - def hash_to_uuid( - self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID - ) -> uuid.UUID: - """Convert hash to proper UUID5.""" - return uuid.uuid5(namespace, self.hash(obj)) +# def hash_to_uuid( +# self, obj: Any, namespace: uuid.UUID = uuid.NAMESPACE_OID +# ) -> uuid.UUID: +# """Convert hash to proper UUID5.""" +# return uuid.uuid5(namespace, self.hash(obj)) -@runtime_checkable -class FileContentHasher(Protocol): - """Protocol for file-related hashing.""" +# @runtime_checkable +# class FileContentHasher(Protocol): +# """Protocol for file-related hashing.""" - def hash_file(self, file_path: PathLike) -> bytes: ... +# def hash_file(self, file_path: PathLike) -> bytes: ... -@runtime_checkable -class ArrowHasher(Protocol): - """Protocol for hashing arrow packets.""" +# @runtime_checkable +# class ArrowHasher(Protocol): +# """Protocol for hashing arrow packets.""" - def get_hasher_id(self) -> str: ... +# def get_hasher_id(self) -> str: ... - def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... +# def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: ... -@runtime_checkable -class StringCacher(Protocol): - """Protocol for caching string key value pairs.""" +# @runtime_checkable +# class StringCacher(Protocol): +# """Protocol for caching string key value pairs.""" - def get_cached(self, cache_key: str) -> str | None: ... - def set_cached(self, cache_key: str, value: str) -> None: ... - def clear_cache(self) -> None: ... 
+# def get_cached(self, cache_key: str) -> str | None: ... +# def set_cached(self, cache_key: str, value: str) -> None: ... +# def clear_cache(self) -> None: ... -# Function hasher protocol -@runtime_checkable -class FunctionInfoExtractor(Protocol): - """Protocol for extracting function information.""" +# # Function hasher protocol +# @runtime_checkable +# class FunctionInfoExtractor(Protocol): +# """Protocol for extracting function information.""" - def extract_function_info( - self, - func: Callable[..., Any], - function_name: str | None = None, - input_typespec: TypeSpec | None = None, - output_typespec: TypeSpec | None = None, - ) -> dict[str, Any]: ... +# def extract_function_info( +# self, +# func: Callable[..., Any], +# function_name: str | None = None, +# input_typespec: TypeSpec | None = None, +# output_typespec: TypeSpec | None = None, +# ) -> dict[str, Any]: ... -class SemanticTypeHasher(Protocol): - """Abstract base class for semantic type-specific hashers.""" +# class SemanticTypeHasher(Protocol): +# """Abstract base class for semantic type-specific hashers.""" - @abstractmethod - def hash_column( - self, - column: pa.Array, - ) -> pa.Array: - """Hash a column with this semantic type and return the hash bytes.""" - pass +# @abstractmethod +# def hash_column( +# self, +# column: pa.Array, +# ) -> pa.Array: +# """Hash a column with this semantic type and return the hash bytes.""" +# pass - @abstractmethod - def set_cacher(self, cacher: StringCacher) -> None: - """Add a string cacher for caching hash values.""" - pass +# @abstractmethod +# def set_cacher(self, cacher: StringCacher) -> None: +# """Add a string cacher for caching hash values.""" +# pass -# ---------------Legacy implementations and protocols to be deprecated--------------------- +# # ---------------Legacy implementations and protocols to be deprecated--------------------- -@runtime_checkable -class LegacyFileHasher(Protocol): - """Protocol for file-related hashing.""" +# @runtime_checkable +# class LegacyFileHasher(Protocol): +# """Protocol for file-related hashing.""" - def hash_file(self, file_path: PathLike) -> str: ... +# def hash_file(self, file_path: PathLike) -> str: ... -# Higher-level operations that compose file hashing -@runtime_checkable -class LegacyPathSetHasher(Protocol): - """Protocol for hashing pathsets (files, directories, collections).""" +# # Higher-level operations that compose file hashing +# @runtime_checkable +# class LegacyPathSetHasher(Protocol): +# """Protocol for hashing pathsets (files, directories, collections).""" - def hash_pathset(self, pathset: PathSet) -> str: ... +# def hash_pathset(self, pathset: PathSet) -> str: ... -@runtime_checkable -class LegacyPacketHasher(Protocol): - """Protocol for hashing packets.""" +# @runtime_checkable +# class LegacyPacketHasher(Protocol): +# """Protocol for hashing packets.""" - def hash_packet(self, packet: PacketLike) -> str: ... +# def hash_packet(self, packet: PacketLike) -> str: ... 
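Note on the hashing protocols being commented out above: the ObjectHasher ABC only requires `hash()` and `get_hasher_id()`; `hash_to_hex`, `hash_to_int`, and `hash_to_uuid` are all derived from those two. A minimal standalone sketch of the same pattern follows; the class name and the JSON-based serialization are illustrative assumptions, not part of orcapod.

    import hashlib
    import json
    from typing import Any

    class Sha256ObjectHasher:
        # Hypothetical concrete hasher following the ObjectHasher pattern above:
        # implement `hash` and `get_hasher_id`; helpers such as hash_to_hex
        # derive from them, mirroring the ABC.

        def get_hasher_id(self) -> str:
            return "sha256-json"

        def hash(self, obj: Any) -> bytes:
            # Deterministic serialization; assumes obj is JSON-representable.
            payload = json.dumps(obj, sort_keys=True, default=str).encode()
            return hashlib.sha256(payload).digest()

        def hash_to_hex(
            self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False
        ) -> str:
            hex_str = self.hash(obj).hex()
            if char_count is not None:
                hex_str = hex_str[:char_count]
            return f"{self.get_hasher_id()}@{hex_str}" if prefix_hasher_id else hex_str

    print(Sha256ObjectHasher().hash_to_hex({"a": 1}, char_count=16, prefix_hasher_id=True))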
-# Combined interface for convenience (optional) -@runtime_checkable -class LegacyCompositeFileHasher( - LegacyFileHasher, LegacyPathSetHasher, LegacyPacketHasher, Protocol -): - """Combined interface for all file-related hashing operations.""" +# # Combined interface for convenience (optional) +# @runtime_checkable +# class LegacyCompositeFileHasher( +# LegacyFileHasher, LegacyPathSetHasher, LegacyPacketHasher, Protocol +# ): +# """Combined interface for all file-related hashing operations.""" - pass +# pass diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 22abe80..781161b 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -272,6 +272,7 @@ def call( tag: dp.Tag, packet: dp.Packet, record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: @@ -284,6 +285,43 @@ def call( record_id=record_id, skip_cache_lookup=skip_cache_lookup, skip_cache_insert=skip_cache_insert, + execution_engine=execution_engine, + ) + + if output_packet is not None: + retrieved = ( + output_packet.get_meta_value(self.DATA_RETRIEVED_FLAG) is not None + ) + # add pipeline record if the output packet is not None + # TODO: verify cache lookup logic + self.add_pipeline_record( + tag, + packet, + record_id, + retrieved=retrieved, + skip_cache_lookup=skip_cache_lookup, + ) + return tag, output_packet + + async def async_call( + self, + tag: dp.Tag, + packet: dp.Packet, + record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[dp.Tag, dp.Packet | None]: + if record_id is None: + record_id = self.get_record_id(packet) + + tag, output_packet = await super().async_call( + tag, + packet, + record_id=record_id, + skip_cache_lookup=skip_cache_lookup, + skip_cache_insert=skip_cache_insert, + execution_engine=execution_engine, ) if output_packet is not None: diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index c64e817..284a697 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1,15 +1,32 @@ -from collections.abc import Collection, Iterator, Mapping +from collections.abc import Collection, Iterator, Mapping, Callable from datetime import datetime from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING from orcapod.protocols.hashing_protocols import ContentIdentifiable from orcapod.types import DataValue, TypeSpec + if TYPE_CHECKING: import pyarrow as pa import polars as pl import pandas as pd +class ExecutionEngine(Protocol): + def submit_sync(self, function: Callable, *args, **kwargs) -> Any: + """ + Run the given function with the provided arguments. + This method should be implemented by the execution engine. + """ + ... + + async def submit_async(self, function: Callable, *args, **kwargs) -> Any: + """ + Asynchronously run the given function with the provided arguments. + This method should be implemented by the execution engine. + """ + ... + + class Datagram(Protocol): """ Protocol for immutable datagram containers in Orcapod. @@ -1229,6 +1246,29 @@ def substream_identities(self) -> tuple[str, ...]: """ ... + @property + def execution_engine(self) -> ExecutionEngine | None: + """ + The execution engine attached to this stream. 
By default, the stream + will use this execution engine whenever it needs to perform computation. + None means the stream is not attached to any execution engine and will default + to running natively. + """ + + @execution_engine.setter + def execution_engine(self, engine: ExecutionEngine | None) -> None: + """ + Set the execution engine for this stream. + + This allows the stream to use a specific execution engine for + computation, enabling optimized execution strategies and resource + management. + + Args: + engine: The execution engine to attach to this stream + """ + ... + def get_substream(self, substream_id: str) -> "Stream": """ Retrieve a specific sub-stream by its identifier. @@ -1354,7 +1394,9 @@ def __iter__(self) -> Iterator[tuple[Tag, Packet]]: """ ... - def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: + def iter_packets( + self, execution_engine: ExecutionEngine | None = None + ) -> Iterator[tuple[Tag, Packet]]: """ Alias for __iter__ for explicit packet iteration. @@ -1375,12 +1417,41 @@ def iter_packets(self) -> Iterator[tuple[Tag, Packet]]: """ ... + def run(self, execution_engine: ExecutionEngine | None = None) -> None: + """ + Execute the stream using the provided execution engine. + + This method triggers computation of the stream content based on its + source kernel and upstream streams. It returns a new stream instance + containing the computed (tag, packet) pairs. + + Args: + execution_engine: The execution engine to use for computation + + """ + ... + + async def run_async(self, execution_engine: ExecutionEngine | None = None) -> None: + """ + Asynchronously execute the stream using the provided execution engine. + + This method triggers computation of the stream content based on its + source kernel and upstream streams. It returns a new stream instance + containing the computed (tag, packet) pairs. + + Args: + execution_engine: The execution engine to use for computation + + """ + ... + def as_df( self, include_data_context: bool = False, include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: ExecutionEngine | None = None, ) -> "pl.DataFrame | None": """ Convert the entire stream to a Polars DataFrame. @@ -1393,6 +1464,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + execution_engine: ExecutionEngine | None = None, ) -> "pa.Table": """ Convert the entire stream to a PyArrow Table. @@ -1410,7 +1482,9 @@ def as_table( """ ... - def flow(self) -> Collection[tuple[Tag, Packet]]: + def flow( + self, execution_engine: ExecutionEngine | None = None + ) -> Collection[tuple[Tag, Packet]]: """ Return the entire stream as a collection of (tag, packet) pairs. @@ -1418,8 +1492,9 @@ def flow(self) -> Collection[tuple[Tag, Packet]]: collection type. It is useful for small streams or when you need to process all data at once. - Returns: - Collection[tuple[Tag, Packet]]: All (tag, packet) pairs in the stream + Args: + execution_engine: Optional execution engine to use for computation. + If None, the stream will use its default execution engine. """ ... @@ -1810,8 +1885,20 @@ def output_packet_types(self) -> TypeSpec: """ ... + async def async_call( + self, + tag: Tag, + packet: Packet, + record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, + ) -> tuple[Tag, Packet | None]: ... 
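Note: the ExecutionEngine protocol added at the top of this data_protocols.py diff only requires a blocking `submit_sync` and an awaitable `submit_async`; streams (`run`, `run_async`, `iter_packets`, `as_df`, `as_table`, `flow`) and pods (`call`, `async_call`) then accept such an engine as an optional argument. A minimal local sketch, assuming a thread-pool-backed implementation (the class name and pool size are illustrative, not part of this patch):

    import asyncio
    from collections.abc import Callable
    from concurrent.futures import ThreadPoolExecutor
    from typing import Any

    class LocalThreadEngine:
        # Hypothetical ExecutionEngine: runs submitted functions on a local
        # thread pool. Only the two protocol methods are required.

        def __init__(self, max_workers: int = 4) -> None:
            self._pool = ThreadPoolExecutor(max_workers=max_workers)

        def submit_sync(self, function: Callable, *args, **kwargs) -> Any:
            # Block until the function completes and return its result.
            return self._pool.submit(function, *args, **kwargs).result()

        async def submit_async(self, function: Callable, *args, **kwargs) -> Any:
            # Offload to the pool without blocking the event loop.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(self._pool, lambda: function(*args, **kwargs))

    engine = LocalThreadEngine()
    print(engine.submit_sync(sum, [1, 2, 3]))                # 6
    print(asyncio.run(engine.submit_async(sum, [1, 2, 3])))  # 6

A cluster-backed engine, such as the Ray engine used in the tutorial notebook later in this series, would presumably implement the same two methods against remote workers instead of a local pool.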
+ def call( - self, tag: Tag, packet: Packet, record_id: str | None = None + self, + tag: Tag, + packet: Packet, + record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, ) -> tuple[Tag, Packet | None]: """ Process a single packet with its associated tag. @@ -1846,11 +1933,22 @@ def call( class CachedPod(Pod, Protocol): + async def async_call( + self, + tag: Tag, + packet: Packet, + record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[Tag, Packet | None]: ... + def call( self, tag: Tag, packet: Packet, record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[Tag, Packet | None]: From 8e4fdd779473388fda9e32aa4d3c4115a2788084 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 5 Aug 2025 07:27:59 +0000 Subject: [PATCH 174/224] refactor: remove unused modules --- src/orcapod/dj/__init__.py | 0 src/orcapod/dj/mapper.py | 127 ----------- src/orcapod/dj/operation.py | 12 - src/orcapod/dj/pod.py | 149 ------------- src/orcapod/dj/source.py | 425 ------------------------------------ src/orcapod/dj/stream.py | 155 ------------- src/orcapod/dj/tracker.py | 155 ------------- 7 files changed, 1023 deletions(-) delete mode 100644 src/orcapod/dj/__init__.py delete mode 100644 src/orcapod/dj/mapper.py delete mode 100644 src/orcapod/dj/operation.py delete mode 100644 src/orcapod/dj/pod.py delete mode 100644 src/orcapod/dj/source.py delete mode 100644 src/orcapod/dj/stream.py delete mode 100644 src/orcapod/dj/tracker.py diff --git a/src/orcapod/dj/__init__.py b/src/orcapod/dj/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/orcapod/dj/mapper.py b/src/orcapod/dj/mapper.py deleted file mode 100644 index efec07c..0000000 --- a/src/orcapod/dj/mapper.py +++ /dev/null @@ -1,127 +0,0 @@ -import warnings -from typing import Optional - -from orcapod.core.operators import Join, MapPackets, MapTags, Operator -from .operation import QueryOperation -from .stream import QueryStream - - -class QueryMapper(QueryOperation, Operator): - """ - A special type of mapper that returns and works with QueryStreams - """ - - -def convert_to_query_mapper(operation: Operator) -> QueryMapper: - """ - Convert a generic mapper to an equivalent, Query mapper - """ - - if isinstance(operation, Join): - return JoinQuery() - elif isinstance(operation, MapPackets): - proj_map = {v: k for k, v in operation.key_map.items()} - # if drop_unmapped is True, we need to project the keys - args = [] if operation.drop_unmapped else [...] 
- return ProjectQuery(*args, **proj_map) - elif isinstance(operation, MapTags): - proj_map = {v: k for k, v in operation.key_map.items()} - if operation.drop_unmapped: - warnings.warn("Dropping unmapped tags is not supported in DataJoint") - return ProjectQuery(..., **proj_map) - elif isinstance(operation, QueryOperation): - # if the operation is already a QueryOperation, just return it - return operation - else: - raise ValueError(f"Unknown operation: {operation}") - - -class JoinQuery(QueryMapper): - """ - DataJoint specific Join operation that only works on QueryStream - """ - - def forward(self, *streams: QueryStream, project=False) -> QueryStream: - if len(streams) < 2: - raise ValueError("Join operation requires at least two streams") - - if not all(isinstance(s, QueryStream) for s in streams): - raise ValueError("All streams must be QueryStreams") - - # join the tables - joined_query = None - upstream_tables = set() - for stream in streams: - next_query = stream.query.proj() if project else stream.query - if joined_query is None: - joined_query = next_query - else: - joined_query = joined_query * stream.query - upstream_tables.update(stream.upstream_tables) - - return QueryStream(joined_query, upstream_tables) - - -class ProjectQuery(QueryMapper): - """ - Project (rename/remove) tag and packet keys - """ - - def __init__(self, *args, _label: Optional[str] = None, **projection_kwargs): - super().__init__(label=_label) - self.projection_args = args - self.projection_kwargs = projection_kwargs - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - str(self.projection_args), - str(self.projection_kwargs), - ) + tuple(streams) - - def forward(self, *streams: QueryStream) -> QueryStream: - if len(streams) != 1: - raise ValueError("Project operation requires exactly one stream") - - stream = streams[0] - - # project the query - projected_query = stream.query.proj( - *self.projection_args, **self.projection_kwargs - ) - - projected_upstreams = [ - table.proj(*self.projection_args, **self.projection_kwargs) - for table in stream.upstream_tables - ] - - return QueryStream(projected_query, projected_upstreams) - - -class RestrictQuery(QueryMapper): - """ - Restrict (filter) tag and packet keys - """ - - def __init__(self, *restrictions, label: Optional[str] = None): - super().__init__(label=label) - self.restrictions = restrictions - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - str(self.restrictions), - ) + tuple(streams) - - def forward(self, *streams: QueryStream) -> QueryStream: - if len(streams) != 1: - raise ValueError("Restrict operation requires exactly one stream") - - stream = streams[0] - - # restrict the query - restricted_query = stream.query - for restriction in self.restrictions: - restricted_query = restricted_query & restriction - - return QueryStream(restricted_query, stream.upstream_tables) diff --git a/src/orcapod/dj/operation.py b/src/orcapod/dj/operation.py deleted file mode 100644 index 70b218e..0000000 --- a/src/orcapod/dj/operation.py +++ /dev/null @@ -1,12 +0,0 @@ -from orcapod.core.base import Kernel -from .stream import QueryStream - - -class QueryOperation(Kernel): - """ - A special type of operation that returns and works with - QueryStreams - """ - - def __call__(self, *streams: QueryStream, **kwargs) -> QueryStream: - return super().__call__(*streams, **kwargs) diff --git a/src/orcapod/dj/pod.py b/src/orcapod/dj/pod.py deleted file mode 100644 index 7101090..0000000 --- a/src/orcapod/dj/pod.py +++ 
/dev/null @@ -1,149 +0,0 @@ -import logging -from typing import Collection, Optional, Tuple - -import datajoint as dj -from datajoint import Schema -from datajoint.table import Table - -from orcapod.core.pod import FunctionPod, Pod -from ..utils.name import pascal_to_snake, snake_to_pascal -from .mapper import JoinQuery -from .operation import QueryOperation -from .source import QuerySource -from .stream import QueryStream, TableCachedStream, TableStream - -logger = logging.getLogger(__name__) - - -class QueryPod(Pod, QueryOperation): - """ - A special type of operation that returns and works with - QueryStreams - """ - - -class TableCachedPod(QueryPod, QuerySource): - def __init__( - self, - fp: FunctionPod, - schema: Schema, - table_name: str = None, - table_postfix: str = "", - streams: Collection[QueryStream] = None, - create_table: bool = True, - label: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__(label=label, **kwargs) - self.fp = fp - self.schema = schema - self.table_name = ( - table_name if table_name is not None else pascal_to_snake(fp.function_name) - ) + (f"_{table_postfix}" if table_postfix else "") - self.streams = streams if streams is not None else [] - self.table = None - if create_table: - self.compile() - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - str(self.schema), - self.table_name, - self.fp, - ) + tuple(self.streams) - - @property - def label(self) -> str: - if self._label is None: - return snake_to_pascal(self.fp.function_name) - return self._label - - def prepare_source_query(self) -> Tuple[QueryStream, Collection[Table]]: - if len(self.streams) > 1: - query_stream = JoinQuery()(*self.streams) - else: - query_stream = self.streams[0] - - source_query = query_stream.query - - upstream_tables = query_stream.upstream_tables - - return source_query, upstream_tables - - def compile(self) -> None: - if not all(isinstance(s, QueryStream) for s in self.streams): - raise ValueError("All streams must be QueryStreams") - - source_query, upstream_tables = self.prepare_source_query() - - # upstreams = '\n'.join([f"-> self.streams[{i}].upstream_tables[{j}]" for i, stream in enumerate(self.streams) for j in range(len(stream.upstream_tables))]) - upstreams = "\n".join( - f"-> upstream_tables[{i}]" for i in range(len(upstream_tables)) - ) - outputs = "\n".join([f"{k}: varchar(255)" for k in self.fp.output_keys]) - - class PodTable(dj.Computed): - definition = f""" - # {self.table_name} outputs - {upstreams} - --- - {outputs} - """ - - source = self - - @property - def key_source(self): - return source_query - - def make(self, key): - # form QueryStream using key - query_stream = QueryStream(source_query & key, upstream_tables) - - for key, packet in self.source.fp(query_stream): - key.update(packet) - self.insert1(key) - logger.info(f"Inserted key: {key}") - - PodTable.__name__ = snake_to_pascal(self.table_name) - PodTable = self.schema(PodTable) - self.table = PodTable - - def forward(self, *streams: QueryStream) -> QueryStream: - """ - This method has two modes of operations: as a source and as a pod. - When used as a source, it will return a stream of data from what's already - in the table. When used as a pod, it will execute the function as necessary - and insert the results into the table. 
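The docstring above describes the two modes of TableCachedPod.forward. Since this dj module is deleted by this patch, the following is only a rough usage sketch of the legacy API being removed; `fp`, `my_schema`, and `upstream` are placeholders for a FunctionPod, a DataJoint schema, and an upstream QueryStream, and nothing here runs without a live DataJoint connection.

    # Legacy usage sketch of the module removed by this patch.
    pod = TableCachedPod(fp, schema=my_schema, streams=[upstream])

    # Source mode: no input streams -> yields (tag, packet) pairs for rows
    # already cached in the backing table.
    for tag, packet in pod():
        ...

    # Pod mode: with QueryStream inputs -> restricts the source query, runs the
    # wrapped FunctionPod, and inserts each result row into the table as it is
    # yielded (see TableCachedStream further down in this diff).
    for tag, packet in pod(upstream):
        ...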
- """ - - # if no stream are provided, act as a source and return the table stream - if len(streams) < 1: - return TableStream(self.table) - - # verify that all stream are QueryStreams - if not all(isinstance(s, QueryStream) for s in streams): - raise ValueError("All streams must be QueryStreams") - - # be sure to project out non primary keys - if len(streams) > 1: - joined_streams = JoinQuery()(*streams, project=True) - else: - # TODO: add proj method onto QueryStream - joined_streams = QueryStream( - streams[0].query.proj(), streams[0].upstream_tables - ) - - source_query, upstream_tables = self.prepare_source_query() - - # restrict the source using the passed in query - source_query = source_query & joined_streams.query - - # form QueryStream using key - query_stream = QueryStream(source_query, upstream_tables) - - # set table to allow direct insert - self.table._allow_insert = True - - return TableCachedStream(self.table, self.fp(query_stream)) diff --git a/src/orcapod/dj/source.py b/src/orcapod/dj/source.py deleted file mode 100644 index 0eaa6dc..0000000 --- a/src/orcapod/dj/source.py +++ /dev/null @@ -1,425 +0,0 @@ -import logging -from typing import Any, Collection, Optional, Union - -import datajoint as dj -from datajoint import Schema, Table - -from orcapod.hashing import hash_to_uuid - -from orcapod.core.sources import Source -from orcapod.core.streams import SyncStream -from orcapod.utils.name import pascal_to_snake, snake_to_pascal -from orcapod.utils.stream_utils import common_elements -from orcapod.dj.operation import QueryOperation -from orcapod.dj.stream import QueryStream, TableCachedStream, TableStream - -logger = logging.getLogger(__name__) - - -class QuerySource(Source, QueryOperation): - """ - A speical type of source that returns and works with QueryStreams - """ - - -class TableSource(QuerySource): - """ - A source that reads from a table. - """ - - def __init__( - self, table: Union[Table, type[Table]], label: Optional[str] = None - ) -> None: - super().__init__(label=label) - # if table is an instance, grab the class for consistency - if not isinstance(table, type): - table = table.__class__ - self.table = table - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - str(self.table.full_table_name), - ) + tuple(streams) - - @property - def label(self) -> str: - if self._label is None: - return self.table.__name__ - return self._label - - def forward(self, *streams: SyncStream) -> QueryStream: - """ - Read from the table and return a stream of packets. - """ - if len(streams) > 0: - raise ValueError("No streams should be passed to TableSource") - # make sure to pass in an instance of the table for the query - return QueryStream(self.table(), [self.table()]) - - def proj(self, *args, **kwargs) -> "TableSource": - """ - Project the table and return a new source. - """ - return TableSource(self.table.proj(*args, **kwargs)) - - def __and__(self, other: Any) -> "TableSource": - """ - Join the table with another table and return a new source. 
- """ - if isinstance(other, TableSource): - other = other.table - elif isinstance(other, QueryStream): - other = other.query - else: - raise ValueError(f"Object of type {type(other)} is not supported.") - return TableSource(self.table & other) - - def __repr__(self): - return self.table().__repr__() - - def preview(self, limit=None, width=None): - return self.table().preview(limit=limit, width=width) - - def _repr_html_(self): - """:return: HTML to display table in Jupyter notebook.""" - return self.table()._repr_html_() - - -class TableCachedStreamSource(QuerySource): - """ - This class wraps any `Stream` and caches the output into a DataJoint table. - The class instance acts as a source and returns a `QueryStream` when invoked. - """ - - def __init__( - self, - stream: SyncStream, - schema: Schema, - table_name: str | None = None, - label: str | None = None, - ): - super().__init__(label=label) - self.stream = stream - self.schema = schema - # if table name is not provided, use the name of the stream source - if table_name is None: - if stream.invocation is not None: - table_name = stream.invocation.operation.__class__.__name__ - else: - table_name = stream.__class__.__name__ - # make sure the table name is in snake case - self.table_name = pascal_to_snake(table_name) - - self.table = None - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - self.stream, - str(self.schema), - self.table_name, - ) + tuple(streams) - - @property - def label(self) -> str: - if self._label is None: - if ( - hasattr(self.stream.invocation, "label") - and self.stream.invocation.label is not None - ): - return self.stream.invocation.label - else: - return snake_to_pascal(self.table_name) - return self._label - - def compile(self, tag_keys: Collection[str], packet_keys: Collection[str]) -> None: - # create a table to store the cached packets - - key_fields = "\n".join([f"{k}: varchar(255)" for k in tag_keys]) - output_fields = "\n".join([f"{k}: varchar(255)" for k in packet_keys]) - - class CachedTable(dj.Manual): - source = self # this refers to the outer class instance - definition = f""" - # {self.table_name} outputs - {key_fields} - --- - {output_fields} - """ - - CachedTable.__name__ = snake_to_pascal(self.table_name) - CachedTable = self.schema(CachedTable) - self.table = CachedTable - - def forward(self, *streams: QueryStream) -> QueryStream: - if len(streams) > 0: - raise ValueError("No streams should be passed to TableCachedStreamSource") - - if self.table is None: - # TODO: consider handling this lazily - self.compile(*self.stream.keys()) - - return TableCachedStream(self.table, self.stream) - - -class TableCachedSource(QuerySource): - """ - This class wraps any `Source` and caches the output into a DataJoint table. - Consequently, the table returns a `QueryStream` that can be used by any downstraem - processes that relies on DJ-based streams (e.g. `TableCachedPod`). 
- """ - - def __init__( - self, - source: Source, - schema: Schema, - table_name: str = None, - table_postfix: str = "", - label: Optional[str] = None, - ): - super().__init__(label=label) - self.source = source - self.schema = schema - # if table name is not provided, use the name of the source - self.table_name = ( - table_name - if table_name is not None - else pascal_to_snake(source.__class__.__name__) - ) + (f"_{table_postfix}" if table_postfix else "") - self.table = None - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - self.source, - str(self.schema), - self.table_name, - ) + tuple(streams) - - @property - def label(self) -> str: - if self._label is None: - return self.source.label - return self._label - - def compile(self) -> None: - # create a table to store the cached packets - tag_keys, packet_keys = self.source().keys() - key_fields = "\n".join([f"{k}: varchar(255)" for k in tag_keys]) - output_fields = "\n".join([f"{k}: varchar(255)" for k in packet_keys]) - - class CachedTable(dj.Manual): - source = self # this refers to the outer class instance - definition = f""" - # {self.table_name} outputs - {key_fields} - --- - {output_fields} - """ - - def populate( - self, batch_size: int = 10, use_skip_duplicates: bool = False - ) -> int: - return sum( - 1 - for _ in self.source( - batch_size=batch_size, - use_skip_duplicates=use_skip_duplicates, - ) - ) - - CachedTable.__name__ = snake_to_pascal(self.table_name) - CachedTable = self.schema(CachedTable) - self.table = CachedTable - - def forward( - self, - *streams: QueryStream, - batch_size: int = 10, - use_skip_duplicates: bool = False, - ) -> QueryStream: - if len(streams) > 0: - raise ValueError("No streams should be passed to TableCachedSource") - - if self.table is None: - self.compile() - return TableCachedStream( - self.table, - self.source(), - batch_size=batch_size, - use_skip_duplicates=use_skip_duplicates, - ) - - -class MergedQuerySource(QuerySource): - """ - A source that represents multiple merged query. 
- """ - - def __init__( - self, - *streams: QueryStream, - schema: Schema, - table_name: str = None, - table_postfix: str = "", - label: Optional[str] = None, - lazy_build: bool = True, - ) -> None: - super().__init__(label=label) - self.streams = streams - self.schema = schema - self.table = None - if table_name is None: - table_name = self.label if self.label is not None else "MergedData" - - self.table_name = pascal_to_snake(table_name) + ( - f"_{table_postfix}" if table_postfix else "" - ) - if not lazy_build: - self.compile() - - @property - def label(self) -> str: - if self._label is None: - return "_".join([stream.label for stream in self.streams]) - return self._label - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - self.streams, - ) + tuple(streams) - - def forward(self, *streams: SyncStream) -> QueryStream: - if len(streams) > 0: - logger.warning( - "Handling multiple streams in forward is not implemented yet in " - "MergedQuerySource and this will be silently ignored" - ) - if self.table is None: - self.compile() - - return TableStream(self.table) - - def compile(self) -> None: - part_tag_keys = [] - part_packet_keys = [] - for stream in self.streams: - tag_key, packet_key = stream.keys() - part_tag_keys.append(tag_key) - part_packet_keys.append(packet_key) - - # find common keys among all tags and use that as primary key - common_tag_keys = common_elements(*part_tag_keys) - common_packet_keys = common_elements(*part_packet_keys) - - use_uuid = True - if all([len(k) == len(common_tag_keys) for k in part_tag_keys]): - # if all tags have the same number of keys, it is not necessary - # to include an additional UUID - use_uuid = False - - # create a table to store the cached packets - key_fields = "\n".join([f"{k}: varchar(255)" for k in common_tag_keys]) - output_fields = "\n".join([f"{k}: varchar(255)" for k in common_packet_keys]) - table_field = f"{self.table_name}_part" - uuid_field = f"{self.table_name}_uuid" if use_uuid else "" - table_entry = f"{table_field}: varchar(255)" - uuid_entry = f"{uuid_field}: uuid" if use_uuid else "" - - class MergedTable(dj.Manual): - source = self - definition = f""" - # {self.table_name} inputs - {key_fields} - {table_entry} - {uuid_entry} - --- - {output_fields} - """ - - for stream in self.streams: - if not isinstance(stream, QueryStream): - raise ValueError( - f"Stream {stream} is not a QueryStream. " - "Please use a QueryStream as input." 
- ) - part_table = make_part_table( - stream, - common_tag_keys, - common_packet_keys, - table_field, - uuid_field, - ) - setattr(MergedTable, snake_to_pascal(stream.label), part_table) - - MergedTable.__name__ = snake_to_pascal(self.table_name) - MergedTable = self.schema(MergedTable) - self.table = MergedTable - - # class CachedTable(dj.Manual): - # source = self # this refers to the outer class instance - # definition = f""" - # # {self.table_name} outputs - # {key_fields} - # --- - # {output_fields} - # """ - - # def populate( - # self, batch_size: int = 10, use_skip_duplicates: bool = False - # ) -> int: - # return sum( - # 1 - # for _ in self.operation( - # batch_size=batch_size, - # use_skip_duplicates=use_skip_duplicates, - # ) - # ) - - # CachedTable.__name__ = snake_to_pascal(self.table_name) - # CachedTable = self.schema(CachedTable) - # self.table = CachedTable - - -def make_part_table( - stream: QueryStream, - common_tag_keys, - common_packet_keys, - table_field, - uuid_field, -) -> type[dj.Part]: - upstreams = "\n".join( - f"-> self.upstream_tables[{i}]" for i in range(len(stream.upstream_tables)) - ) - - tag_keys, packet_keys = stream.keys() - - extra_packet_keys = [k for k in packet_keys if k not in common_packet_keys] - - extra_output_fields = "\n".join([f"{k}: varchar(255)" for k in extra_packet_keys]) - - class PartTable(dj.Part, dj.Computed): - upstream_tables = stream.upstream_tables - definition = f""" - -> master - --- - {upstreams} - {extra_output_fields} - """ - - @property - def key_source(self): - return stream.query - - def make(self, key): - content = (stream.query & key).fetch1() - content[table_field] = self.__class__.__name__ - if uuid_field: - content[uuid_field] = hash_to_uuid(key) - self.master.insert1(content, ignore_extra_fields=True) - self.insert1(content, ignore_extra_fields=True) - - PartTable.__name__ = snake_to_pascal(stream.label) - return PartTable diff --git a/src/orcapod/dj/stream.py b/src/orcapod/dj/stream.py deleted file mode 100644 index e8e7195..0000000 --- a/src/orcapod/dj/stream.py +++ /dev/null @@ -1,155 +0,0 @@ -import copy -import logging -from typing import Any, Collection, Union - -from datajoint.expression import QueryExpression -from datajoint.table import Table - -from orcapod.core.streams import SyncStream - -logger = logging.getLogger(__name__) - - -def query_without_restriction(query: QueryExpression) -> QueryExpression: - """ - Make a new QueryExpression, copying all attributes except for the restriction - """ - new_query = copy.copy(query) - new_query._restriction = None - new_query._restriction_attributes = None - - return new_query - - -class QueryStream(SyncStream): - """ - DataJoint query-based data stream. Critically, `QueryStream` is backed - by a DataJoint query, and iterating over it will yield packets equivalent to - iterating over the query results. 
- """ - - def __init__( - self, query: QueryExpression, upstream_tables: Collection[Table] - ) -> None: - super().__init__() - self.query = query - # remove the restriction from the query - self.upstream_tables = [ - query_without_restriction(table) for table in upstream_tables - ] - - def __iter__(self): - for row in self.query.fetch(as_dict=True): - tag = {k: row[k] for k in self.query.primary_key} - packet = {k: row[k] for k in row if k not in self.query.primary_key} - yield tag, packet - - def proj(self, *args, **kwargs) -> "QueryStream": - """ - Project the query stream and return a new query stream - """ - from .mapper import ProjectQuery - - return ProjectQuery(*args, **kwargs)(self) - - def __and__(self, other: Any) -> "QueryStream": - """ - Restrict the query stream by `other` and return a new query stream - """ - # lazy load to avoid circular import - from .source import TableSource - from .mapper import RestrictQuery - - if isinstance(other, TableSource): - other = other.table - elif isinstance(other, QueryStream): - other = other.query - else: - raise ValueError(f"Object of type {type(other)} is not supported.") - return RestrictQuery(other)(self) - - def __mul__(self, other: Any) -> Union["QueryStream", SyncStream]: - from .mapper import JoinQuery - - if isinstance(other, QueryStream): - return JoinQuery()(self, other) - else: - # fallback to default handling that returns a SyncStream - return super().__mul__(other) - - def __repr__(self): - return self.query.__repr__() - - def preview(self, limit=None, width=None): - return self.query.preview(limit=limit, width=width) - - def _repr_html_(self): - """:return: HTML to display table in Jupyter notebook.""" - return self.query._repr_html_() - - def identity_structure(self, *streams) -> Any: - return (self.__class__.__name__, self.query.make_sql()) - - -class TableStream(QueryStream): - """ - DataJoint table-based data stream - """ - - def __init__(self, table: Table) -> None: - if isinstance(table, type): - table = table() - super().__init__(table, [table]) - self.table = table - - -class TableCachedStream(TableStream): - """ - A stream that caches the output from another stream into a DJ table - and then returns the content. Effectively act as a TableStream as - all returned packets can be found in the table. 
- """ - - def __init__( - self, - table: Table, - stream: SyncStream, - batch_size: int = 1, - use_skip_duplicates=False, - ) -> None: - if isinstance(table, type): - table = table() - super().__init__(table) - self.stream = stream - self.batch_size = batch_size - self.use_skip_duplicates = use_skip_duplicates - - def __iter__(self): - logger.info( - f"Caching stream into table {self.table.table_name} with batch size {self.batch_size}, use_skip_duplicates={self.use_skip_duplicates}" - ) - batch = [] - batch_count = 0 - for tag, packet in self.stream: - # cache the packet into the table - if self.use_skip_duplicates or not self.table & tag: - # if use_skip_duplicates is True, skip duplicates - # if not, insert the packet into the table - batch.append(tag | packet) - batch_count += 1 - - # if batch_size is <= 0, will accumulate all packets into the batch - # and insert at the very end - if self.batch_size > 0 and batch_count >= self.batch_size: - self.table.insert(batch, skip_duplicates=self.use_skip_duplicates) - logger.debug( - f"Inserted batch of size {len(batch)} into table {self.table.table_name}" - ) - batch = [] - batch_count = 0 - yield tag, packet - if batch: - logger.debug( - f"Inserted the final batch of size {len(batch)} into table {self.table.table_name}" - ) - self.table.insert(batch, skip_duplicates=self.use_skip_duplicates) diff --git a/src/orcapod/dj/tracker.py b/src/orcapod/dj/tracker.py deleted file mode 100644 index 3276ba9..0000000 --- a/src/orcapod/dj/tracker.py +++ /dev/null @@ -1,155 +0,0 @@ -import sys -from collections import defaultdict -from types import ModuleType -from typing import Any, Collection, Optional, Tuple - -import networkx as nx -from datajoint import Schema - -from orcapod.core.base import Kernel, Source -from orcapod.core.operators import Operator, Merge -from orcapod.core.pod import FunctionPod -from orcapod.core.tracker import GraphTracker - -from .mapper import convert_to_query_mapper -from .operation import QueryOperation -from .pod import TableCachedPod -from .source import MergedQuerySource, TableCachedSource -from .stream import QueryStream - - -def convert_to_query_operation( - operation: Kernel, - schema: Schema, - table_name: str = None, - table_postfix: str = "", - upstreams: Optional[Collection[QueryStream]] = None, -) -> Tuple[QueryOperation, bool]: - """ - Convert a generic operation to an equivalent, DataJoint specific operation - """ - if upstreams is None: - upstreams = [] - - if isinstance(operation, QueryOperation): - return operation, False - - if isinstance(operation, Source) and len(upstreams) == 0: - return ( - TableCachedSource( - operation, - schema=schema, - table_name=table_name, - table_postfix=table_postfix, - ), - True, - ) - - if isinstance(operation, FunctionPod): - return ( - TableCachedPod( - operation, - schema=schema, - table_name=table_name, - table_postfix=table_postfix, - streams=upstreams, - ), - True, - ) - - if isinstance(operation, Merge): - return ( - MergedQuerySource( - *upstreams, - schema=schema, - table_name=table_name, - table_postfix=table_postfix, - ), - True, - ) - - if isinstance(operation, Operator): - return convert_to_query_mapper(operation), True - - # operation conversion is not supported, raise an error - raise ValueError(f"Unsupported operation for DJ conversion: {operation}") - - -class QueryTracker(GraphTracker): - """ - Query-specific tracker that tracks the invocations of operations - and their associated streams. 
- """ - - def __init__(self) -> None: - super().__init__() - self._converted_graph = None - - def generate_tables( - self, schema: Schema, module_name="pipeline" - ) -> Tuple[Any, ModuleType, ModuleType]: - G = self.generate_graph() - - # create a new module and add the tables to it - table_module = ModuleType(module_name) - table_module.__name__ = module_name + "_tables" - op_module = ModuleType(module_name) - op_module.__name__ = module_name + "_op" - - desired_labels_lut = defaultdict(list) - node_lut = {} - edge_lut = {} - for invocation in nx.topological_sort(G): - streams = [edge_lut.get(stream, stream) for stream in invocation.streams] - new_node, converted = convert_to_query_operation( - invocation.kernel, - schema, - table_name=None, - table_postfix=invocation.content_hash_int(), - upstreams=streams, - ) - - node_lut[invocation] = new_node - desired_labels_lut[new_node.label].append(new_node) - - if converted: - output_stream = new_node(*streams) - for edge in G.out_edges(invocation): - edge_lut[G.edges[edge]["stream"]] = output_stream - - # construct labels for the oprations - node_label_lut = {} - for label, nodes in desired_labels_lut.items(): - if len(nodes) > 1: - for idx, node in enumerate(nodes): - node_label_lut[node] = f"{label}Id{idx}" - else: - node_label_lut[nodes[0]] = label - # generate the new converted computation graph - G_dj = nx.DiGraph() - for invocation in G: - G_dj.add_node(node_lut[invocation]) - - for edge in G.edges: - stream = G.edges[edge]["stream"] - G_dj.add_edge( - node_lut[edge[0]], - node_lut[edge[1]], - stream=edge_lut.get(stream, stream), - ) - - for op in G_dj: - if hasattr(op, "table"): - op.__module__ = str(op_module) - op.__name__ = node_label_lut[op] - setattr(op_module, node_label_lut[op], op) - - table = op.table - table.__module__ = str(table_module) - table_name = node_label_lut[op] - setattr(table_module, table_name, table) - - setattr(table_module, "schema", schema) - sys.modules[module_name] = table_module - - return G_dj, op_module, table_module From 83189f9e0a97292bdfb0030ca96ec79af887180b Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 5 Aug 2025 08:00:08 +0000 Subject: [PATCH 175/224] fix: add proper handling of complex return type to table conversion --- .../01_introduction_to_orcapod.ipynb | 97 +- .../02_parallel_execution_on_ray.ipynb | 123 +- pyproject.toml | 7 +- src/orcapod/data/kernels.py | 2 +- src/orcapod/data/streams.py | 15 +- .../semantic_types/universal_converter.py | 6 +- uv.lock | 1454 ++++++++--------- 7 files changed, 905 insertions(+), 799 deletions(-) diff --git a/notebooks/tutorials/01_introduction_to_orcapod.ipynb b/notebooks/tutorials/01_introduction_to_orcapod.ipynb index 1938f8f..9feecdb 100644 --- a/notebooks/tutorials/01_introduction_to_orcapod.ipynb +++ b/notebooks/tutorials/01_introduction_to_orcapod.ipynb @@ -1589,12 +1589,29 @@ "outputs": [ { "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
idstats
i64list[struct[2]]
0[{"sum",11}, {"difference",-9}, {"product",10}]
1[{"sum",22}, {"difference",-18}, {"product",40}]
2[{"sum",33}, {"difference",-27}, {"product",90}]
3[{"sum",44}, {"difference",-36}, {"product",160}]
4[{"sum",55}, {"difference",-45}, {"product",250}]
" + ], "text/plain": [ - "[({'id': 0}, {'stats': {'sum': 11, 'difference': -9, 'product': 10}}),\n", - " ({'id': 1}, {'stats': {'sum': 22, 'difference': -18, 'product': 40}}),\n", - " ({'id': 2}, {'stats': {'sum': 33, 'difference': -27, 'product': 90}}),\n", - " ({'id': 3}, {'stats': {'sum': 44, 'difference': -36, 'product': 160}}),\n", - " ({'id': 4}, {'stats': {'sum': 55, 'difference': -45, 'product': 250}})]" + "shape: (5, 2)\n", + "┌─────┬─────────────────────────────────┐\n", + "│ id ┆ stats │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ list[struct[2]] │\n", + "╞═════╪═════════════════════════════════╡\n", + "│ 0 ┆ [{\"sum\",11}, {\"difference\",-9}… │\n", + "│ 1 ┆ [{\"sum\",22}, {\"difference\",-18… │\n", + "│ 2 ┆ [{\"sum\",33}, {\"difference\",-27… │\n", + "│ 3 ┆ [{\"sum\",44}, {\"difference\",-36… │\n", + "│ 4 ┆ [{\"sum\",55}, {\"difference\",-45… │\n", + "└─────┴─────────────────────────────────┘" ] }, "execution_count": 51, @@ -1603,7 +1620,7 @@ } ], "source": [ - "stats_output.flow()" + "stats_output.as_df()" ] }, { @@ -1614,17 +1631,29 @@ "outputs": [ { "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
idmessage
i64str
0"Hi! The sum was 11, the differ…
1"Hi! The sum was 22, the differ…
2"Hi! The sum was 33, the differ…
3"Hi! The sum was 44, the differ…
4"Hi! The sum was 55, the differ…
" + ], "text/plain": [ - "[({'id': 0},\n", - " {'message': 'Hi! The sum was 11, the difference was -9, and the product was 10.'}),\n", - " ({'id': 1},\n", - " {'message': 'Hi! The sum was 22, the difference was -18, and the product was 40.'}),\n", - " ({'id': 2},\n", - " {'message': 'Hi! The sum was 33, the difference was -27, and the product was 90.'}),\n", - " ({'id': 3},\n", - " {'message': 'Hi! The sum was 44, the difference was -36, and the product was 160.'}),\n", - " ({'id': 4},\n", - " {'message': 'Hi! The sum was 55, the difference was -45, and the product was 250.'})]" + "shape: (5, 2)\n", + "┌─────┬─────────────────────────────────┐\n", + "│ id ┆ message │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═════╪═════════════════════════════════╡\n", + "│ 0 ┆ Hi! The sum was 11, the differ… │\n", + "│ 1 ┆ Hi! The sum was 22, the differ… │\n", + "│ 2 ┆ Hi! The sum was 33, the differ… │\n", + "│ 3 ┆ Hi! The sum was 44, the differ… │\n", + "│ 4 ┆ Hi! The sum was 55, the differ… │\n", + "└─────┴─────────────────────────────────┘" ] }, "execution_count": 52, @@ -1633,7 +1662,7 @@ } ], "source": [ - "messages.flow()" + "messages.as_df()" ] }, { @@ -1702,21 +1731,7 @@ "execution_count": 55, "id": "e132fc93", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:combine_results(sum: int, product: int)-> is acting as a source!\n" - ] - } - ], + "outputs": [], "source": [ "# now defien the pipeline\n", "with pipeline:\n", @@ -1899,21 +1914,7 @@ "execution_count": 61, "id": "3bad8332", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:multiply_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:add_numbers(a: int, b: int)-> is acting as a source!\n", - "Kernel PodNode:FunctionPod:combine_results(sum: int, product: int)-> is acting as a source!\n" - ] - } - ], + "outputs": [], "source": [ "# now defien the pipeline\n", "with pipeline2:\n", @@ -2075,7 +2076,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.12" + "version": "3.13.3" } }, "nbformat": 4, diff --git a/notebooks/tutorials/02_parallel_execution_on_ray.ipynb b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb index b7067c9..692e121 100644 --- a/notebooks/tutorials/02_parallel_execution_on_ray.ipynb +++ b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "0c2dfaec", "metadata": {}, "outputs": [], @@ -14,7 +14,7 @@ }, 
{ "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "3196df7e", "metadata": {}, "outputs": [], @@ -30,7 +30,36 @@ "execution_count": null, "id": "9e1f338b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-08-05 07:53:27,293\tINFO client_builder.py:242 -- Passing the following kwargs to ray.init() on the server: log_to_driver\n", + "SIGTERM handler is not set because current thread is not the main thread.\n", + "2025-08-05 07:53:30,607\tWARNING utils.py:1280 -- Python patch version mismatch: The cluster was started with:\n", + " Ray: 2.48.0\n", + " Python: 3.13.5\n", + "This process on Ray Client was started with:\n", + " Ray: 2.48.0\n", + " Python: 3.13.3\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m(autoscaler +33s)\u001b[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", + "\u001b[36m(autoscaler +33s)\u001b[0m Adding 5 node(s) of type workergroup.\n", + "\u001b[36m(autoscaler +33s)\u001b[0m Resized to 6 CPUs, 5 GPUs.\n", + "\u001b[36m(autoscaler +33s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*19. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +39s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*9. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +54s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*12. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +1m20s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*8. Add suitable node types to this cluster to resolve this issue.\n" + ] + } + ], "source": [ "ray_engine = NativeRayAsyncEngine(\n", " \"ray://raycluster-op-test-kuberay-head-svc.ray.svc.cluster.local:10001\"\n", @@ -39,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "63c692df", "metadata": {}, "outputs": [], @@ -66,10 +95,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "506a3a1e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (30, 2)
idsum
i64i64
00
15
210
315
420
25125
26130
27135
28140
29145
" + ], + "text/plain": [ + "shape: (30, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 0 │\n", + "│ 1 ┆ 5 │\n", + "│ 2 ┆ 10 │\n", + "│ 3 ┆ 15 │\n", + "│ 4 ┆ 20 │\n", + "│ … ┆ … │\n", + "│ 25 ┆ 125 │\n", + "│ 26 ┆ 130 │\n", + "│ 27 ┆ 135 │\n", + "│ 28 ┆ 140 │\n", + "│ 29 ┆ 145 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "result_stream1 = add_numbers(input_stream)\n", "result_stream1.run()\n", @@ -86,10 +153,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "e83fddac", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (30, 2)
idsum
i64i64
00
15
210
315
420
25125
26130
27135
28140
29145
" + ], + "text/plain": [ + "shape: (30, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 0 │\n", + "│ 1 ┆ 5 │\n", + "│ 2 ┆ 10 │\n", + "│ 3 ┆ 15 │\n", + "│ 4 ┆ 20 │\n", + "│ … ┆ … │\n", + "│ 25 ┆ 125 │\n", + "│ 26 ┆ 130 │\n", + "│ 27 ┆ 135 │\n", + "│ 28 ┆ 140 │\n", + "│ 29 ┆ 145 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "result_stream2 = add_numbers(input_stream)\n", "result_stream2.run(ray_engine)\n", @@ -121,7 +226,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.12" + "version": "3.13.3" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 6af8bd7..10d436b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,9 +16,10 @@ dependencies = [ "pyarrow>=20.0.0", "polars>=1.31.0", "beartype>=0.21.0", + ] readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.12.0" license = { text = "MIT License" } classifiers = [ "Programming Language :: Python :: 3", @@ -31,6 +32,8 @@ Homepage = "https://github.com/walkerlab/orcapod-python" [project.optional-dependencies] redis = ["redis>=6.2.0"] +ray = ["ray[default]>=2.48.0", "ipywidgets>=8.1.7"] +all = ["orcapod[redis]", "orcapod[ray]"] [tool.setuptools.packages.find] @@ -44,11 +47,13 @@ dev = [ "deltalake>=1.0.2", "httpie>=3.2.4", "ipykernel>=6.29.5", + "ipywidgets>=8.1.7", "jsonschema>=4.25.0", "pyarrow-stubs>=20.0.0.20250716", "pyiceberg>=0.9.1", "pytest>=8.3.5", "pytest-cov>=6.1.1", + "ray>=2.48.0", "redis>=6.2.0", "ruff>=0.11.11", "tqdm>=4.67.1", diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index ccea0af..a70c91c 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -122,7 +122,7 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An if len(streams) == 0: # If no streams are provided, then this is a source kernel # and we simply return None as the identity structure. - print(f"Kernel {self} is acting as a source!") + logger.debug(f"Kernel {self} is acting as a source!") return None streams = self.pre_kernel_processing(*streams) self.validate_inputs(*streams) diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 1caad9d..6ea31e0 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -888,13 +888,15 @@ def as_table( dict_patcket[k] = str(v) all_packets.append(dict_patcket) - # FIXME: this skips the semantic version conversion and thus is not - # fully correct! + # TODO: re-verify the implemetation of this conversion + converter = self._data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) all_tags_as_tables: pa.Table = pa.Table.from_pylist( all_tags, schema=tag_schema ) all_packets_as_tables: pa.Table = pa.Table.from_pylist( - all_packets, schema=packet_schema + struct_packets, schema=packet_schema ) self._cached_output_table = arrow_utils.hstack_tables( @@ -1226,13 +1228,14 @@ def as_table( dict_patcket[k] = str(v) all_packets.append(dict_patcket) - # FIXME: this skips the semantic version conversion and thus is not - # fully correct! 
+ converter = self._data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) all_tags_as_tables: pa.Table = pa.Table.from_pylist( all_tags, schema=tag_schema ) all_packets_as_tables: pa.Table = pa.Table.from_pylist( - all_packets, schema=packet_schema + struct_packets, schema=packet_schema ) self._cached_output_table = arrow_utils.hstack_tables( diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index 1ec711c..412a1a9 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -171,7 +171,7 @@ def arrow_schema_to_python_schema(self, arrow_schema: pa.Schema) -> dict[str, ty return python_schema - def python_dict_to_struct_dict( + def python_dicts_to_struct_dicts( self, python_dicts: list[dict[str, Any]], python_schema: dict[str, type] | None = None, @@ -242,14 +242,14 @@ def python_dicts_to_arrow_table( if python_schema is None: python_schema = infer_schema_from_pylist_data(python_dicts) - struct_dict = self.python_dict_to_struct_dict( + struct_dicts = self.python_dicts_to_struct_dicts( python_dicts, python_schema=python_schema ) # Convert to Arrow schema arrow_schema = self.python_schema_to_arrow_schema(python_schema) - return pa.Table.from_pylist(struct_dict, schema=arrow_schema) + return pa.Table.from_pylist(struct_dicts, schema=arrow_schema) def arrow_table_to_python_dicts( self, arrow_table: pa.Table diff --git a/uv.lock b/uv.lock index 637997f..632bbc5 100644 --- a/uv.lock +++ b/uv.lock @@ -1,10 +1,94 @@ version = 1 revision = 2 -requires-python = ">=3.10" +requires-python = ">=3.12.0" resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version < '3.11'", + "python_full_version >= '3.13'", + "python_full_version < '3.13'", +] + +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.12.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/e7/d92a237d8802ca88483906c388f7c201bbe96cd80a165ffd0ac2f6a8d59f/aiohttp-3.12.15.tar.gz", hash = "sha256:4fc61385e9c98d72fcdf47e6dd81833f47b2f77c114c29cd64a361be57a763a2", size = 7823716, upload-time = "2025-07-29T05:52:32.215Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/97/77cb2450d9b35f517d6cf506256bf4f5bda3f93a66b4ad64ba7fc917899c/aiohttp-3.12.15-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:802d3868f5776e28f7bf69d349c26fc0efadb81676d0afa88ed00d98a26340b7", size = 702333, upload-time = "2025-07-29T05:50:46.507Z" }, + { url = 
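Relating back to the universal_converter change above (before the uv.lock section): the renamed `python_dicts_to_struct_dicts` and `python_dicts_to_arrow_table` tie into this commit's "proper handling of complex return type to table conversion", where dict-valued pod outputs (like the `stats` column in the first tutorial notebook) need a struct-aware step before building an Arrow table. The underlying pyarrow behaviour can be illustrated without orcapod's converter; the schema and values below are illustrative only and do not claim to reflect how the project's converter actually maps dicts.

    import pyarrow as pa

    # Rows whose "stats" value is a nested dict, i.e. a "complex" return type.
    rows = [
        {"id": 0, "stats": {"sum": 11, "difference": -9, "product": 10}},
        {"id": 1, "stats": {"sum": 22, "difference": -18, "product": 40}},
    ]

    # An explicit struct-typed schema lets pa.Table.from_pylist keep the nested
    # values as a struct column instead of flattening or rejecting them.
    schema = pa.schema([
        ("id", pa.int64()),
        ("stats", pa.struct([
            ("sum", pa.int64()),
            ("difference", pa.int64()),
            ("product", pa.int64()),
        ])),
    ])

    table = pa.Table.from_pylist(rows, schema=schema)
    print(table.column("stats").type)  # struct<sum: int64, difference: int64, product: int64>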
"https://files.pythonhosted.org/packages/83/6d/0544e6b08b748682c30b9f65640d006e51f90763b41d7c546693bc22900d/aiohttp-3.12.15-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2800614cd560287be05e33a679638e586a2d7401f4ddf99e304d98878c29444", size = 476948, upload-time = "2025-07-29T05:50:48.067Z" }, + { url = "https://files.pythonhosted.org/packages/3a/1d/c8c40e611e5094330284b1aea8a4b02ca0858f8458614fa35754cab42b9c/aiohttp-3.12.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8466151554b593909d30a0a125d638b4e5f3836e5aecde85b66b80ded1cb5b0d", size = 469787, upload-time = "2025-07-29T05:50:49.669Z" }, + { url = "https://files.pythonhosted.org/packages/38/7d/b76438e70319796bfff717f325d97ce2e9310f752a267bfdf5192ac6082b/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e5a495cb1be69dae4b08f35a6c4579c539e9b5706f606632102c0f855bcba7c", size = 1716590, upload-time = "2025-07-29T05:50:51.368Z" }, + { url = "https://files.pythonhosted.org/packages/79/b1/60370d70cdf8b269ee1444b390cbd72ce514f0d1cd1a715821c784d272c9/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6404dfc8cdde35c69aaa489bb3542fb86ef215fc70277c892be8af540e5e21c0", size = 1699241, upload-time = "2025-07-29T05:50:53.628Z" }, + { url = "https://files.pythonhosted.org/packages/a3/2b/4968a7b8792437ebc12186db31523f541943e99bda8f30335c482bea6879/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ead1c00f8521a5c9070fcb88f02967b1d8a0544e6d85c253f6968b785e1a2ab", size = 1754335, upload-time = "2025-07-29T05:50:55.394Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c1/49524ed553f9a0bec1a11fac09e790f49ff669bcd14164f9fab608831c4d/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6990ef617f14450bc6b34941dba4f12d5613cbf4e33805932f853fbd1cf18bfb", size = 1800491, upload-time = "2025-07-29T05:50:57.202Z" }, + { url = "https://files.pythonhosted.org/packages/de/5e/3bf5acea47a96a28c121b167f5ef659cf71208b19e52a88cdfa5c37f1fcc/aiohttp-3.12.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd736ed420f4db2b8148b52b46b88ed038d0354255f9a73196b7bbce3ea97545", size = 1719929, upload-time = "2025-07-29T05:50:59.192Z" }, + { url = "https://files.pythonhosted.org/packages/39/94/8ae30b806835bcd1cba799ba35347dee6961a11bd507db634516210e91d8/aiohttp-3.12.15-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c5092ce14361a73086b90c6efb3948ffa5be2f5b6fbcf52e8d8c8b8848bb97c", size = 1635733, upload-time = "2025-07-29T05:51:01.394Z" }, + { url = "https://files.pythonhosted.org/packages/7a/46/06cdef71dd03acd9da7f51ab3a9107318aee12ad38d273f654e4f981583a/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aaa2234bb60c4dbf82893e934d8ee8dea30446f0647e024074237a56a08c01bd", size = 1696790, upload-time = "2025-07-29T05:51:03.657Z" }, + { url = "https://files.pythonhosted.org/packages/02/90/6b4cfaaf92ed98d0ec4d173e78b99b4b1a7551250be8937d9d67ecb356b4/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6d86a2fbdd14192e2f234a92d3b494dd4457e683ba07e5905a0b3ee25389ac9f", size = 1718245, upload-time = "2025-07-29T05:51:05.911Z" }, + { url = "https://files.pythonhosted.org/packages/2e/e6/2593751670fa06f080a846f37f112cbe6f873ba510d070136a6ed46117c6/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:a041e7e2612041a6ddf1c6a33b883be6a421247c7afd47e885969ee4cc58bd8d", size = 1658899, upload-time = "2025-07-29T05:51:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/8f/28/c15bacbdb8b8eb5bf39b10680d129ea7410b859e379b03190f02fa104ffd/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:5015082477abeafad7203757ae44299a610e89ee82a1503e3d4184e6bafdd519", size = 1738459, upload-time = "2025-07-29T05:51:09.56Z" }, + { url = "https://files.pythonhosted.org/packages/00/de/c269cbc4faa01fb10f143b1670633a8ddd5b2e1ffd0548f7aa49cb5c70e2/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:56822ff5ddfd1b745534e658faba944012346184fbfe732e0d6134b744516eea", size = 1766434, upload-time = "2025-07-29T05:51:11.423Z" }, + { url = "https://files.pythonhosted.org/packages/52/b0/4ff3abd81aa7d929b27d2e1403722a65fc87b763e3a97b3a2a494bfc63bc/aiohttp-3.12.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b2acbbfff69019d9014508c4ba0401822e8bae5a5fdc3b6814285b71231b60f3", size = 1726045, upload-time = "2025-07-29T05:51:13.689Z" }, + { url = "https://files.pythonhosted.org/packages/71/16/949225a6a2dd6efcbd855fbd90cf476052e648fb011aa538e3b15b89a57a/aiohttp-3.12.15-cp312-cp312-win32.whl", hash = "sha256:d849b0901b50f2185874b9a232f38e26b9b3d4810095a7572eacea939132d4e1", size = 423591, upload-time = "2025-07-29T05:51:15.452Z" }, + { url = "https://files.pythonhosted.org/packages/2b/d8/fa65d2a349fe938b76d309db1a56a75c4fb8cc7b17a398b698488a939903/aiohttp-3.12.15-cp312-cp312-win_amd64.whl", hash = "sha256:b390ef5f62bb508a9d67cb3bba9b8356e23b3996da7062f1a57ce1a79d2b3d34", size = 450266, upload-time = "2025-07-29T05:51:17.239Z" }, + { url = "https://files.pythonhosted.org/packages/f2/33/918091abcf102e39d15aba2476ad9e7bd35ddb190dcdd43a854000d3da0d/aiohttp-3.12.15-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9f922ffd05034d439dde1c77a20461cf4a1b0831e6caa26151fe7aa8aaebc315", size = 696741, upload-time = "2025-07-29T05:51:19.021Z" }, + { url = "https://files.pythonhosted.org/packages/b5/2a/7495a81e39a998e400f3ecdd44a62107254803d1681d9189be5c2e4530cd/aiohttp-3.12.15-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2ee8a8ac39ce45f3e55663891d4b1d15598c157b4d494a4613e704c8b43112cd", size = 474407, upload-time = "2025-07-29T05:51:21.165Z" }, + { url = "https://files.pythonhosted.org/packages/49/fc/a9576ab4be2dcbd0f73ee8675d16c707cfc12d5ee80ccf4015ba543480c9/aiohttp-3.12.15-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3eae49032c29d356b94eee45a3f39fdf4b0814b397638c2f718e96cfadf4c4e4", size = 466703, upload-time = "2025-07-29T05:51:22.948Z" }, + { url = "https://files.pythonhosted.org/packages/09/2f/d4bcc8448cf536b2b54eed48f19682031ad182faa3a3fee54ebe5b156387/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b97752ff12cc12f46a9b20327104448042fce5c33a624f88c18f66f9368091c7", size = 1705532, upload-time = "2025-07-29T05:51:25.211Z" }, + { url = "https://files.pythonhosted.org/packages/f1/f3/59406396083f8b489261e3c011aa8aee9df360a96ac8fa5c2e7e1b8f0466/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:894261472691d6fe76ebb7fcf2e5870a2ac284c7406ddc95823c8598a1390f0d", size = 1686794, upload-time = "2025-07-29T05:51:27.145Z" }, + { url = "https://files.pythonhosted.org/packages/dc/71/164d194993a8d114ee5656c3b7ae9c12ceee7040d076bf7b32fb98a8c5c6/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:5fa5d9eb82ce98959fc1031c28198b431b4d9396894f385cb63f1e2f3f20ca6b", size = 1738865, upload-time = "2025-07-29T05:51:29.366Z" }, + { url = "https://files.pythonhosted.org/packages/1c/00/d198461b699188a93ead39cb458554d9f0f69879b95078dce416d3209b54/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0fa751efb11a541f57db59c1dd821bec09031e01452b2b6217319b3a1f34f3d", size = 1788238, upload-time = "2025-07-29T05:51:31.285Z" }, + { url = "https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5346b93e62ab51ee2a9d68e8f73c7cf96ffb73568a23e683f931e52450e4148d", size = 1710566, upload-time = "2025-07-29T05:51:33.219Z" }, + { url = "https://files.pythonhosted.org/packages/59/e4/16a8eac9df39b48ae102ec030fa9f726d3570732e46ba0c592aeeb507b93/aiohttp-3.12.15-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:049ec0360f939cd164ecbfd2873eaa432613d5e77d6b04535e3d1fbae5a9e645", size = 1624270, upload-time = "2025-07-29T05:51:35.195Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f8/cd84dee7b6ace0740908fd0af170f9fab50c2a41ccbc3806aabcb1050141/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b52dcf013b57464b6d1e51b627adfd69a8053e84b7103a7cd49c030f9ca44461", size = 1677294, upload-time = "2025-07-29T05:51:37.215Z" }, + { url = "https://files.pythonhosted.org/packages/ce/42/d0f1f85e50d401eccd12bf85c46ba84f947a84839c8a1c2c5f6e8ab1eb50/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9b2af240143dd2765e0fb661fd0361a1b469cab235039ea57663cda087250ea9", size = 1708958, upload-time = "2025-07-29T05:51:39.328Z" }, + { url = "https://files.pythonhosted.org/packages/d5/6b/f6fa6c5790fb602538483aa5a1b86fcbad66244997e5230d88f9412ef24c/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ac77f709a2cde2cc71257ab2d8c74dd157c67a0558a0d2799d5d571b4c63d44d", size = 1651553, upload-time = "2025-07-29T05:51:41.356Z" }, + { url = "https://files.pythonhosted.org/packages/04/36/a6d36ad545fa12e61d11d1932eef273928b0495e6a576eb2af04297fdd3c/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:47f6b962246f0a774fbd3b6b7be25d59b06fdb2f164cf2513097998fc6a29693", size = 1727688, upload-time = "2025-07-29T05:51:43.452Z" }, + { url = "https://files.pythonhosted.org/packages/aa/c8/f195e5e06608a97a4e52c5d41c7927301bf757a8e8bb5bbf8cef6c314961/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:760fb7db442f284996e39cf9915a94492e1896baac44f06ae551974907922b64", size = 1761157, upload-time = "2025-07-29T05:51:45.643Z" }, + { url = "https://files.pythonhosted.org/packages/05/6a/ea199e61b67f25ba688d3ce93f63b49b0a4e3b3d380f03971b4646412fc6/aiohttp-3.12.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad702e57dc385cae679c39d318def49aef754455f237499d5b99bea4ef582e51", size = 1710050, upload-time = "2025-07-29T05:51:48.203Z" }, + { url = "https://files.pythonhosted.org/packages/b4/2e/ffeb7f6256b33635c29dbed29a22a723ff2dd7401fff42ea60cf2060abfb/aiohttp-3.12.15-cp313-cp313-win32.whl", hash = "sha256:f813c3e9032331024de2eb2e32a88d86afb69291fbc37a3a3ae81cc9917fb3d0", size = 422647, upload-time = "2025-07-29T05:51:50.718Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8e/78ee35774201f38d5e1ba079c9958f7629b1fd079459aea9467441dbfbf5/aiohttp-3.12.15-cp313-cp313-win_amd64.whl", hash = 
"sha256:1a649001580bdb37c6fdb1bebbd7e3bc688e8ec2b5c6f52edbb664662b17dc84", size = 449067, upload-time = "2025-07-29T05:51:52.549Z" }, +] + +[[package]] +name = "aiohttp-cors" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/d89e846a5444b3d5eb8985a6ddb0daef3774928e1bfbce8e84ec97b0ffa7/aiohttp_cors-0.8.1.tar.gz", hash = "sha256:ccacf9cb84b64939ea15f859a146af1f662a6b1d68175754a07315e305fb1403", size = 38626, upload-time = "2025-03-31T14:16:20.048Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl", hash = "sha256:3180cf304c5c712d626b9162b195b1db7ddf976a2a25172b35bb2448b890a80d", size = 25231, upload-time = "2025-03-31T14:16:18.478Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] [[package]] @@ -29,36 +113,7 @@ wheels = [ name = "arro3-core" version = "0.5.1" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.12'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/eb/2a166478dfc951958bf4cd33891bfa346ab9c53c3a87f5ffe99dbe981577/arro3_core-0.5.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a85c4d78fb4a3e3b216b01e44ac16121a06e80169555cd0f7b8fcf038a6c14b3", size = 2448695, upload-time = "2025-05-31T23:17:55.526Z" }, - { url = "https://files.pythonhosted.org/packages/1c/c0/2b1719accd4cc2f81bd36ad79a16750a63e0d7a5132e43115b586d52e21d/arro3_core-0.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2dd7a3b78c8936407e4eebbbe3134410d1be0c51fb697a8b8a5c8118690190a9", size = 2155415, upload-time = "2025-05-31T23:17:57.992Z" }, - { url = "https://files.pythonhosted.org/packages/9c/dc/6bcb859c4a83fff95b2ccc906c027db1f0396610a57bafc90bd933dcce83/arro3_core-0.5.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fae76973505d64cebf26a30c78d37a5a1fe012b3d6a6c682fea33ebd1dfc4d99", size = 2594341, upload-time = "2025-05-31T23:18:01.536Z" }, - { url = "https://files.pythonhosted.org/packages/6f/48/109cf08ca7532636d4c356a421e1620e7b01fb6882e12b6afbfa4b933c38/arro3_core-0.5.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c65d5ffb89cf9bcc62bb7f64beb049877ca03b504841ffc3cab6e853a13637c", size = 2637344, upload-time = "2025-05-31T23:18:05.307Z" }, - { url = "https://files.pythonhosted.org/packages/b2/4b/5a9dfc81195c8fcf2f99f9cb8f3d8c23ca9da541964d44e409a01ab06d3b/arro3_core-0.5.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ad9e3e69a0888bf1cd2c9cf2e7d60787ac9bf3b9508937bfb6ff55aba9a6b56b", size = 2878497, 
upload-time = "2025-05-31T23:18:08.803Z" }, - { url = "https://files.pythonhosted.org/packages/f1/26/a2a0685f3648afb20bbe4920cee6dc8a29b9942fa8c0190f6a8fc3ad4ef3/arro3_core-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36484d31141691c48d6e48f1c387d3b19fe5a814ffcde26b2ac04ebe68f81c76", size = 2540359, upload-time = "2025-05-31T23:18:12.092Z" }, - { url = "https://files.pythonhosted.org/packages/64/40/6b22f0f094d905d610945a9b7d4662d5f143f6638c37e89fb888443aee64/arro3_core-0.5.1-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:78942ee33f55758ce0138b30377185f2d93b9221fb5c239075b56159b3e3fb5b", size = 2289699, upload-time = "2025-05-31T23:18:15.895Z" }, - { url = "https://files.pythonhosted.org/packages/cf/46/eebe9826aeca54bc04bf8ed6e9506134dcf1d02a960482b0164a98d51800/arro3_core-0.5.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:063b9ffe896dbd01649eb46d04b55f19eb6bc7fa505d1781d64308e13a2510cc", size = 2723968, upload-time = "2025-05-31T23:18:19.597Z" }, - { url = "https://files.pythonhosted.org/packages/90/bc/5c2361010692854efb47211e15eeeb9cef02eb037dbb95b9dd68b4554ba7/arro3_core-0.5.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a818344b61b59c09c3f6550c03e5b01678535160b35d38eaa5f988667df69187", size = 2435669, upload-time = "2025-05-31T23:18:22.649Z" }, - { url = "https://files.pythonhosted.org/packages/39/0d/1fef7dcca81696bdea0e79971155b114fb3fb204f177eed25a07f856f57a/arro3_core-0.5.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:60fa11fe05f3b48e7b37c1d4f12d94ef678514d2e908033ac30d10d04b1bd957", size = 2869358, upload-time = "2025-05-31T23:18:27.008Z" }, - { url = "https://files.pythonhosted.org/packages/cc/02/1196e7f795658a5ef7c4b5811fe84845025e7baf391d05be36e763336156/arro3_core-0.5.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:8d1ce524ca27598154f84cf980c6fa4baf0c1379584de2e922e88905dfb939dd", size = 2797000, upload-time = "2025-05-31T23:18:30.694Z" }, - { url = "https://files.pythonhosted.org/packages/1c/ea/31bc0bc32ad3e22a937c866b685e0b1123f4747dabc23703531d7626a5d2/arro3_core-0.5.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2546df75769b60bbd74aa7a169cd538e909aabf2200a99edfdda542e560b5c11", size = 2709346, upload-time = "2025-05-31T23:18:34.125Z" }, - { url = "https://files.pythonhosted.org/packages/fb/2c/6bfb3a4cd26b1fed099767e124063f0b4fe5e7f0cab0160004ba5900cad0/arro3_core-0.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:d89350dc36f58c9c0fb941fbcd46e2e00f76f3438844ef3dce2419ce64631739", size = 2611596, upload-time = "2025-05-31T23:18:37.826Z" }, - { url = "https://files.pythonhosted.org/packages/86/45/c2540f04330f52f431a0ca0824c15d86fc38dd8b3f2af027a41a90ea91e7/arro3_core-0.5.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e6c43f2f59cd43044663969031c4ef29aab76247b5bda74800187a8b9bda3b9e", size = 2448953, upload-time = "2025-05-31T23:18:40.996Z" }, - { url = "https://files.pythonhosted.org/packages/4b/8f/9fc60dcc201f72f3d9d2ca86b61ff374eb640b58a65660b8de2ac53654d6/arro3_core-0.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:006214e68df6f66bbd1712989258cac2b307085627962348749cc2802b843f25", size = 2155535, upload-time = "2025-05-31T23:18:44.178Z" }, - { url = "https://files.pythonhosted.org/packages/5e/9e/4e6a3c41b52b08b8f34f7830df2a0e499d3e4ab43c6d45984e2af13fa780/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:be77d366d43025599a5a0c520cced43c181f750cf6bcc174a72a97a7338f9e37", size = 2594752, upload-time = 
"2025-05-31T23:18:47.586Z" }, - { url = "https://files.pythonhosted.org/packages/bd/77/94d8099c8fbfe3489ec92da76f65844b276f82b18d9cb6a547a717bd38cc/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca7cba980b3d2e3552dd06da67c8c298d970bd9430ed661a2316c893bfca3873", size = 2637291, upload-time = "2025-05-31T23:18:50.539Z" }, - { url = "https://files.pythonhosted.org/packages/ff/22/050c75161bcbe2e6b3ff5f8de11f760a376523fa905f4787b09bab65a4b5/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1866f014ca091049692d81601760b65fdad7b779d9c73698f709cd6ee4e8b5c3", size = 2869405, upload-time = "2025-05-31T23:18:53.73Z" }, - { url = "https://files.pythonhosted.org/packages/ac/88/87a3293db47dab5b23ecd910532f02c56d15f52920fc5d72404935126345/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e1433e98b4385f2565c59d69c1bbb4f18da7d2693d2d9712e219e020e8f9025", size = 2540544, upload-time = "2025-05-31T23:18:56.954Z" }, - { url = "https://files.pythonhosted.org/packages/71/e8/f85ce3be71c967b24e96c3af589ae3390548ab0d9fd69d5ed535225fd620/arro3_core-0.5.1-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:afba61734d4fc772ddf26888c299f94157e530a080835a981431a37398168fd6", size = 2289505, upload-time = "2025-05-31T23:19:00.354Z" }, - { url = "https://files.pythonhosted.org/packages/9c/4b/432eb5135fbcc5d8770ad7bd4193545e97588caf1f690d4f724bbb927632/arro3_core-0.5.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:69b8885acf0e94b54adb6f060b4c41ee138d171b37a6356b690bece6b911565d", size = 2724357, upload-time = "2025-05-31T23:19:04.201Z" }, - { url = "https://files.pythonhosted.org/packages/83/91/056ab3166c5e562eab66477f573aff02bb4b92ba0de8affffd1bace6e50c/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2fe8f6d43697719abf822f9f02df7547681669c092b41bcee2b3a689f99e1588", size = 2435801, upload-time = "2025-05-31T23:19:07.617Z" }, - { url = "https://files.pythonhosted.org/packages/5a/5f/b7a6a2106ba508e20f9788bb53c71b56211defd3729c7bcfe6ec09d36fd1/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a2aa298a78135d993e9257f110ac140e008d7bdc11eb23d8bc1c02493afbdf5a", size = 2869804, upload-time = "2025-05-31T23:19:11.059Z" }, - { url = "https://files.pythonhosted.org/packages/f6/e3/d95fbff21b27b06faa892c65621ea429391d0bfb926cdeb557db2d452a33/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:508688336dfc4667f8571115924857ae4629044ebeb4d3dedeabc33e287b2bca", size = 2797201, upload-time = "2025-05-31T23:19:14.674Z" }, - { url = "https://files.pythonhosted.org/packages/45/07/7ab65b01110e9459db2f2d37972826aa31a367ee98e95c7664f0eb13963d/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:31463bda8a942f5ae0e4a06c8fbe2424367b820d93f6f3b82c6f775f9a966780", size = 2709306, upload-time = "2025-05-31T23:19:17.913Z" }, - { url = "https://files.pythonhosted.org/packages/a7/15/0bebe279425bb70bd0a712dd45dcb4418deb9f32057ff5b9efd7947a65d3/arro3_core-0.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:0223d878f5f23c17600cab853cecce963c38fe365efa5f157f016706314018f1", size = 2611539, upload-time = "2025-05-31T23:19:21.358Z" }, +wheels = [ { url = "https://files.pythonhosted.org/packages/c9/9c/af3c6127548630beaa319746770265b2fb996bb3e6dba8d16f78910bc070/arro3_core-0.5.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:afccbaf951a84d6eafb4384692ea557ad06887c6db8825e9417647f805735936", size = 2438592, upload-time = 
"2025-05-31T23:19:24.494Z" }, { url = "https://files.pythonhosted.org/packages/d8/50/057c93a846bbc5e5e55a976ea4fc00255332f64e5f9b1abfc218bb184f48/arro3_core-0.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:37325ec2f47a4dce40fa871935000708b545f3981c8e2bde7d7a031f2e098865", size = 2145488, upload-time = "2025-05-31T23:19:27.886Z" }, { url = "https://files.pythonhosted.org/packages/1f/8c/cbb785ecb9a0df254f5a761fc5ac7c8c5a6f93b0116e994ecf2797984f80/arro3_core-0.5.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:63ac803d46127d8c01bc4ffbb5911f10e51c063c9bcc76ba0258378bda683383", size = 2592145, upload-time = "2025-05-31T23:19:31.499Z" }, @@ -85,18 +140,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/7a/af901793fa426e8b86194654820c3612001b165b25f3bd7adde8d9e7bef4/arro3_core-0.5.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f8c14b496f93906125baccef75703f0ea1c91608c201296bc21a1e916e5eb42c", size = 2792693, upload-time = "2025-05-31T23:20:47.071Z" }, { url = "https://files.pythonhosted.org/packages/2e/97/651eb8358d64d2bf5353db3d31ae6cb06529a07d2be699aa6a27434c6811/arro3_core-0.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:40e9db9564f22286310c5304884468b98d4eeb628f71c22f27d527e4219ae247", size = 2706150, upload-time = "2025-05-31T23:20:51.012Z" }, { url = "https://files.pythonhosted.org/packages/f3/af/0d591453490941e7cd2524ccac0398824eabafa745d0a25a758b1de2e361/arro3_core-0.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:bb0b13975c5394cb6a9887495aaf06cad8993893f99911c8aa2b827cd55dd6a8", size = 2612300, upload-time = "2025-05-31T23:20:54.249Z" }, - { url = "https://files.pythonhosted.org/packages/74/5c/c7135425c172d7fbc94c47ab48d46431d52df5b5f888bc140f7b2b710037/arro3_core-0.5.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f778d41f63cadb1b9e6bce3446e2758db271bc9b81878617232729053c7520fc", size = 2447436, upload-time = "2025-05-31T23:21:45.231Z" }, - { url = "https://files.pythonhosted.org/packages/5e/2c/b7f94e70101abaafa78a36445fdeadfc4461535a0acf55cd9c20bdc7e2b3/arro3_core-0.5.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:4df0b8594518bec2602d1b289dbabf22b9b0b63affc90ff0d6107990208c5e67", size = 2154852, upload-time = "2025-05-31T23:21:48.708Z" }, - { url = "https://files.pythonhosted.org/packages/7d/05/020b1cc1449755d35ba91d814d047fa20d18b9fb577a9fe9b87c72a1a217/arro3_core-0.5.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1165f2973c7472e564cef53505cc55852991733f00991b42d011d0f76c4c4c4a", size = 2593644, upload-time = "2025-05-31T23:21:52.812Z" }, - { url = "https://files.pythonhosted.org/packages/f8/92/5160d6adaad3a1db443ff5409353ec4df82d2068a8ed9b8e738036325c3c/arro3_core-0.5.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:abafcb7f2fe892700e5821b5158c98fad772a2c7412c9b35e4174ed919e24ed4", size = 2635380, upload-time = "2025-05-31T23:21:56.684Z" }, - { url = "https://files.pythonhosted.org/packages/53/21/4aa439cc2b597e0de66aef03f0f509afe206547b0794ce0ba004134fe716/arro3_core-0.5.1-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93120f0de07f2cac507219e74ef25a95a10fc5ec5a2d51c2fd117db2929220df", size = 2867549, upload-time = "2025-05-31T23:22:00.93Z" }, - { url = "https://files.pythonhosted.org/packages/5c/01/1338fff3c27366cd9ffc444c96aa74bfea3dc8ebb9dea4ee33346d74bccd/arro3_core-0.5.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:657896fc1e50e39d6ad9261f15cca103f26a7213dc30a6300dbcec6c5cc5a72d", size = 2539421, upload-time = "2025-05-31T23:22:04.631Z" }, - { url = "https://files.pythonhosted.org/packages/bc/78/3660ee1f71074a5195ae96c0cc9b58464c588705a5a93cc26b4f23a51cac/arro3_core-0.5.1-pp310-pypy310_pp73-manylinux_2_24_aarch64.whl", hash = "sha256:a8a6df4af193898b6e09902ba76a9c0c8699efaf91b3cff87d5f49cc97e04544", size = 2289147, upload-time = "2025-05-31T23:22:08.53Z" }, - { url = "https://files.pythonhosted.org/packages/85/cb/37d165bdb1633249e2e987d52d00308f790b4d24121b2a0a2a7817e1f8bb/arro3_core-0.5.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0d3faf92e45b479cd5556370db1c8895f153d9f59c52fdbd85af751838c8b218", size = 2723645, upload-time = "2025-05-31T23:22:12.604Z" }, - { url = "https://files.pythonhosted.org/packages/40/18/3edf9949cc09f9545e06abe8fd2b92eff71e86f8927062a3ab8cb1320377/arro3_core-0.5.1-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:07e358e8ea9c7b8fa38af79d0942b1e3174123541584370e9020394101d4198a", size = 2434306, upload-time = "2025-05-31T23:22:16.431Z" }, - { url = "https://files.pythonhosted.org/packages/87/2e/98a874f5f3b3baf911d8b87151b6654ac161ccb09ebb2cf621ac4da2edc3/arro3_core-0.5.1-pp310-pypy310_pp73-musllinux_1_2_armv7l.whl", hash = "sha256:70cfb884cfb465f4c0143a38e172a6de4a904afe884bd6773a89c4c6659c41e7", size = 2868790, upload-time = "2025-05-31T23:22:20.536Z" }, - { url = "https://files.pythonhosted.org/packages/1a/4c/0f7aa37d3374a82fa084517ac353378fc397685422ee1eac8884044cd487/arro3_core-0.5.1-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:9f47326af6c10cec993cee9cbcc4e554dc0c06269e2ba6f83c68235ae13ee98c", size = 2796671, upload-time = "2025-05-31T23:22:24.62Z" }, - { url = "https://files.pythonhosted.org/packages/0d/90/1c0714e2c1af68229e8d49c53a861399654b26152a19306927e48740dbd1/arro3_core-0.5.1-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5832859f53eb82c67bda2a655d466fb8520d096166df4ee9b0b17df748cbacb1", size = 2708649, upload-time = "2025-05-31T23:22:28.719Z" }, ] [[package]] @@ -108,15 +151,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" }, ] -[[package]] -name = "async-timeout" -version = "5.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274, upload-time = "2024-11-06T16:41:39.6Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" }, -] - [[package]] name = "attrs" version = "25.3.0" @@ -162,30 +196,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/90/07/f44ca684db4e4f08a3fdc6eeb9a0d15dc6883efc7b8c90357fdbf74e186c/cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14", size = 182191, upload-time = "2024-09-04T20:43:30.027Z" }, - { url = "https://files.pythonhosted.org/packages/08/fd/cc2fedbd887223f9f5d170c96e57cbf655df9831a6546c1727ae13fa977a/cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67", size = 178592, upload-time = "2024-09-04T20:43:32.108Z" }, - { url = "https://files.pythonhosted.org/packages/de/cc/4635c320081c78d6ffc2cab0a76025b691a91204f4aa317d568ff9280a2d/cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382", size = 426024, upload-time = "2024-09-04T20:43:34.186Z" }, - { url = "https://files.pythonhosted.org/packages/b6/7b/3b2b250f3aab91abe5f8a51ada1b717935fdaec53f790ad4100fe2ec64d1/cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702", size = 448188, upload-time = "2024-09-04T20:43:36.286Z" }, - { url = "https://files.pythonhosted.org/packages/d3/48/1b9283ebbf0ec065148d8de05d647a986c5f22586b18120020452fff8f5d/cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3", size = 455571, upload-time = "2024-09-04T20:43:38.586Z" }, - { url = "https://files.pythonhosted.org/packages/40/87/3b8452525437b40f39ca7ff70276679772ee7e8b394934ff60e63b7b090c/cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6", size = 436687, upload-time = "2024-09-04T20:43:40.084Z" }, - { url = "https://files.pythonhosted.org/packages/8d/fb/4da72871d177d63649ac449aec2e8a29efe0274035880c7af59101ca2232/cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17", size = 446211, upload-time = "2024-09-04T20:43:41.526Z" }, - { url = "https://files.pythonhosted.org/packages/ab/a0/62f00bcb411332106c02b663b26f3545a9ef136f80d5df746c05878f8c4b/cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8", size = 461325, upload-time = "2024-09-04T20:43:43.117Z" }, - { url = "https://files.pythonhosted.org/packages/36/83/76127035ed2e7e27b0787604d99da630ac3123bfb02d8e80c633f218a11d/cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e", size = 438784, upload-time = "2024-09-04T20:43:45.256Z" }, - { url = "https://files.pythonhosted.org/packages/21/81/a6cd025db2f08ac88b901b745c163d884641909641f9b826e8cb87645942/cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be", size = 461564, upload-time = "2024-09-04T20:43:46.779Z" }, - { url = "https://files.pythonhosted.org/packages/f8/fe/4d41c2f200c4a457933dbd98d3cf4e911870877bd94d9656cc0fcb390681/cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c", size = 171804, upload-time = "2024-09-04T20:43:48.186Z" }, - { url = 
"https://files.pythonhosted.org/packages/d1/b6/0b0f5ab93b0df4acc49cae758c81fe4e5ef26c3ae2e10cc69249dfd8b3ab/cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15", size = 181299, upload-time = "2024-09-04T20:43:49.812Z" }, - { url = "https://files.pythonhosted.org/packages/6b/f4/927e3a8899e52a27fa57a48607ff7dc91a9ebe97399b357b85a0c7892e00/cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401", size = 182264, upload-time = "2024-09-04T20:43:51.124Z" }, - { url = "https://files.pythonhosted.org/packages/6c/f5/6c3a8efe5f503175aaddcbea6ad0d2c96dad6f5abb205750d1b3df44ef29/cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf", size = 178651, upload-time = "2024-09-04T20:43:52.872Z" }, - { url = "https://files.pythonhosted.org/packages/94/dd/a3f0118e688d1b1a57553da23b16bdade96d2f9bcda4d32e7d2838047ff7/cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4", size = 445259, upload-time = "2024-09-04T20:43:56.123Z" }, - { url = "https://files.pythonhosted.org/packages/2e/ea/70ce63780f096e16ce8588efe039d3c4f91deb1dc01e9c73a287939c79a6/cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41", size = 469200, upload-time = "2024-09-04T20:43:57.891Z" }, - { url = "https://files.pythonhosted.org/packages/1c/a0/a4fa9f4f781bda074c3ddd57a572b060fa0df7655d2a4247bbe277200146/cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1", size = 477235, upload-time = "2024-09-04T20:44:00.18Z" }, - { url = "https://files.pythonhosted.org/packages/62/12/ce8710b5b8affbcdd5c6e367217c242524ad17a02fe5beec3ee339f69f85/cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6", size = 459721, upload-time = "2024-09-04T20:44:01.585Z" }, - { url = "https://files.pythonhosted.org/packages/ff/6b/d45873c5e0242196f042d555526f92aa9e0c32355a1be1ff8c27f077fd37/cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d", size = 467242, upload-time = "2024-09-04T20:44:03.467Z" }, - { url = "https://files.pythonhosted.org/packages/1a/52/d9a0e523a572fbccf2955f5abe883cfa8bcc570d7faeee06336fbd50c9fc/cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6", size = 477999, upload-time = "2024-09-04T20:44:05.023Z" }, - { url = "https://files.pythonhosted.org/packages/44/74/f2a2460684a1a2d00ca799ad880d54652841a780c4c97b87754f660c7603/cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f", size = 454242, upload-time = "2024-09-04T20:44:06.444Z" }, - { url = "https://files.pythonhosted.org/packages/f8/4a/34599cac7dfcd888ff54e801afe06a19c17787dfd94495ab0c8d35fe99fb/cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b", size = 478604, upload-time = "2024-09-04T20:44:08.206Z" }, - { url = 
"https://files.pythonhosted.org/packages/34/33/e1b8a1ba29025adbdcda5fb3a36f94c03d771c1b7b12f726ff7fef2ebe36/cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655", size = 171727, upload-time = "2024-09-04T20:44:09.481Z" }, - { url = "https://files.pythonhosted.org/packages/3d/97/50228be003bb2802627d28ec0627837ac0bf35c90cf769812056f235b2d1/cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0", size = 181400, upload-time = "2024-09-04T20:44:10.873Z" }, { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178, upload-time = "2024-09-04T20:44:12.232Z" }, { url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840, upload-time = "2024-09-04T20:44:13.739Z" }, { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803, upload-time = "2024-09-04T20:44:15.231Z" }, @@ -216,32 +226,6 @@ version = "3.4.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload-time = "2025-05-02T08:34:42.01Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/95/28/9901804da60055b406e1a1c5ba7aac1276fb77f1dde635aabfc7fd84b8ab/charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941", size = 201818, upload-time = "2025-05-02T08:31:46.725Z" }, - { url = "https://files.pythonhosted.org/packages/d9/9b/892a8c8af9110935e5adcbb06d9c6fe741b6bb02608c6513983048ba1a18/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd", size = 144649, upload-time = "2025-05-02T08:31:48.889Z" }, - { url = "https://files.pythonhosted.org/packages/7b/a5/4179abd063ff6414223575e008593861d62abfc22455b5d1a44995b7c101/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6", size = 155045, upload-time = "2025-05-02T08:31:50.757Z" }, - { url = "https://files.pythonhosted.org/packages/3b/95/bc08c7dfeddd26b4be8c8287b9bb055716f31077c8b0ea1cd09553794665/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d", size = 147356, upload-time = "2025-05-02T08:31:52.634Z" }, - { url = "https://files.pythonhosted.org/packages/a8/2d/7a5b635aa65284bf3eab7653e8b4151ab420ecbae918d3e359d1947b4d61/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86", size = 149471, upload-time = "2025-05-02T08:31:56.207Z" }, - { url = "https://files.pythonhosted.org/packages/ae/38/51fc6ac74251fd331a8cfdb7ec57beba8c23fd5493f1050f71c87ef77ed0/charset_normalizer-3.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c", size = 151317, upload-time = "2025-05-02T08:31:57.613Z" }, - { url = "https://files.pythonhosted.org/packages/b7/17/edee1e32215ee6e9e46c3e482645b46575a44a2d72c7dfd49e49f60ce6bf/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0", size = 146368, upload-time = "2025-05-02T08:31:59.468Z" }, - { url = "https://files.pythonhosted.org/packages/26/2c/ea3e66f2b5f21fd00b2825c94cafb8c326ea6240cd80a91eb09e4a285830/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef", size = 154491, upload-time = "2025-05-02T08:32:01.219Z" }, - { url = "https://files.pythonhosted.org/packages/52/47/7be7fa972422ad062e909fd62460d45c3ef4c141805b7078dbab15904ff7/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6", size = 157695, upload-time = "2025-05-02T08:32:03.045Z" }, - { url = "https://files.pythonhosted.org/packages/2f/42/9f02c194da282b2b340f28e5fb60762de1151387a36842a92b533685c61e/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366", size = 154849, upload-time = "2025-05-02T08:32:04.651Z" }, - { url = "https://files.pythonhosted.org/packages/67/44/89cacd6628f31fb0b63201a618049be4be2a7435a31b55b5eb1c3674547a/charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db", size = 150091, upload-time = "2025-05-02T08:32:06.719Z" }, - { url = "https://files.pythonhosted.org/packages/1f/79/4b8da9f712bc079c0f16b6d67b099b0b8d808c2292c937f267d816ec5ecc/charset_normalizer-3.4.2-cp310-cp310-win32.whl", hash = "sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a", size = 98445, upload-time = "2025-05-02T08:32:08.66Z" }, - { url = "https://files.pythonhosted.org/packages/7d/d7/96970afb4fb66497a40761cdf7bd4f6fca0fc7bafde3a84f836c1f57a926/charset_normalizer-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509", size = 105782, upload-time = "2025-05-02T08:32:10.46Z" }, - { url = "https://files.pythonhosted.org/packages/05/85/4c40d00dcc6284a1c1ad5de5e0996b06f39d8232f1031cd23c2f5c07ee86/charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2", size = 198794, upload-time = "2025-05-02T08:32:11.945Z" }, - { url = "https://files.pythonhosted.org/packages/41/d9/7a6c0b9db952598e97e93cbdfcb91bacd89b9b88c7c983250a77c008703c/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645", size = 142846, upload-time = "2025-05-02T08:32:13.946Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/82/a37989cda2ace7e37f36c1a8ed16c58cf48965a79c2142713244bf945c89/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd", size = 153350, upload-time = "2025-05-02T08:32:15.873Z" }, - { url = "https://files.pythonhosted.org/packages/df/68/a576b31b694d07b53807269d05ec3f6f1093e9545e8607121995ba7a8313/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8", size = 145657, upload-time = "2025-05-02T08:32:17.283Z" }, - { url = "https://files.pythonhosted.org/packages/92/9b/ad67f03d74554bed3aefd56fe836e1623a50780f7c998d00ca128924a499/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f", size = 147260, upload-time = "2025-05-02T08:32:18.807Z" }, - { url = "https://files.pythonhosted.org/packages/a6/e6/8aebae25e328160b20e31a7e9929b1578bbdc7f42e66f46595a432f8539e/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7", size = 149164, upload-time = "2025-05-02T08:32:20.333Z" }, - { url = "https://files.pythonhosted.org/packages/8b/f2/b3c2f07dbcc248805f10e67a0262c93308cfa149a4cd3d1fe01f593e5fd2/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9", size = 144571, upload-time = "2025-05-02T08:32:21.86Z" }, - { url = "https://files.pythonhosted.org/packages/60/5b/c3f3a94bc345bc211622ea59b4bed9ae63c00920e2e8f11824aa5708e8b7/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544", size = 151952, upload-time = "2025-05-02T08:32:23.434Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4d/ff460c8b474122334c2fa394a3f99a04cf11c646da895f81402ae54f5c42/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82", size = 155959, upload-time = "2025-05-02T08:32:24.993Z" }, - { url = "https://files.pythonhosted.org/packages/a2/2b/b964c6a2fda88611a1fe3d4c400d39c66a42d6c169c924818c848f922415/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0", size = 153030, upload-time = "2025-05-02T08:32:26.435Z" }, - { url = "https://files.pythonhosted.org/packages/59/2e/d3b9811db26a5ebf444bc0fa4f4be5aa6d76fc6e1c0fd537b16c14e849b6/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5", size = 148015, upload-time = "2025-05-02T08:32:28.376Z" }, - { url = "https://files.pythonhosted.org/packages/90/07/c5fd7c11eafd561bb51220d600a788f1c8d77c5eef37ee49454cc5c35575/charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a", size = 98106, upload-time = "2025-05-02T08:32:30.281Z" }, - { url = "https://files.pythonhosted.org/packages/a8/05/5e33dbef7e2f773d672b6d79f10ec633d4a71cd96db6673625838a4fd532/charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = 
"sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28", size = 105402, upload-time = "2025-05-02T08:32:32.191Z" }, { url = "https://files.pythonhosted.org/packages/d7/a4/37f4d6035c89cac7930395a35cc0f1b872e652eaafb76a6075943754f095/charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7", size = 199936, upload-time = "2025-05-02T08:32:33.712Z" }, { url = "https://files.pythonhosted.org/packages/ee/8a/1a5e33b73e0d9287274f899d967907cd0bf9c343e651755d9307e0dbf2b3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3", size = 143790, upload-time = "2025-05-02T08:32:35.768Z" }, { url = "https://files.pythonhosted.org/packages/66/52/59521f1d8e6ab1482164fa21409c5ef44da3e9f653c13ba71becdd98dec3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a", size = 153924, upload-time = "2025-05-02T08:32:37.284Z" }, @@ -292,6 +276,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "colorful" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0c/0c/d180ebf230b771907f46981023a80f62cf592d49673cc5f8a5993aa67bb6/colorful-0.5.7.tar.gz", hash = "sha256:c5452179b56601c178b03d468a5326cc1fe37d9be81d24d0d6bdab36c4b93ad8", size = 209487, upload-time = "2025-06-30T15:24:03.936Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl", hash = "sha256:495dd3a23151a9568cee8a90fc1174c902ad7ef06655f50b6bddf9e80008da69", size = 201475, upload-time = "2025-06-30T15:24:02.693Z" }, +] + [[package]] name = "comm" version = "0.2.2" @@ -313,26 +309,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130, upload-time = "2025-04-15T17:47:53.79Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/12/a3/da4153ec8fe25d263aa48c1a4cbde7f49b59af86f0b6f7862788c60da737/contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934", size = 268551, upload-time = "2025-04-15T17:34:46.581Z" }, - { url = "https://files.pythonhosted.org/packages/2f/6c/330de89ae1087eb622bfca0177d32a7ece50c3ef07b28002de4757d9d875/contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989", size = 253399, upload-time = "2025-04-15T17:34:51.427Z" }, - { url = "https://files.pythonhosted.org/packages/c1/bd/20c6726b1b7f81a8bee5271bed5c165f0a8e1f572578a9d27e2ccb763cb2/contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d", size 
= 312061, upload-time = "2025-04-15T17:34:55.961Z" }, - { url = "https://files.pythonhosted.org/packages/22/fc/a9665c88f8a2473f823cf1ec601de9e5375050f1958cbb356cdf06ef1ab6/contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9", size = 351956, upload-time = "2025-04-15T17:35:00.992Z" }, - { url = "https://files.pythonhosted.org/packages/25/eb/9f0a0238f305ad8fb7ef42481020d6e20cf15e46be99a1fcf939546a177e/contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512", size = 320872, upload-time = "2025-04-15T17:35:06.177Z" }, - { url = "https://files.pythonhosted.org/packages/32/5c/1ee32d1c7956923202f00cf8d2a14a62ed7517bdc0ee1e55301227fc273c/contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631", size = 325027, upload-time = "2025-04-15T17:35:11.244Z" }, - { url = "https://files.pythonhosted.org/packages/83/bf/9baed89785ba743ef329c2b07fd0611d12bfecbedbdd3eeecf929d8d3b52/contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f", size = 1306641, upload-time = "2025-04-15T17:35:26.701Z" }, - { url = "https://files.pythonhosted.org/packages/d4/cc/74e5e83d1e35de2d28bd97033426b450bc4fd96e092a1f7a63dc7369b55d/contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2", size = 1374075, upload-time = "2025-04-15T17:35:43.204Z" }, - { url = "https://files.pythonhosted.org/packages/0c/42/17f3b798fd5e033b46a16f8d9fcb39f1aba051307f5ebf441bad1ecf78f8/contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0", size = 177534, upload-time = "2025-04-15T17:35:46.554Z" }, - { url = "https://files.pythonhosted.org/packages/54/ec/5162b8582f2c994721018d0c9ece9dc6ff769d298a8ac6b6a652c307e7df/contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a", size = 221188, upload-time = "2025-04-15T17:35:50.064Z" }, - { url = "https://files.pythonhosted.org/packages/b3/b9/ede788a0b56fc5b071639d06c33cb893f68b1178938f3425debebe2dab78/contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445", size = 269636, upload-time = "2025-04-15T17:35:54.473Z" }, - { url = "https://files.pythonhosted.org/packages/e6/75/3469f011d64b8bbfa04f709bfc23e1dd71be54d05b1b083be9f5b22750d1/contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773", size = 254636, upload-time = "2025-04-15T17:35:58.283Z" }, - { url = "https://files.pythonhosted.org/packages/8d/2f/95adb8dae08ce0ebca4fd8e7ad653159565d9739128b2d5977806656fcd2/contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1", size = 313053, upload-time = "2025-04-15T17:36:03.235Z" }, - { url = "https://files.pythonhosted.org/packages/c3/a6/8ccf97a50f31adfa36917707fe39c9a0cbc24b3bbb58185577f119736cc9/contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43", size = 352985, 
upload-time = "2025-04-15T17:36:08.275Z" }, - { url = "https://files.pythonhosted.org/packages/1d/b6/7925ab9b77386143f39d9c3243fdd101621b4532eb126743201160ffa7e6/contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab", size = 323750, upload-time = "2025-04-15T17:36:13.29Z" }, - { url = "https://files.pythonhosted.org/packages/c2/f3/20c5d1ef4f4748e52d60771b8560cf00b69d5c6368b5c2e9311bcfa2a08b/contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7", size = 326246, upload-time = "2025-04-15T17:36:18.329Z" }, - { url = "https://files.pythonhosted.org/packages/8c/e5/9dae809e7e0b2d9d70c52b3d24cba134dd3dad979eb3e5e71f5df22ed1f5/contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83", size = 1308728, upload-time = "2025-04-15T17:36:33.878Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4a/0058ba34aeea35c0b442ae61a4f4d4ca84d6df8f91309bc2d43bb8dd248f/contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd", size = 1375762, upload-time = "2025-04-15T17:36:51.295Z" }, - { url = "https://files.pythonhosted.org/packages/09/33/7174bdfc8b7767ef2c08ed81244762d93d5c579336fc0b51ca57b33d1b80/contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f", size = 178196, upload-time = "2025-04-15T17:36:55.002Z" }, - { url = "https://files.pythonhosted.org/packages/5e/fe/4029038b4e1c4485cef18e480b0e2cd2d755448bb071eb9977caac80b77b/contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878", size = 222017, upload-time = "2025-04-15T17:36:58.576Z" }, { url = "https://files.pythonhosted.org/packages/34/f7/44785876384eff370c251d58fd65f6ad7f39adce4a093c934d4a67a7c6b6/contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2", size = 271580, upload-time = "2025-04-15T17:37:03.105Z" }, { url = "https://files.pythonhosted.org/packages/93/3b/0004767622a9826ea3d95f0e9d98cd8729015768075d61f9fea8eeca42a8/contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15", size = 255530, upload-time = "2025-04-15T17:37:07.026Z" }, { url = "https://files.pythonhosted.org/packages/e7/bb/7bd49e1f4fa805772d9fd130e0d375554ebc771ed7172f48dfcd4ca61549/contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92", size = 307688, upload-time = "2025-04-15T17:37:11.481Z" }, @@ -363,12 +339,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/73/69dd9a024444489e22d86108e7b913f3528f56cfc312b5c5727a44188471/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd", size = 1372168, upload-time = "2025-04-15T17:44:33.43Z" }, { url = "https://files.pythonhosted.org/packages/0f/1b/96d586ccf1b1a9d2004dd519b25fbf104a11589abfd05484ff12199cca21/contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1", size = 189550, upload-time = "2025-04-15T17:44:37.092Z" }, { url = 
"https://files.pythonhosted.org/packages/b0/e6/6000d0094e8a5e32ad62591c8609e269febb6e4db83a1c75ff8868b42731/contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69", size = 238214, upload-time = "2025-04-15T17:44:40.827Z" }, - { url = "https://files.pythonhosted.org/packages/33/05/b26e3c6ecc05f349ee0013f0bb850a761016d89cec528a98193a48c34033/contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c", size = 265681, upload-time = "2025-04-15T17:44:59.314Z" }, - { url = "https://files.pythonhosted.org/packages/2b/25/ac07d6ad12affa7d1ffed11b77417d0a6308170f44ff20fa1d5aa6333f03/contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16", size = 315101, upload-time = "2025-04-15T17:45:04.165Z" }, - { url = "https://files.pythonhosted.org/packages/8f/4d/5bb3192bbe9d3f27e3061a6a8e7733c9120e203cb8515767d30973f71030/contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad", size = 220599, upload-time = "2025-04-15T17:45:08.456Z" }, - { url = "https://files.pythonhosted.org/packages/ff/c0/91f1215d0d9f9f343e4773ba6c9b89e8c0cc7a64a6263f21139da639d848/contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0", size = 266807, upload-time = "2025-04-15T17:45:15.535Z" }, - { url = "https://files.pythonhosted.org/packages/d4/79/6be7e90c955c0487e7712660d6cead01fa17bff98e0ea275737cc2bc8e71/contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5", size = 318729, upload-time = "2025-04-15T17:45:20.166Z" }, - { url = "https://files.pythonhosted.org/packages/87/68/7f46fb537958e87427d98a4074bcde4b67a70b04900cfc5ce29bc2f556c1/contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5", size = 221791, upload-time = "2025-04-15T17:45:24.794Z" }, ] [[package]] @@ -377,27 +347,6 @@ version = "7.8.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/ba/07/998afa4a0ecdf9b1981ae05415dad2d4e7716e1b1f00abbd91691ac09ac9/coverage-7.8.2.tar.gz", hash = "sha256:a886d531373a1f6ff9fad2a2ba4a045b68467b779ae729ee0b3b10ac20033b27", size = 812759, upload-time = "2025-05-23T11:39:57.856Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/26/6b/7dd06399a5c0b81007e3a6af0395cd60e6a30f959f8d407d3ee04642e896/coverage-7.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bd8ec21e1443fd7a447881332f7ce9d35b8fbd2849e761bb290b584535636b0a", size = 211573, upload-time = "2025-05-23T11:37:47.207Z" }, - { url = "https://files.pythonhosted.org/packages/f0/df/2b24090820a0bac1412955fb1a4dade6bc3b8dcef7b899c277ffaf16916d/coverage-7.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4c26c2396674816deaeae7ded0e2b42c26537280f8fe313335858ffff35019be", size = 212006, upload-time = "2025-05-23T11:37:50.289Z" }, - { url = "https://files.pythonhosted.org/packages/c5/c4/e4e3b998e116625562a872a342419652fa6ca73f464d9faf9f52f1aff427/coverage-7.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1aec326ed237e5880bfe69ad41616d333712c7937bcefc1343145e972938f9b3", size = 
241128, upload-time = "2025-05-23T11:37:52.229Z" }, - { url = "https://files.pythonhosted.org/packages/b1/67/b28904afea3e87a895da850ba587439a61699bf4b73d04d0dfd99bbd33b4/coverage-7.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5e818796f71702d7a13e50c70de2a1924f729228580bcba1607cccf32eea46e6", size = 239026, upload-time = "2025-05-23T11:37:53.846Z" }, - { url = "https://files.pythonhosted.org/packages/8c/0f/47bf7c5630d81bc2cd52b9e13043685dbb7c79372a7f5857279cc442b37c/coverage-7.8.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:546e537d9e24efc765c9c891328f30f826e3e4808e31f5d0f87c4ba12bbd1622", size = 240172, upload-time = "2025-05-23T11:37:55.711Z" }, - { url = "https://files.pythonhosted.org/packages/ba/38/af3eb9d36d85abc881f5aaecf8209383dbe0fa4cac2d804c55d05c51cb04/coverage-7.8.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ab9b09a2349f58e73f8ebc06fac546dd623e23b063e5398343c5270072e3201c", size = 240086, upload-time = "2025-05-23T11:37:57.724Z" }, - { url = "https://files.pythonhosted.org/packages/9e/64/c40c27c2573adeba0fe16faf39a8aa57368a1f2148865d6bb24c67eadb41/coverage-7.8.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:fd51355ab8a372d89fb0e6a31719e825cf8df8b6724bee942fb5b92c3f016ba3", size = 238792, upload-time = "2025-05-23T11:37:59.737Z" }, - { url = "https://files.pythonhosted.org/packages/8e/ab/b7c85146f15457671c1412afca7c25a5696d7625e7158002aa017e2d7e3c/coverage-7.8.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0774df1e093acb6c9e4d58bce7f86656aeed6c132a16e2337692c12786b32404", size = 239096, upload-time = "2025-05-23T11:38:01.693Z" }, - { url = "https://files.pythonhosted.org/packages/d3/50/9446dad1310905fb1dc284d60d4320a5b25d4e3e33f9ea08b8d36e244e23/coverage-7.8.2-cp310-cp310-win32.whl", hash = "sha256:00f2e2f2e37f47e5f54423aeefd6c32a7dbcedc033fcd3928a4f4948e8b96af7", size = 214144, upload-time = "2025-05-23T11:38:03.68Z" }, - { url = "https://files.pythonhosted.org/packages/23/ed/792e66ad7b8b0df757db8d47af0c23659cdb5a65ef7ace8b111cacdbee89/coverage-7.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:145b07bea229821d51811bf15eeab346c236d523838eda395ea969d120d13347", size = 215043, upload-time = "2025-05-23T11:38:05.217Z" }, - { url = "https://files.pythonhosted.org/packages/6a/4d/1ff618ee9f134d0de5cc1661582c21a65e06823f41caf801aadf18811a8e/coverage-7.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b99058eef42e6a8dcd135afb068b3d53aff3921ce699e127602efff9956457a9", size = 211692, upload-time = "2025-05-23T11:38:08.485Z" }, - { url = "https://files.pythonhosted.org/packages/96/fa/c3c1b476de96f2bc7a8ca01a9f1fcb51c01c6b60a9d2c3e66194b2bdb4af/coverage-7.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5feb7f2c3e6ea94d3b877def0270dff0947b8d8c04cfa34a17be0a4dc1836879", size = 212115, upload-time = "2025-05-23T11:38:09.989Z" }, - { url = "https://files.pythonhosted.org/packages/f7/c2/5414c5a1b286c0f3881ae5adb49be1854ac5b7e99011501f81c8c1453065/coverage-7.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:670a13249b957bb9050fab12d86acef7bf8f6a879b9d1a883799276e0d4c674a", size = 244740, upload-time = "2025-05-23T11:38:11.947Z" }, - { url = "https://files.pythonhosted.org/packages/cd/46/1ae01912dfb06a642ef3dd9cf38ed4996fda8fe884dab8952da616f81a2b/coverage-7.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:0bdc8bf760459a4a4187b452213e04d039990211f98644c7292adf1e471162b5", size = 242429, upload-time = "2025-05-23T11:38:13.955Z" }, - { url = "https://files.pythonhosted.org/packages/06/58/38c676aec594bfe2a87c7683942e5a30224791d8df99bcc8439fde140377/coverage-7.8.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07a989c867986c2a75f158f03fdb413128aad29aca9d4dbce5fc755672d96f11", size = 244218, upload-time = "2025-05-23T11:38:15.631Z" }, - { url = "https://files.pythonhosted.org/packages/80/0c/95b1023e881ce45006d9abc250f76c6cdab7134a1c182d9713878dfefcb2/coverage-7.8.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2db10dedeb619a771ef0e2949ccba7b75e33905de959c2643a4607bef2f3fb3a", size = 243865, upload-time = "2025-05-23T11:38:17.622Z" }, - { url = "https://files.pythonhosted.org/packages/57/37/0ae95989285a39e0839c959fe854a3ae46c06610439350d1ab860bf020ac/coverage-7.8.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e6ea7dba4e92926b7b5f0990634b78ea02f208d04af520c73a7c876d5a8d36cb", size = 242038, upload-time = "2025-05-23T11:38:19.966Z" }, - { url = "https://files.pythonhosted.org/packages/4d/82/40e55f7c0eb5e97cc62cbd9d0746fd24e8caf57be5a408b87529416e0c70/coverage-7.8.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ef2f22795a7aca99fc3c84393a55a53dd18ab8c93fb431004e4d8f0774150f54", size = 242567, upload-time = "2025-05-23T11:38:21.912Z" }, - { url = "https://files.pythonhosted.org/packages/f9/35/66a51adc273433a253989f0d9cc7aa6bcdb4855382cf0858200afe578861/coverage-7.8.2-cp311-cp311-win32.whl", hash = "sha256:641988828bc18a6368fe72355df5f1703e44411adbe49bba5644b941ce6f2e3a", size = 214194, upload-time = "2025-05-23T11:38:23.571Z" }, - { url = "https://files.pythonhosted.org/packages/f6/8f/a543121f9f5f150eae092b08428cb4e6b6d2d134152c3357b77659d2a605/coverage-7.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8ab4a51cb39dc1933ba627e0875046d150e88478dbe22ce145a68393e9652975", size = 215109, upload-time = "2025-05-23T11:38:25.137Z" }, - { url = "https://files.pythonhosted.org/packages/77/65/6cc84b68d4f35186463cd7ab1da1169e9abb59870c0f6a57ea6aba95f861/coverage-7.8.2-cp311-cp311-win_arm64.whl", hash = "sha256:8966a821e2083c74d88cca5b7dcccc0a3a888a596a04c0b9668a891de3a0cc53", size = 213521, upload-time = "2025-05-23T11:38:27.123Z" }, { url = "https://files.pythonhosted.org/packages/8d/2a/1da1ada2e3044fcd4a3254fb3576e160b8fe5b36d705c8a31f793423f763/coverage-7.8.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e2f6fe3654468d061942591aef56686131335b7a8325684eda85dacdf311356c", size = 211876, upload-time = "2025-05-23T11:38:29.01Z" }, { url = "https://files.pythonhosted.org/packages/70/e9/3d715ffd5b6b17a8be80cd14a8917a002530a99943cc1939ad5bb2aa74b9/coverage-7.8.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76090fab50610798cc05241bf83b603477c40ee87acd358b66196ab0ca44ffa1", size = 212130, upload-time = "2025-05-23T11:38:30.675Z" }, { url = "https://files.pythonhosted.org/packages/a0/02/fdce62bb3c21649abfd91fbdcf041fb99be0d728ff00f3f9d54d97ed683e/coverage-7.8.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd0a0a5054be160777a7920b731a0570284db5142abaaf81bcbb282b8d99279", size = 246176, upload-time = "2025-05-23T11:38:32.395Z" }, @@ -431,15 +380,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/63/2d624ac7d7ccd4ebbd3c6a9eba9d7fc4491a1226071360d59dd84928ccb2/coverage-7.8.2-cp313-cp313t-win32.whl", hash = 
"sha256:3f5673888d3676d0a745c3d0e16da338c5eea300cb1f4ada9c872981265e76d8", size = 215109, upload-time = "2025-05-23T11:39:26.722Z" }, { url = "https://files.pythonhosted.org/packages/22/5e/7053b71462e970e869111c1853afd642212568a350eba796deefdfbd0770/coverage-7.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:2c08b05ee8d7861e45dc5a2cc4195c8c66dca5ac613144eb6ebeaff2d502e73d", size = 216268, upload-time = "2025-05-23T11:39:28.429Z" }, { url = "https://files.pythonhosted.org/packages/07/69/afa41aa34147655543dbe96994f8a246daf94b361ccf5edfd5df62ce066a/coverage-7.8.2-cp313-cp313t-win_arm64.whl", hash = "sha256:1e1448bb72b387755e1ff3ef1268a06617afd94188164960dba8d0245a46004b", size = 214071, upload-time = "2025-05-23T11:39:30.55Z" }, - { url = "https://files.pythonhosted.org/packages/69/2f/572b29496d8234e4a7773200dd835a0d32d9e171f2d974f3fe04a9dbc271/coverage-7.8.2-pp39.pp310.pp311-none-any.whl", hash = "sha256:ec455eedf3ba0bbdf8f5a570012617eb305c63cb9f03428d39bf544cb2b94837", size = 203636, upload-time = "2025-05-23T11:39:52.002Z" }, { url = "https://files.pythonhosted.org/packages/a0/1a/0b9c32220ad694d66062f571cc5cedfa9997b64a591e8a500bb63de1bd40/coverage-7.8.2-py3-none-any.whl", hash = "sha256:726f32ee3713f7359696331a18daf0c3b3a70bb0ae71141b9d3c52be7c595e32", size = 203623, upload-time = "2025-05-23T11:39:53.846Z" }, ] -[package.optional-dependencies] -toml = [ - { name = "tomli", marker = "python_full_version <= '3.11'" }, -] - [[package]] name = "cycler" version = "0.12.1" @@ -455,14 +398,6 @@ version = "1.8.14" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/bd/75/087fe07d40f490a78782ff3b0a30e3968936854105487decdb33446d4b0e/debugpy-1.8.14.tar.gz", hash = "sha256:7cd287184318416850aa8b60ac90105837bb1e59531898c07569d197d2ed5322", size = 1641444, upload-time = "2025-04-10T19:46:10.981Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fc/df/156df75a41aaebd97cee9d3870fe68f8001b6c1c4ca023e221cfce69bece/debugpy-1.8.14-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:93fee753097e85623cab1c0e6a68c76308cd9f13ffdf44127e6fab4fbf024339", size = 2076510, upload-time = "2025-04-10T19:46:13.315Z" }, - { url = "https://files.pythonhosted.org/packages/69/cd/4fc391607bca0996db5f3658762106e3d2427beaef9bfd363fd370a3c054/debugpy-1.8.14-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d937d93ae4fa51cdc94d3e865f535f185d5f9748efb41d0d49e33bf3365bd79", size = 3559614, upload-time = "2025-04-10T19:46:14.647Z" }, - { url = "https://files.pythonhosted.org/packages/1a/42/4e6d2b9d63e002db79edfd0cb5656f1c403958915e0e73ab3e9220012eec/debugpy-1.8.14-cp310-cp310-win32.whl", hash = "sha256:c442f20577b38cc7a9aafecffe1094f78f07fb8423c3dddb384e6b8f49fd2987", size = 5208588, upload-time = "2025-04-10T19:46:16.233Z" }, - { url = "https://files.pythonhosted.org/packages/97/b1/cc9e4e5faadc9d00df1a64a3c2d5c5f4b9df28196c39ada06361c5141f89/debugpy-1.8.14-cp310-cp310-win_amd64.whl", hash = "sha256:f117dedda6d969c5c9483e23f573b38f4e39412845c7bc487b6f2648df30fe84", size = 5241043, upload-time = "2025-04-10T19:46:17.768Z" }, - { url = "https://files.pythonhosted.org/packages/67/e8/57fe0c86915671fd6a3d2d8746e40485fd55e8d9e682388fbb3a3d42b86f/debugpy-1.8.14-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:1b2ac8c13b2645e0b1eaf30e816404990fbdb168e193322be8f545e8c01644a9", size = 2175064, upload-time = "2025-04-10T19:46:19.486Z" }, - { url = 
"https://files.pythonhosted.org/packages/3b/97/2b2fd1b1c9569c6764ccdb650a6f752e4ac31be465049563c9eb127a8487/debugpy-1.8.14-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf431c343a99384ac7eab2f763980724834f933a271e90496944195318c619e2", size = 3132359, upload-time = "2025-04-10T19:46:21.192Z" }, - { url = "https://files.pythonhosted.org/packages/c0/ee/b825c87ed06256ee2a7ed8bab8fb3bb5851293bf9465409fdffc6261c426/debugpy-1.8.14-cp311-cp311-win32.whl", hash = "sha256:c99295c76161ad8d507b413cd33422d7c542889fbb73035889420ac1fad354f2", size = 5133269, upload-time = "2025-04-10T19:46:23.047Z" }, - { url = "https://files.pythonhosted.org/packages/d5/a6/6c70cd15afa43d37839d60f324213843174c1d1e6bb616bd89f7c1341bac/debugpy-1.8.14-cp311-cp311-win_amd64.whl", hash = "sha256:7816acea4a46d7e4e50ad8d09d963a680ecc814ae31cdef3622eb05ccacf7b01", size = 5158156, upload-time = "2025-04-10T19:46:24.521Z" }, { url = "https://files.pythonhosted.org/packages/d9/2a/ac2df0eda4898f29c46eb6713a5148e6f8b2b389c8ec9e425a4a1d67bf07/debugpy-1.8.14-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:8899c17920d089cfa23e6005ad9f22582fd86f144b23acb9feeda59e84405b84", size = 2501268, upload-time = "2025-04-10T19:46:26.044Z" }, { url = "https://files.pythonhosted.org/packages/10/53/0a0cb5d79dd9f7039169f8bf94a144ad3efa52cc519940b3b7dde23bcb89/debugpy-1.8.14-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6bb5c0dcf80ad5dbc7b7d6eac484e2af34bdacdf81df09b6a3e62792b722826", size = 4221077, upload-time = "2025-04-10T19:46:27.464Z" }, { url = "https://files.pythonhosted.org/packages/f8/d5/84e01821f362327bf4828728aa31e907a2eca7c78cd7c6ec062780d249f8/debugpy-1.8.14-cp312-cp312-win32.whl", hash = "sha256:281d44d248a0e1791ad0eafdbbd2912ff0de9eec48022a5bfbc332957487ed3f", size = 5255127, upload-time = "2025-04-10T19:46:29.467Z" }, @@ -523,15 +458,12 @@ wheels = [ ] [[package]] -name = "exceptiongroup" -version = "1.3.0" +name = "distlib" +version = "0.4.0" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } +sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, + { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] [[package]] @@ -543,28 +475,21 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702, upload-time = "2025-01-22T15:41:25.929Z" }, ] +[[package]] +name = "filelock" +version = "3.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075, upload-time = "2025-03-14T07:11:40.47Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215, upload-time = "2025-03-14T07:11:39.145Z" }, +] + [[package]] name = "fonttools" version = "4.58.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/3e/7a/30c581aeaa86d94e7a29344bccefd2408870bf5b0e7640b6f4ffede61bd0/fonttools-4.58.1.tar.gz", hash = "sha256:cbc8868e0a29c3e22628dfa1432adf7a104d86d1bc661cecc3e9173070b6ab2d", size = 3519505, upload-time = "2025-05-28T15:29:26.219Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/ed/94a7310e6ee87f6164d7cf273335445fb12b70625582df137b3692ec495b/fonttools-4.58.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4ebd423034ac4f74196c1ae29f8ed3b862f820345acbf35600af8596ebf62573", size = 2734333, upload-time = "2025-05-28T15:27:59.568Z" }, - { url = "https://files.pythonhosted.org/packages/09/d9/7f16d4aea0494dc02a284cb497ddd37a5b88d0d3da4ea41f7298ce96ca1a/fonttools-4.58.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9dc36f4b4044d95e6fb358da4c3e6a5c07c9b6f4c1e8c396e89bee3b65dae902", size = 2306563, upload-time = "2025-05-28T15:28:02.087Z" }, - { url = "https://files.pythonhosted.org/packages/cf/16/abdecf240d4fcc8badf6dbe3941500b64acd1401288bd9515e936ab2d27f/fonttools-4.58.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc4b74d7bb84189fe264d56a544ac5c818f8f1e8141856746768691fe185b229", size = 4717603, upload-time = "2025-05-28T15:28:03.849Z" }, - { url = "https://files.pythonhosted.org/packages/9c/3c/ad9bc6cfb4c4260689808b083c1d1a0c15b11d7c87bf7f6e61f77d4c106c/fonttools-4.58.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa4fa41e9cb43f78881a5896d6e41b6a0ec54e9d68e7eaaff6d7a1769b17017", size = 4750798, upload-time = "2025-05-28T15:28:05.956Z" }, - { url = "https://files.pythonhosted.org/packages/63/e7/d32080afcd754b78c7bedfa8475b6887792fca81a95ff7c634a59dc8eb4c/fonttools-4.58.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91335202f19c9edc04f2f6a7d9bb269b0a435d7de771e3f33c3ea9f87f19c8d4", size = 4800201, upload-time = "2025-05-28T15:28:07.731Z" }, - { url = "https://files.pythonhosted.org/packages/46/21/68f5285ba7c59c9df8fdc045b55a149c10af865b2615ea426daa47bcf287/fonttools-4.58.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e6b0ec2171e811a0d9e467225dc06b0fac39a84b4704f263c2d538c3c67b99b2", size = 4908504, upload-time = "2025-05-28T15:28:10.095Z" }, - { url = "https://files.pythonhosted.org/packages/66/77/abf1739cee99672b9bc3701bc3a51b01d325c4e117d7efd7e69315c28ce5/fonttools-4.58.1-cp310-cp310-win32.whl", hash = 
"sha256:a788983d522d02a9b457cc98aa60fc631dabae352fb3b30a56200890cd338ca0", size = 2190748, upload-time = "2025-05-28T15:28:12.232Z" }, - { url = "https://files.pythonhosted.org/packages/5e/18/e5a239f913f51e48a2d620be07a8f942fb8018850e0fbfeee2c11dd72723/fonttools-4.58.1-cp310-cp310-win_amd64.whl", hash = "sha256:c8c848a2d5961d277b85ac339480cecea90599059f72a42047ced25431e8b72a", size = 2235207, upload-time = "2025-05-28T15:28:14.687Z" }, - { url = "https://files.pythonhosted.org/packages/50/3f/9fecd69149b0eec5ca46ec58de83b2fd34d07204fe2c12c209255082507a/fonttools-4.58.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9966e14729669bcfbb56f83b747a2397c4d97c6d4798cb2e2adc28f9388fa008", size = 2754713, upload-time = "2025-05-28T15:28:18.998Z" }, - { url = "https://files.pythonhosted.org/packages/c8/19/d04ea5f3ab2afa7799f2b1ebe1d57ff71b479f99f29b82bddc7197d50220/fonttools-4.58.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64cc1647bbe83dea57f5496ec878ad19ccdba7185b0dd34955d3e6f03dc789e6", size = 2316637, upload-time = "2025-05-28T15:28:21.016Z" }, - { url = "https://files.pythonhosted.org/packages/5c/3f/375f59d756b17318336c050363849011e03ac82904538f39ebe8189835bc/fonttools-4.58.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:464f790ce681d08d1583df0735776aa9cb1999594bf336ddd0bf962c17b629ac", size = 4915730, upload-time = "2025-05-28T15:28:22.633Z" }, - { url = "https://files.pythonhosted.org/packages/2f/90/069f859d6f6480503574cda21b84ceee98bf5f5fd1764f26674e828a2600/fonttools-4.58.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c53c6a720ee70cc25746d511ba88c45c95ec510fd258026ed209b0b9e3ba92f", size = 4936194, upload-time = "2025-05-28T15:28:24.704Z" }, - { url = "https://files.pythonhosted.org/packages/01/11/339973e588e1c27f20c578f845bdcf84376c5e42bd35fca05419fd8d1648/fonttools-4.58.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6823a633bbce29cf3033508ebb54a433c473fb9833eff7f936bfdc5204fd98d", size = 4978982, upload-time = "2025-05-28T15:28:26.633Z" }, - { url = "https://files.pythonhosted.org/packages/a7/aa/1c627532a69715f54b8d96ab3a7bc8628f6e89989e9275dfc067dc2d6d56/fonttools-4.58.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5701fe66a1408c1974d2f78c00f964f8aad17cccbc32bc041e1b81421f31f448", size = 5090087, upload-time = "2025-05-28T15:28:29.608Z" }, - { url = "https://files.pythonhosted.org/packages/77/ce/cf7b624db35bce589ac1f2c98329ea91b28f0283d3b7e9e6126dfaeb5abd/fonttools-4.58.1-cp311-cp311-win32.whl", hash = "sha256:4cad2c74adf9ee31ae43be6b0b376fdb386d4d50c60979790e32c3548efec051", size = 2188923, upload-time = "2025-05-28T15:28:31.797Z" }, - { url = "https://files.pythonhosted.org/packages/b9/22/c4f1f76eeb1b9353e9cc81451d0ae08acc3d3aa31b9ab8f3791a18af1f89/fonttools-4.58.1-cp311-cp311-win_amd64.whl", hash = "sha256:7ade12485abccb0f6b6a6e2a88c50e587ff0e201e48e0153dd9b2e0ed67a2f38", size = 2236853, upload-time = "2025-05-28T15:28:33.381Z" }, { url = "https://files.pythonhosted.org/packages/32/97/ed1078b1e138fbc0b4ee75878000d549a70c02d83bb4e557e416efc34140/fonttools-4.58.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f56085a65769dc0100822c814069327541db9c3c4f21e599c6138f9dbda75e96", size = 2740473, upload-time = "2025-05-28T15:28:35.002Z" }, { url = "https://files.pythonhosted.org/packages/28/35/53d49fb7d6b30128153d11628b976fda3ce8ae44234b5a81c4edb3023798/fonttools-4.58.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:19c65a88e522c9f1be0c05d73541de20feada99d23d06e9b5354023cc3e517b0", size = 2309936, upload-time = "2025-05-28T15:28:37.145Z" }, { url = "https://files.pythonhosted.org/packages/0c/db/8b63c1d673b2bf0cfed77500d47769dc4aa85453b5f0ef525db2cf952895/fonttools-4.58.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b01bb37006e97703300bfde7a73d1c7038574dd1df9d8d92ca99af151becf2ca", size = 4814671, upload-time = "2025-05-28T15:28:39.339Z" }, @@ -584,6 +509,66 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/21/ff/995277586691c0cc314c28b24b4ec30610440fd7bf580072aed1409f95b0/fonttools-4.58.1-py3-none-any.whl", hash = "sha256:db88365d0962cd6f5bce54b190a4669aeed9c9941aa7bd60a5af084d8d9173d6", size = 1113429, upload-time = "2025-05-28T15:29:24.185Z" }, ] +[[package]] +name = "frozenlist" +version = "1.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/b1/b64018016eeb087db503b038296fd782586432b9c077fc5c7839e9cb6ef6/frozenlist-1.7.0.tar.gz", hash = "sha256:2e310d81923c2437ea8670467121cc3e9b0f76d3043cc1d2331d56c7fb7a3a8f", size = 45078, upload-time = "2025-06-09T23:02:35.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a2/c8131383f1e66adad5f6ecfcce383d584ca94055a34d683bbb24ac5f2f1c/frozenlist-1.7.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3dbf9952c4bb0e90e98aec1bd992b3318685005702656bc6f67c1a32b76787f2", size = 81424, upload-time = "2025-06-09T23:00:42.24Z" }, + { url = "https://files.pythonhosted.org/packages/4c/9d/02754159955088cb52567337d1113f945b9e444c4960771ea90eb73de8db/frozenlist-1.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1f5906d3359300b8a9bb194239491122e6cf1444c2efb88865426f170c262cdb", size = 47952, upload-time = "2025-06-09T23:00:43.481Z" }, + { url = "https://files.pythonhosted.org/packages/01/7a/0046ef1bd6699b40acd2067ed6d6670b4db2f425c56980fa21c982c2a9db/frozenlist-1.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3dabd5a8f84573c8d10d8859a50ea2dec01eea372031929871368c09fa103478", size = 46688, upload-time = "2025-06-09T23:00:44.793Z" }, + { url = "https://files.pythonhosted.org/packages/d6/a2/a910bafe29c86997363fb4c02069df4ff0b5bc39d33c5198b4e9dd42d8f8/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa57daa5917f1738064f302bf2626281a1cb01920c32f711fbc7bc36111058a8", size = 243084, upload-time = "2025-06-09T23:00:46.125Z" }, + { url = "https://files.pythonhosted.org/packages/64/3e/5036af9d5031374c64c387469bfcc3af537fc0f5b1187d83a1cf6fab1639/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c193dda2b6d49f4c4398962810fa7d7c78f032bf45572b3e04dd5249dff27e08", size = 233524, upload-time = "2025-06-09T23:00:47.73Z" }, + { url = "https://files.pythonhosted.org/packages/06/39/6a17b7c107a2887e781a48ecf20ad20f1c39d94b2a548c83615b5b879f28/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe2b675cf0aaa6d61bf8fbffd3c274b3c9b7b1623beb3809df8a81399a4a9c4", size = 248493, upload-time = "2025-06-09T23:00:49.742Z" }, + { url = "https://files.pythonhosted.org/packages/be/00/711d1337c7327d88c44d91dd0f556a1c47fb99afc060ae0ef66b4d24793d/frozenlist-1.7.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8fc5d5cda37f62b262405cf9652cf0856839c4be8ee41be0afe8858f17f4c94b", size = 244116, upload-time = "2025-06-09T23:00:51.352Z" }, + { url = 
"https://files.pythonhosted.org/packages/24/fe/74e6ec0639c115df13d5850e75722750adabdc7de24e37e05a40527ca539/frozenlist-1.7.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0d5ce521d1dd7d620198829b87ea002956e4319002ef0bc8d3e6d045cb4646e", size = 224557, upload-time = "2025-06-09T23:00:52.855Z" }, + { url = "https://files.pythonhosted.org/packages/8d/db/48421f62a6f77c553575201e89048e97198046b793f4a089c79a6e3268bd/frozenlist-1.7.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:488d0a7d6a0008ca0db273c542098a0fa9e7dfaa7e57f70acef43f32b3f69dca", size = 241820, upload-time = "2025-06-09T23:00:54.43Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fa/cb4a76bea23047c8462976ea7b7a2bf53997a0ca171302deae9d6dd12096/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:15a7eaba63983d22c54d255b854e8108e7e5f3e89f647fc854bd77a237e767df", size = 236542, upload-time = "2025-06-09T23:00:56.409Z" }, + { url = "https://files.pythonhosted.org/packages/5d/32/476a4b5cfaa0ec94d3f808f193301debff2ea42288a099afe60757ef6282/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:1eaa7e9c6d15df825bf255649e05bd8a74b04a4d2baa1ae46d9c2d00b2ca2cb5", size = 249350, upload-time = "2025-06-09T23:00:58.468Z" }, + { url = "https://files.pythonhosted.org/packages/8d/ba/9a28042f84a6bf8ea5dbc81cfff8eaef18d78b2a1ad9d51c7bc5b029ad16/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4389e06714cfa9d47ab87f784a7c5be91d3934cd6e9a7b85beef808297cc025", size = 225093, upload-time = "2025-06-09T23:01:00.015Z" }, + { url = "https://files.pythonhosted.org/packages/bc/29/3a32959e68f9cf000b04e79ba574527c17e8842e38c91d68214a37455786/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:73bd45e1488c40b63fe5a7df892baf9e2a4d4bb6409a2b3b78ac1c6236178e01", size = 245482, upload-time = "2025-06-09T23:01:01.474Z" }, + { url = "https://files.pythonhosted.org/packages/80/e8/edf2f9e00da553f07f5fa165325cfc302dead715cab6ac8336a5f3d0adc2/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99886d98e1643269760e5fe0df31e5ae7050788dd288947f7f007209b8c33f08", size = 249590, upload-time = "2025-06-09T23:01:02.961Z" }, + { url = "https://files.pythonhosted.org/packages/1c/80/9a0eb48b944050f94cc51ee1c413eb14a39543cc4f760ed12657a5a3c45a/frozenlist-1.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:290a172aae5a4c278c6da8a96222e6337744cd9c77313efe33d5670b9f65fc43", size = 237785, upload-time = "2025-06-09T23:01:05.095Z" }, + { url = "https://files.pythonhosted.org/packages/f3/74/87601e0fb0369b7a2baf404ea921769c53b7ae00dee7dcfe5162c8c6dbf0/frozenlist-1.7.0-cp312-cp312-win32.whl", hash = "sha256:426c7bc70e07cfebc178bc4c2bf2d861d720c4fff172181eeb4a4c41d4ca2ad3", size = 39487, upload-time = "2025-06-09T23:01:06.54Z" }, + { url = "https://files.pythonhosted.org/packages/0b/15/c026e9a9fc17585a9d461f65d8593d281fedf55fbf7eb53f16c6df2392f9/frozenlist-1.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:563b72efe5da92e02eb68c59cb37205457c977aa7a449ed1b37e6939e5c47c6a", size = 43874, upload-time = "2025-06-09T23:01:07.752Z" }, + { url = "https://files.pythonhosted.org/packages/24/90/6b2cebdabdbd50367273c20ff6b57a3dfa89bd0762de02c3a1eb42cb6462/frozenlist-1.7.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee80eeda5e2a4e660651370ebffd1286542b67e268aa1ac8d6dbe973120ef7ee", size = 79791, upload-time = "2025-06-09T23:01:09.368Z" }, + { url = 
"https://files.pythonhosted.org/packages/83/2e/5b70b6a3325363293fe5fc3ae74cdcbc3e996c2a11dde2fd9f1fb0776d19/frozenlist-1.7.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d1a81c85417b914139e3a9b995d4a1c84559afc839a93cf2cb7f15e6e5f6ed2d", size = 47165, upload-time = "2025-06-09T23:01:10.653Z" }, + { url = "https://files.pythonhosted.org/packages/f4/25/a0895c99270ca6966110f4ad98e87e5662eab416a17e7fd53c364bf8b954/frozenlist-1.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cbb65198a9132ebc334f237d7b0df163e4de83fb4f2bdfe46c1e654bdb0c5d43", size = 45881, upload-time = "2025-06-09T23:01:12.296Z" }, + { url = "https://files.pythonhosted.org/packages/19/7c/71bb0bbe0832793c601fff68cd0cf6143753d0c667f9aec93d3c323f4b55/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dab46c723eeb2c255a64f9dc05b8dd601fde66d6b19cdb82b2e09cc6ff8d8b5d", size = 232409, upload-time = "2025-06-09T23:01:13.641Z" }, + { url = "https://files.pythonhosted.org/packages/c0/45/ed2798718910fe6eb3ba574082aaceff4528e6323f9a8570be0f7028d8e9/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6aeac207a759d0dedd2e40745575ae32ab30926ff4fa49b1635def65806fddee", size = 225132, upload-time = "2025-06-09T23:01:15.264Z" }, + { url = "https://files.pythonhosted.org/packages/ba/e2/8417ae0f8eacb1d071d4950f32f229aa6bf68ab69aab797b72a07ea68d4f/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bd8c4e58ad14b4fa7802b8be49d47993182fdd4023393899632c88fd8cd994eb", size = 237638, upload-time = "2025-06-09T23:01:16.752Z" }, + { url = "https://files.pythonhosted.org/packages/f8/b7/2ace5450ce85f2af05a871b8c8719b341294775a0a6c5585d5e6170f2ce7/frozenlist-1.7.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04fb24d104f425da3540ed83cbfc31388a586a7696142004c577fa61c6298c3f", size = 233539, upload-time = "2025-06-09T23:01:18.202Z" }, + { url = "https://files.pythonhosted.org/packages/46/b9/6989292c5539553dba63f3c83dc4598186ab2888f67c0dc1d917e6887db6/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a5c505156368e4ea6b53b5ac23c92d7edc864537ff911d2fb24c140bb175e60", size = 215646, upload-time = "2025-06-09T23:01:19.649Z" }, + { url = "https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bd7eb96a675f18aa5c553eb7ddc24a43c8c18f22e1f9925528128c052cdbe00", size = 232233, upload-time = "2025-06-09T23:01:21.175Z" }, + { url = "https://files.pythonhosted.org/packages/59/52/460db4d7ba0811b9ccb85af996019f5d70831f2f5f255f7cc61f86199795/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:05579bf020096fe05a764f1f84cd104a12f78eaab68842d036772dc6d4870b4b", size = 227996, upload-time = "2025-06-09T23:01:23.098Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c9/f4b39e904c03927b7ecf891804fd3b4df3db29b9e487c6418e37988d6e9d/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:376b6222d114e97eeec13d46c486facd41d4f43bab626b7c3f6a8b4e81a5192c", size = 242280, upload-time = "2025-06-09T23:01:24.808Z" }, + { url = "https://files.pythonhosted.org/packages/b8/33/3f8d6ced42f162d743e3517781566b8481322be321b486d9d262adf70bfb/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_i686.whl", hash = 
"sha256:0aa7e176ebe115379b5b1c95b4096fb1c17cce0847402e227e712c27bdb5a949", size = 217717, upload-time = "2025-06-09T23:01:26.28Z" }, + { url = "https://files.pythonhosted.org/packages/3e/e8/ad683e75da6ccef50d0ab0c2b2324b32f84fc88ceee778ed79b8e2d2fe2e/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3fbba20e662b9c2130dc771e332a99eff5da078b2b2648153a40669a6d0e36ca", size = 236644, upload-time = "2025-06-09T23:01:27.887Z" }, + { url = "https://files.pythonhosted.org/packages/b2/14/8d19ccdd3799310722195a72ac94ddc677541fb4bef4091d8e7775752360/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f3f4410a0a601d349dd406b5713fec59b4cee7e71678d5b17edda7f4655a940b", size = 238879, upload-time = "2025-06-09T23:01:29.524Z" }, + { url = "https://files.pythonhosted.org/packages/ce/13/c12bf657494c2fd1079a48b2db49fa4196325909249a52d8f09bc9123fd7/frozenlist-1.7.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e2cdfaaec6a2f9327bf43c933c0319a7c429058e8537c508964a133dffee412e", size = 232502, upload-time = "2025-06-09T23:01:31.287Z" }, + { url = "https://files.pythonhosted.org/packages/d7/8b/e7f9dfde869825489382bc0d512c15e96d3964180c9499efcec72e85db7e/frozenlist-1.7.0-cp313-cp313-win32.whl", hash = "sha256:5fc4df05a6591c7768459caba1b342d9ec23fa16195e744939ba5914596ae3e1", size = 39169, upload-time = "2025-06-09T23:01:35.503Z" }, + { url = "https://files.pythonhosted.org/packages/35/89/a487a98d94205d85745080a37860ff5744b9820a2c9acbcdd9440bfddf98/frozenlist-1.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:52109052b9791a3e6b5d1b65f4b909703984b770694d3eb64fad124c835d7cba", size = 43219, upload-time = "2025-06-09T23:01:36.784Z" }, + { url = "https://files.pythonhosted.org/packages/56/d5/5c4cf2319a49eddd9dd7145e66c4866bdc6f3dbc67ca3d59685149c11e0d/frozenlist-1.7.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a6f86e4193bb0e235ef6ce3dde5cbabed887e0b11f516ce8a0f4d3b33078ec2d", size = 84345, upload-time = "2025-06-09T23:01:38.295Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/ec2c1e1dc16b85bc9d526009961953df9cec8481b6886debb36ec9107799/frozenlist-1.7.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:82d664628865abeb32d90ae497fb93df398a69bb3434463d172b80fc25b0dd7d", size = 48880, upload-time = "2025-06-09T23:01:39.887Z" }, + { url = "https://files.pythonhosted.org/packages/69/86/f9596807b03de126e11e7d42ac91e3d0b19a6599c714a1989a4e85eeefc4/frozenlist-1.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:912a7e8375a1c9a68325a902f3953191b7b292aa3c3fb0d71a216221deca460b", size = 48498, upload-time = "2025-06-09T23:01:41.318Z" }, + { url = "https://files.pythonhosted.org/packages/5e/cb/df6de220f5036001005f2d726b789b2c0b65f2363b104bbc16f5be8084f8/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9537c2777167488d539bc5de2ad262efc44388230e5118868e172dd4a552b146", size = 292296, upload-time = "2025-06-09T23:01:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/83/1f/de84c642f17c8f851a2905cee2dae401e5e0daca9b5ef121e120e19aa825/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f34560fb1b4c3e30ba35fa9a13894ba39e5acfc5f60f57d8accde65f46cc5e74", size = 273103, upload-time = "2025-06-09T23:01:44.166Z" }, + { url = "https://files.pythonhosted.org/packages/88/3c/c840bfa474ba3fa13c772b93070893c6e9d5c0350885760376cbe3b6c1b3/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:acd03d224b0175f5a850edc104ac19040d35419eddad04e7cf2d5986d98427f1", size = 292869, upload-time = "2025-06-09T23:01:45.681Z" }, + { url = "https://files.pythonhosted.org/packages/a6/1c/3efa6e7d5a39a1d5ef0abeb51c48fb657765794a46cf124e5aca2c7a592c/frozenlist-1.7.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2038310bc582f3d6a09b3816ab01737d60bf7b1ec70f5356b09e84fb7408ab1", size = 291467, upload-time = "2025-06-09T23:01:47.234Z" }, + { url = "https://files.pythonhosted.org/packages/4f/00/d5c5e09d4922c395e2f2f6b79b9a20dab4b67daaf78ab92e7729341f61f6/frozenlist-1.7.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8c05e4c8e5f36e5e088caa1bf78a687528f83c043706640a92cb76cd6999384", size = 266028, upload-time = "2025-06-09T23:01:48.819Z" }, + { url = "https://files.pythonhosted.org/packages/4e/27/72765be905619dfde25a7f33813ac0341eb6b076abede17a2e3fbfade0cb/frozenlist-1.7.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:765bb588c86e47d0b68f23c1bee323d4b703218037765dcf3f25c838c6fecceb", size = 284294, upload-time = "2025-06-09T23:01:50.394Z" }, + { url = "https://files.pythonhosted.org/packages/88/67/c94103a23001b17808eb7dd1200c156bb69fb68e63fcf0693dde4cd6228c/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:32dc2e08c67d86d0969714dd484fd60ff08ff81d1a1e40a77dd34a387e6ebc0c", size = 281898, upload-time = "2025-06-09T23:01:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/42/34/a3e2c00c00f9e2a9db5653bca3fec306349e71aff14ae45ecc6d0951dd24/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:c0303e597eb5a5321b4de9c68e9845ac8f290d2ab3f3e2c864437d3c5a30cd65", size = 290465, upload-time = "2025-06-09T23:01:53.788Z" }, + { url = "https://files.pythonhosted.org/packages/bb/73/f89b7fbce8b0b0c095d82b008afd0590f71ccb3dee6eee41791cf8cd25fd/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:a47f2abb4e29b3a8d0b530f7c3598badc6b134562b1a5caee867f7c62fee51e3", size = 266385, upload-time = "2025-06-09T23:01:55.769Z" }, + { url = "https://files.pythonhosted.org/packages/cd/45/e365fdb554159462ca12df54bc59bfa7a9a273ecc21e99e72e597564d1ae/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:3d688126c242a6fabbd92e02633414d40f50bb6002fa4cf995a1d18051525657", size = 288771, upload-time = "2025-06-09T23:01:57.4Z" }, + { url = "https://files.pythonhosted.org/packages/00/11/47b6117002a0e904f004d70ec5194fe9144f117c33c851e3d51c765962d0/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:4e7e9652b3d367c7bd449a727dc79d5043f48b88d0cbfd4f9f1060cf2b414104", size = 288206, upload-time = "2025-06-09T23:01:58.936Z" }, + { url = "https://files.pythonhosted.org/packages/40/37/5f9f3c3fd7f7746082ec67bcdc204db72dad081f4f83a503d33220a92973/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a85e345b4c43db8b842cab1feb41be5cc0b10a1830e6295b69d7310f99becaf", size = 282620, upload-time = "2025-06-09T23:02:00.493Z" }, + { url = "https://files.pythonhosted.org/packages/0b/31/8fbc5af2d183bff20f21aa743b4088eac4445d2bb1cdece449ae80e4e2d1/frozenlist-1.7.0-cp313-cp313t-win32.whl", hash = "sha256:3a14027124ddb70dfcee5148979998066897e79f89f64b13328595c4bdf77c81", size = 43059, upload-time = "2025-06-09T23:02:02.072Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/ed/41956f52105b8dbc26e457c5705340c67c8cc2b79f394b79bffc09d0e938/frozenlist-1.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3bf8010d71d4507775f658e9823210b7427be36625b387221642725b515dcf3e", size = 47516, upload-time = "2025-06-09T23:02:03.779Z" }, + { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, +] + [[package]] name = "fsspec" version = "2025.5.1" @@ -593,6 +578,76 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bb/61/78c7b3851add1481b048b5fdc29067397a1784e2910592bc81bb3f608635/fsspec-2025.5.1-py3-none-any.whl", hash = "sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462", size = 199052, upload-time = "2025-05-24T12:03:21.66Z" }, ] +[[package]] +name = "google-api-core" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/21/e9d043e88222317afdbdb567165fdbc3b0aad90064c7e0c9eb0ad9955ad8/google_api_core-2.25.1.tar.gz", hash = "sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8", size = 165443, upload-time = "2025-06-12T20:52:20.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl", hash = "sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7", size = 160807, upload-time = "2025-06-12T20:52:19.334Z" }, +] + +[[package]] +name = "google-auth" +version = "2.40.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "pyasn1-modules" }, + { name = "rsa" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/9b/e92ef23b84fa10a64ce4831390b7a4c2e53c0132568d99d4ae61d04c8855/google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77", size = 281029, upload-time = "2025-06-04T18:04:57.577Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" }, +] + +[[package]] +name = "googleapis-common-protos" +version = "1.70.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/39/24/33db22342cf4a2ea27c9955e6713140fedd51e8b141b5ce5260897020f1a/googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257", size = 145903, upload-time = "2025-04-14T10:17:02.924Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" }, +] + +[[package]] +name = "grpcio" +version = "1.74.0" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/38/b4/35feb8f7cab7239c5b94bd2db71abb3d6adb5f335ad8f131abb6060840b6/grpcio-1.74.0.tar.gz", hash = "sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1", size = 12756048, upload-time = "2025-07-24T18:54:23.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4c/5d/e504d5d5c4469823504f65687d6c8fb97b7f7bf0b34873b7598f1df24630/grpcio-1.74.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8", size = 5445551, upload-time = "2025-07-24T18:53:23.641Z" }, + { url = "https://files.pythonhosted.org/packages/43/01/730e37056f96f2f6ce9f17999af1556df62ee8dab7fa48bceeaab5fd3008/grpcio-1.74.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6", size = 10979810, upload-time = "2025-07-24T18:53:25.349Z" }, + { url = "https://files.pythonhosted.org/packages/79/3d/09fd100473ea5c47083889ca47ffd356576173ec134312f6aa0e13111dee/grpcio-1.74.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5", size = 5941946, upload-time = "2025-07-24T18:53:27.387Z" }, + { url = "https://files.pythonhosted.org/packages/8a/99/12d2cca0a63c874c6d3d195629dcd85cdf5d6f98a30d8db44271f8a97b93/grpcio-1.74.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3601274bc0523f6dc07666c0e01682c94472402ac2fd1226fd96e079863bfa49", size = 6621763, upload-time = "2025-07-24T18:53:29.193Z" }, + { url = "https://files.pythonhosted.org/packages/9d/2c/930b0e7a2f1029bbc193443c7bc4dc2a46fedb0203c8793dcd97081f1520/grpcio-1.74.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:176d60a5168d7948539def20b2a3adcce67d72454d9ae05969a2e73f3a0feee7", size = 6180664, upload-time = "2025-07-24T18:53:30.823Z" }, + { url = "https://files.pythonhosted.org/packages/db/d5/ff8a2442180ad0867717e670f5ec42bfd8d38b92158ad6bcd864e6d4b1ed/grpcio-1.74.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e759f9e8bc908aaae0412642afe5416c9f983a80499448fcc7fab8692ae044c3", size = 6301083, upload-time = "2025-07-24T18:53:32.454Z" }, + { url = "https://files.pythonhosted.org/packages/b0/ba/b361d390451a37ca118e4ec7dccec690422e05bc85fba2ec72b06cefec9f/grpcio-1.74.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:9e7c4389771855a92934b2846bd807fc25a3dfa820fd912fe6bd8136026b2707", size = 6994132, upload-time = "2025-07-24T18:53:34.506Z" }, + { url = "https://files.pythonhosted.org/packages/3b/0c/3a5fa47d2437a44ced74141795ac0251bbddeae74bf81df3447edd767d27/grpcio-1.74.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:cce634b10aeab37010449124814b05a62fb5f18928ca878f1bf4750d1f0c815b", size = 6489616, upload-time = "2025-07-24T18:53:36.217Z" }, + { url = "https://files.pythonhosted.org/packages/ae/95/ab64703b436d99dc5217228babc76047d60e9ad14df129e307b5fec81fd0/grpcio-1.74.0-cp312-cp312-win32.whl", hash = "sha256:885912559974df35d92219e2dc98f51a16a48395f37b92865ad45186f294096c", size = 3807083, upload-time = "2025-07-24T18:53:37.911Z" }, + { url = "https://files.pythonhosted.org/packages/84/59/900aa2445891fc47a33f7d2f76e00ca5d6ae6584b20d19af9c06fa09bf9a/grpcio-1.74.0-cp312-cp312-win_amd64.whl", hash = "sha256:42f8fee287427b94be63d916c90399ed310ed10aadbf9e2e5538b3e497d269bc", size = 4490123, upload-time = "2025-07-24T18:53:39.528Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/d8/1004a5f468715221450e66b051c839c2ce9a985aa3ee427422061fcbb6aa/grpcio-1.74.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:2bc2d7d8d184e2362b53905cb1708c84cb16354771c04b490485fa07ce3a1d89", size = 5449488, upload-time = "2025-07-24T18:53:41.174Z" }, + { url = "https://files.pythonhosted.org/packages/94/0e/33731a03f63740d7743dced423846c831d8e6da808fcd02821a4416df7fa/grpcio-1.74.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:c14e803037e572c177ba54a3e090d6eb12efd795d49327c5ee2b3bddb836bf01", size = 10974059, upload-time = "2025-07-24T18:53:43.066Z" }, + { url = "https://files.pythonhosted.org/packages/0d/c6/3d2c14d87771a421205bdca991467cfe473ee4c6a1231c1ede5248c62ab8/grpcio-1.74.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:f6ec94f0e50eb8fa1744a731088b966427575e40c2944a980049798b127a687e", size = 5945647, upload-time = "2025-07-24T18:53:45.269Z" }, + { url = "https://files.pythonhosted.org/packages/c5/83/5a354c8aaff58594eef7fffebae41a0f8995a6258bbc6809b800c33d4c13/grpcio-1.74.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:566b9395b90cc3d0d0c6404bc8572c7c18786ede549cdb540ae27b58afe0fb91", size = 6626101, upload-time = "2025-07-24T18:53:47.015Z" }, + { url = "https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249", size = 6182562, upload-time = "2025-07-24T18:53:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/fd/48/2869e5b2c1922583686f7ae674937986807c2f676d08be70d0a541316270/grpcio-1.74.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:64229c1e9cea079420527fa8ac45d80fc1e8d3f94deaa35643c381fa8d98f362", size = 6303425, upload-time = "2025-07-24T18:53:50.847Z" }, + { url = "https://files.pythonhosted.org/packages/a6/0e/bac93147b9a164f759497bc6913e74af1cb632c733c7af62c0336782bd38/grpcio-1.74.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:0f87bddd6e27fc776aacf7ebfec367b6d49cad0455123951e4488ea99d9b9b8f", size = 6996533, upload-time = "2025-07-24T18:53:52.747Z" }, + { url = "https://files.pythonhosted.org/packages/84/35/9f6b2503c1fd86d068b46818bbd7329db26a87cdd8c01e0d1a9abea1104c/grpcio-1.74.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:3b03d8f2a07f0fea8c8f74deb59f8352b770e3900d143b3d1475effcb08eec20", size = 6491489, upload-time = "2025-07-24T18:53:55.06Z" }, + { url = "https://files.pythonhosted.org/packages/75/33/a04e99be2a82c4cbc4039eb3a76f6c3632932b9d5d295221389d10ac9ca7/grpcio-1.74.0-cp313-cp313-win32.whl", hash = "sha256:b6a73b2ba83e663b2480a90b82fdae6a7aa6427f62bf43b29912c0cfd1aa2bfa", size = 3805811, upload-time = "2025-07-24T18:53:56.798Z" }, + { url = "https://files.pythonhosted.org/packages/34/80/de3eb55eb581815342d097214bed4c59e806b05f1b3110df03b2280d6dfd/grpcio-1.74.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24", size = 4489214, upload-time = "2025-07-24T18:53:59.771Z" }, +] + [[package]] name = "httpie" version = "3.2.4" @@ -623,6 +678,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "importlib-metadata" +version = 
"8.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" }, +] + [[package]] name = "iniconfig" version = "2.1.0" @@ -640,8 +707,7 @@ dependencies = [ { name = "appnope", marker = "sys_platform == 'darwin'" }, { name = "comm" }, { name = "debugpy" }, - { name = "ipython", version = "8.36.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "ipython", version = "9.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "ipython" }, { name = "jupyter-client" }, { name = "jupyter-core" }, { name = "matplotlib-inline" }, @@ -657,51 +723,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/5c/368ae6c01c7628438358e6d337c19b05425727fbb221d2a3c4303c372f42/ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5", size = 117173, upload-time = "2024-07-01T14:07:19.603Z" }, ] -[[package]] -name = "ipython" -version = "8.36.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.11'", -] -dependencies = [ - { name = "colorama", marker = "python_full_version < '3.11' and sys_platform == 'win32'" }, - { name = "decorator", marker = "python_full_version < '3.11'" }, - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "jedi", marker = "python_full_version < '3.11'" }, - { name = "matplotlib-inline", marker = "python_full_version < '3.11'" }, - { name = "pexpect", marker = "python_full_version < '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32'" }, - { name = "prompt-toolkit", marker = "python_full_version < '3.11'" }, - { name = "pygments", marker = "python_full_version < '3.11'" }, - { name = "stack-data", marker = "python_full_version < '3.11'" }, - { name = "traitlets", marker = "python_full_version < '3.11'" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a2/9f/d9a73710df947b7804bd9d93509463fb3a89e0ddc99c9fcc67279cddbeb6/ipython-8.36.0.tar.gz", hash = "sha256:24658e9fe5c5c819455043235ba59cfffded4a35936eefceceab6b192f7092ff", size = 5604997, upload-time = "2025-04-25T18:03:38.031Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d6/d7/c1c9f371790b3a181e343c4815a361e5a0cc7d90ef6642d64ba5d05de289/ipython-8.36.0-py3-none-any.whl", hash = "sha256:12b913914d010dcffa2711505ec8be4bf0180742d97f1e5175e51f22086428c1", size = 831074, upload-time = "2025-04-25T18:03:34.951Z" }, -] - [[package]] name = "ipython" version = "9.2.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", -] dependencies = [ - { name = "colorama", marker = "python_full_version >= '3.11' and sys_platform == 'win32'" }, - 
{ name = "decorator", marker = "python_full_version >= '3.11'" }, - { name = "ipython-pygments-lexers", marker = "python_full_version >= '3.11'" }, - { name = "jedi", marker = "python_full_version >= '3.11'" }, - { name = "matplotlib-inline", marker = "python_full_version >= '3.11'" }, - { name = "pexpect", marker = "python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32'" }, - { name = "prompt-toolkit", marker = "python_full_version >= '3.11'" }, - { name = "pygments", marker = "python_full_version >= '3.11'" }, - { name = "stack-data", marker = "python_full_version >= '3.11'" }, - { name = "traitlets", marker = "python_full_version >= '3.11'" }, - { name = "typing-extensions", marker = "python_full_version == '3.11.*'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "decorator" }, + { name = "ipython-pygments-lexers" }, + { name = "jedi" }, + { name = "matplotlib-inline" }, + { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, + { name = "prompt-toolkit" }, + { name = "pygments" }, + { name = "stack-data" }, + { name = "traitlets" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9d/02/63a84444a7409b3c0acd1de9ffe524660e0e5d82ee473e78b45e5bfb64a4/ipython-9.2.0.tar.gz", hash = "sha256:62a9373dbc12f28f9feaf4700d052195bf89806279fc8ca11f3f54017d04751b", size = 4424394, upload-time = "2025-04-25T17:55:40.498Z" } wheels = [ @@ -713,13 +749,29 @@ name = "ipython-pygments-lexers" version = "1.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pygments", marker = "python_full_version >= '3.11'" }, + { name = "pygments" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, ] +[[package]] +name = "ipywidgets" +version = "8.1.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "comm" }, + { name = "ipython" }, + { name = "jupyterlab-widgets" }, + { name = "traitlets" }, + { name = "widgetsnbextension" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3e/48/d3dbac45c2814cb73812f98dd6b38bbcc957a4e7bb31d6ea9c03bf94ed87/ipywidgets-8.1.7.tar.gz", hash = "sha256:15f1ac050b9ccbefd45dccfbb2ef6bed0029d8278682d569d71b8dd96bee0376", size = 116721, upload-time = "2025-05-05T12:42:03.489Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl", hash = "sha256:764f2602d25471c213919b8a1997df04bef869251db4ca8efba1b76b1bd9f7bb", size = 139806, upload-time = "2025-05-05T12:41:56.833Z" }, +] + [[package]] name = "jedi" version = "0.19.2" @@ -789,42 +841,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" }, ] 
+[[package]] +name = "jupyterlab-widgets" +version = "3.0.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/7d/160595ca88ee87ac6ba95d82177d29ec60aaa63821d3077babb22ce031a5/jupyterlab_widgets-3.0.15.tar.gz", hash = "sha256:2920888a0c2922351a9202817957a68c07d99673504d6cd37345299e971bb08b", size = 213149, upload-time = "2025-05-05T12:32:31.004Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl", hash = "sha256:d59023d7d7ef71400d51e6fee9a88867f6e65e10a4201605d2d7f3e8f012a31c", size = 216571, upload-time = "2025-05-05T12:32:29.534Z" }, +] + [[package]] name = "kiwisolver" version = "1.4.8" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/82/59/7c91426a8ac292e1cdd53a63b6d9439abd573c875c3f92c146767dd33faf/kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e", size = 97538, upload-time = "2024-12-24T18:30:51.519Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/47/5f/4d8e9e852d98ecd26cdf8eaf7ed8bc33174033bba5e07001b289f07308fd/kiwisolver-1.4.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88c6f252f6816a73b1f8c904f7bbe02fd67c09a69f7cb8a0eecdbf5ce78e63db", size = 124623, upload-time = "2024-12-24T18:28:17.687Z" }, - { url = "https://files.pythonhosted.org/packages/1d/70/7f5af2a18a76fe92ea14675f8bd88ce53ee79e37900fa5f1a1d8e0b42998/kiwisolver-1.4.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c72941acb7b67138f35b879bbe85be0f6c6a70cab78fe3ef6db9c024d9223e5b", size = 66720, upload-time = "2024-12-24T18:28:19.158Z" }, - { url = "https://files.pythonhosted.org/packages/c6/13/e15f804a142353aefd089fadc8f1d985561a15358c97aca27b0979cb0785/kiwisolver-1.4.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce2cf1e5688edcb727fdf7cd1bbd0b6416758996826a8be1d958f91880d0809d", size = 65413, upload-time = "2024-12-24T18:28:20.064Z" }, - { url = "https://files.pythonhosted.org/packages/ce/6d/67d36c4d2054e83fb875c6b59d0809d5c530de8148846b1370475eeeece9/kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c8bf637892dc6e6aad2bc6d4d69d08764166e5e3f69d469e55427b6ac001b19d", size = 1650826, upload-time = "2024-12-24T18:28:21.203Z" }, - { url = "https://files.pythonhosted.org/packages/de/c6/7b9bb8044e150d4d1558423a1568e4f227193662a02231064e3824f37e0a/kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:034d2c891f76bd3edbdb3ea11140d8510dca675443da7304205a2eaa45d8334c", size = 1628231, upload-time = "2024-12-24T18:28:23.851Z" }, - { url = "https://files.pythonhosted.org/packages/b6/38/ad10d437563063eaaedbe2c3540a71101fc7fb07a7e71f855e93ea4de605/kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d47b28d1dfe0793d5e96bce90835e17edf9a499b53969b03c6c47ea5985844c3", size = 1408938, upload-time = "2024-12-24T18:28:26.687Z" }, - { url = "https://files.pythonhosted.org/packages/52/ce/c0106b3bd7f9e665c5f5bc1e07cc95b5dabd4e08e3dad42dbe2faad467e7/kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb158fe28ca0c29f2260cca8c43005329ad58452c36f0edf298204de32a9a3ed", size = 1422799, upload-time = "2024-12-24T18:28:30.538Z" }, - { url = 
"https://files.pythonhosted.org/packages/d0/87/efb704b1d75dc9758087ba374c0f23d3254505edaedd09cf9d247f7878b9/kiwisolver-1.4.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5536185fce131780ebd809f8e623bf4030ce1b161353166c49a3c74c287897f", size = 1354362, upload-time = "2024-12-24T18:28:32.943Z" }, - { url = "https://files.pythonhosted.org/packages/eb/b3/fd760dc214ec9a8f208b99e42e8f0130ff4b384eca8b29dd0efc62052176/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:369b75d40abedc1da2c1f4de13f3482cb99e3237b38726710f4a793432b1c5ff", size = 2222695, upload-time = "2024-12-24T18:28:35.641Z" }, - { url = "https://files.pythonhosted.org/packages/a2/09/a27fb36cca3fc01700687cc45dae7a6a5f8eeb5f657b9f710f788748e10d/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:641f2ddf9358c80faa22e22eb4c9f54bd3f0e442e038728f500e3b978d00aa7d", size = 2370802, upload-time = "2024-12-24T18:28:38.357Z" }, - { url = "https://files.pythonhosted.org/packages/3d/c3/ba0a0346db35fe4dc1f2f2cf8b99362fbb922d7562e5f911f7ce7a7b60fa/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:d561d2d8883e0819445cfe58d7ddd673e4015c3c57261d7bdcd3710d0d14005c", size = 2334646, upload-time = "2024-12-24T18:28:40.941Z" }, - { url = "https://files.pythonhosted.org/packages/41/52/942cf69e562f5ed253ac67d5c92a693745f0bed3c81f49fc0cbebe4d6b00/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1732e065704b47c9afca7ffa272f845300a4eb959276bf6970dc07265e73b605", size = 2467260, upload-time = "2024-12-24T18:28:42.273Z" }, - { url = "https://files.pythonhosted.org/packages/32/26/2d9668f30d8a494b0411d4d7d4ea1345ba12deb6a75274d58dd6ea01e951/kiwisolver-1.4.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bcb1ebc3547619c3b58a39e2448af089ea2ef44b37988caf432447374941574e", size = 2288633, upload-time = "2024-12-24T18:28:44.87Z" }, - { url = "https://files.pythonhosted.org/packages/98/99/0dd05071654aa44fe5d5e350729961e7bb535372935a45ac89a8924316e6/kiwisolver-1.4.8-cp310-cp310-win_amd64.whl", hash = "sha256:89c107041f7b27844179ea9c85d6da275aa55ecf28413e87624d033cf1f6b751", size = 71885, upload-time = "2024-12-24T18:28:47.346Z" }, - { url = "https://files.pythonhosted.org/packages/6c/fc/822e532262a97442989335394d441cd1d0448c2e46d26d3e04efca84df22/kiwisolver-1.4.8-cp310-cp310-win_arm64.whl", hash = "sha256:b5773efa2be9eb9fcf5415ea3ab70fc785d598729fd6057bea38d539ead28271", size = 65175, upload-time = "2024-12-24T18:28:49.651Z" }, - { url = "https://files.pythonhosted.org/packages/da/ed/c913ee28936c371418cb167b128066ffb20bbf37771eecc2c97edf8a6e4c/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a4d3601908c560bdf880f07d94f31d734afd1bb71e96585cace0e38ef44c6d84", size = 124635, upload-time = "2024-12-24T18:28:51.826Z" }, - { url = "https://files.pythonhosted.org/packages/4c/45/4a7f896f7467aaf5f56ef093d1f329346f3b594e77c6a3c327b2d415f521/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:856b269c4d28a5c0d5e6c1955ec36ebfd1651ac00e1ce0afa3e28da95293b561", size = 66717, upload-time = "2024-12-24T18:28:54.256Z" }, - { url = "https://files.pythonhosted.org/packages/5f/b4/c12b3ac0852a3a68f94598d4c8d569f55361beef6159dce4e7b624160da2/kiwisolver-1.4.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2b9a96e0f326205af81a15718a9073328df1173a2619a68553decb7097fd5d7", size = 65413, upload-time = "2024-12-24T18:28:55.184Z" }, - { url = 
"https://files.pythonhosted.org/packages/a9/98/1df4089b1ed23d83d410adfdc5947245c753bddfbe06541c4aae330e9e70/kiwisolver-1.4.8-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5020c83e8553f770cb3b5fc13faac40f17e0b205bd237aebd21d53d733adb03", size = 1343994, upload-time = "2024-12-24T18:28:57.493Z" }, - { url = "https://files.pythonhosted.org/packages/8d/bf/b4b169b050c8421a7c53ea1ea74e4ef9c335ee9013216c558a047f162d20/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dace81d28c787956bfbfbbfd72fdcef014f37d9b48830829e488fdb32b49d954", size = 1434804, upload-time = "2024-12-24T18:29:00.077Z" }, - { url = "https://files.pythonhosted.org/packages/66/5a/e13bd341fbcf73325ea60fdc8af752addf75c5079867af2e04cc41f34434/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11e1022b524bd48ae56c9b4f9296bce77e15a2e42a502cceba602f804b32bb79", size = 1450690, upload-time = "2024-12-24T18:29:01.401Z" }, - { url = "https://files.pythonhosted.org/packages/9b/4f/5955dcb376ba4a830384cc6fab7d7547bd6759fe75a09564910e9e3bb8ea/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b9b4d2892fefc886f30301cdd80debd8bb01ecdf165a449eb6e78f79f0fabd6", size = 1376839, upload-time = "2024-12-24T18:29:02.685Z" }, - { url = "https://files.pythonhosted.org/packages/3a/97/5edbed69a9d0caa2e4aa616ae7df8127e10f6586940aa683a496c2c280b9/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a96c0e790ee875d65e340ab383700e2b4891677b7fcd30a699146f9384a2bb0", size = 1435109, upload-time = "2024-12-24T18:29:04.113Z" }, - { url = "https://files.pythonhosted.org/packages/13/fc/e756382cb64e556af6c1809a1bbb22c141bbc2445049f2da06b420fe52bf/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:23454ff084b07ac54ca8be535f4174170c1094a4cff78fbae4f73a4bcc0d4dab", size = 2245269, upload-time = "2024-12-24T18:29:05.488Z" }, - { url = "https://files.pythonhosted.org/packages/76/15/e59e45829d7f41c776d138245cabae6515cb4eb44b418f6d4109c478b481/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:87b287251ad6488e95b4f0b4a79a6d04d3ea35fde6340eb38fbd1ca9cd35bbbc", size = 2393468, upload-time = "2024-12-24T18:29:06.79Z" }, - { url = "https://files.pythonhosted.org/packages/e9/39/483558c2a913ab8384d6e4b66a932406f87c95a6080112433da5ed668559/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:b21dbe165081142b1232a240fc6383fd32cdd877ca6cc89eab93e5f5883e1c25", size = 2355394, upload-time = "2024-12-24T18:29:08.24Z" }, - { url = "https://files.pythonhosted.org/packages/01/aa/efad1fbca6570a161d29224f14b082960c7e08268a133fe5dc0f6906820e/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:768cade2c2df13db52475bd28d3a3fac8c9eff04b0e9e2fda0f3760f20b3f7fc", size = 2490901, upload-time = "2024-12-24T18:29:09.653Z" }, - { url = "https://files.pythonhosted.org/packages/c9/4f/15988966ba46bcd5ab9d0c8296914436720dd67fca689ae1a75b4ec1c72f/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d47cfb2650f0e103d4bf68b0b5804c68da97272c84bb12850d877a95c056bd67", size = 2312306, upload-time = "2024-12-24T18:29:12.644Z" }, - { url = "https://files.pythonhosted.org/packages/2d/27/bdf1c769c83f74d98cbc34483a972f221440703054894a37d174fba8aa68/kiwisolver-1.4.8-cp311-cp311-win_amd64.whl", hash = "sha256:ed33ca2002a779a2e20eeb06aea7721b6e47f2d4b8a8ece979d8ba9e2a167e34", size = 71966, 
upload-time = "2024-12-24T18:29:14.089Z" }, - { url = "https://files.pythonhosted.org/packages/4a/c9/9642ea855604aeb2968a8e145fc662edf61db7632ad2e4fb92424be6b6c0/kiwisolver-1.4.8-cp311-cp311-win_arm64.whl", hash = "sha256:16523b40aab60426ffdebe33ac374457cf62863e330a90a0383639ce14bf44b2", size = 65311, upload-time = "2024-12-24T18:29:15.892Z" }, { url = "https://files.pythonhosted.org/packages/fc/aa/cea685c4ab647f349c3bc92d2daf7ae34c8e8cf405a6dcd3a497f58a2ac3/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d6af5e8815fd02997cb6ad9bbed0ee1e60014438ee1a5c2444c96f87b8843502", size = 124152, upload-time = "2024-12-24T18:29:16.85Z" }, { url = "https://files.pythonhosted.org/packages/c5/0b/8db6d2e2452d60d5ebc4ce4b204feeb16176a851fd42462f66ade6808084/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bade438f86e21d91e0cf5dd7c0ed00cda0f77c8c1616bd83f9fc157fa6760d31", size = 66555, upload-time = "2024-12-24T18:29:19.146Z" }, { url = "https://files.pythonhosted.org/packages/60/26/d6a0db6785dd35d3ba5bf2b2df0aedc5af089962c6eb2cbf67a15b81369e/kiwisolver-1.4.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b83dc6769ddbc57613280118fb4ce3cd08899cc3369f7d0e0fab518a7cf37fdb", size = 65067, upload-time = "2024-12-24T18:29:20.096Z" }, @@ -868,12 +899,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/f9/27e94c1b3eb29e6933b6986ffc5fa1177d2cd1f0c8efc5f02c91c9ac61de/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:151dffc4865e5fe6dafce5480fab84f950d14566c480c08a53c663a0020504b6", size = 2390661, upload-time = "2024-12-24T18:30:34.939Z" }, { url = "https://files.pythonhosted.org/packages/d9/d4/3c9735faa36ac591a4afcc2980d2691000506050b7a7e80bcfe44048daa7/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:577facaa411c10421314598b50413aa1ebcf5126f704f1e5d72d7e4e9f020d90", size = 2546710, upload-time = "2024-12-24T18:30:37.281Z" }, { url = "https://files.pythonhosted.org/packages/4c/fa/be89a49c640930180657482a74970cdcf6f7072c8d2471e1babe17a222dc/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:be4816dc51c8a471749d664161b434912eee82f2ea66bd7628bd14583a833e85", size = 2349213, upload-time = "2024-12-24T18:30:40.019Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f9/ae81c47a43e33b93b0a9819cac6723257f5da2a5a60daf46aa5c7226ea85/kiwisolver-1.4.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:e7a019419b7b510f0f7c9dceff8c5eae2392037eae483a7f9162625233802b0a", size = 60403, upload-time = "2024-12-24T18:30:41.372Z" }, - { url = "https://files.pythonhosted.org/packages/58/ca/f92b5cb6f4ce0c1ebfcfe3e2e42b96917e16f7090e45b21102941924f18f/kiwisolver-1.4.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:286b18e86682fd2217a48fc6be6b0f20c1d0ed10958d8dc53453ad58d7be0bf8", size = 58657, upload-time = "2024-12-24T18:30:42.392Z" }, - { url = "https://files.pythonhosted.org/packages/80/28/ae0240f732f0484d3a4dc885d055653c47144bdf59b670aae0ec3c65a7c8/kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4191ee8dfd0be1c3666ccbac178c5a05d5f8d689bbe3fc92f3c4abec817f8fe0", size = 84948, upload-time = "2024-12-24T18:30:44.703Z" }, - { url = "https://files.pythonhosted.org/packages/5d/eb/78d50346c51db22c7203c1611f9b513075f35c4e0e4877c5dde378d66043/kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:7cd2785b9391f2873ad46088ed7599a6a71e762e1ea33e87514b1a441ed1da1c", size = 81186, upload-time = "2024-12-24T18:30:45.654Z" }, - { url = "https://files.pythonhosted.org/packages/43/f8/7259f18c77adca88d5f64f9a522792e178b2691f3748817a8750c2d216ef/kiwisolver-1.4.8-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c07b29089b7ba090b6f1a669f1411f27221c3662b3a1b7010e67b59bb5a6f10b", size = 80279, upload-time = "2024-12-24T18:30:47.951Z" }, - { url = "https://files.pythonhosted.org/packages/3a/1d/50ad811d1c5dae091e4cf046beba925bcae0a610e79ae4c538f996f63ed5/kiwisolver-1.4.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:65ea09a5a3faadd59c2ce96dc7bf0f364986a315949dc6374f04396b0d60e09b", size = 71762, upload-time = "2024-12-24T18:30:48.903Z" }, ] [[package]] @@ -905,18 +930,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/26/91/d49359a21893183ed2a5b6c76bec40e0b1dcbf8ca148f864d134897cfc75/matplotlib-3.10.3.tar.gz", hash = "sha256:2f82d2c5bb7ae93aaaa4cd42aca65d76ce6376f83304fa3a630b569aca274df0", size = 34799811, upload-time = "2025-05-08T19:10:54.39Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d0/ea/2bba25d289d389c7451f331ecd593944b3705f06ddf593fa7be75037d308/matplotlib-3.10.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:213fadd6348d106ca7db99e113f1bea1e65e383c3ba76e8556ba4a3054b65ae7", size = 8167862, upload-time = "2025-05-08T19:09:39.563Z" }, - { url = "https://files.pythonhosted.org/packages/41/81/cc70b5138c926604e8c9ed810ed4c79e8116ba72e02230852f5c12c87ba2/matplotlib-3.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3bec61cb8221f0ca6313889308326e7bb303d0d302c5cc9e523b2f2e6c73deb", size = 8042149, upload-time = "2025-05-08T19:09:42.413Z" }, - { url = "https://files.pythonhosted.org/packages/4a/9a/0ff45b6bfa42bb16de597e6058edf2361c298ad5ef93b327728145161bbf/matplotlib-3.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c21ae75651c0231b3ba014b6d5e08fb969c40cdb5a011e33e99ed0c9ea86ecb", size = 8453719, upload-time = "2025-05-08T19:09:44.901Z" }, - { url = "https://files.pythonhosted.org/packages/85/c7/1866e972fed6d71ef136efbc980d4d1854ab7ef1ea8152bbd995ca231c81/matplotlib-3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a49e39755580b08e30e3620efc659330eac5d6534ab7eae50fa5e31f53ee4e30", size = 8590801, upload-time = "2025-05-08T19:09:47.404Z" }, - { url = "https://files.pythonhosted.org/packages/5d/b9/748f6626d534ab7e255bdc39dc22634d337cf3ce200f261b5d65742044a1/matplotlib-3.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cf4636203e1190871d3a73664dea03d26fb019b66692cbfd642faafdad6208e8", size = 9402111, upload-time = "2025-05-08T19:09:49.474Z" }, - { url = "https://files.pythonhosted.org/packages/1f/78/8bf07bd8fb67ea5665a6af188e70b57fcb2ab67057daa06b85a08e59160a/matplotlib-3.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:fd5641a9bb9d55f4dd2afe897a53b537c834b9012684c8444cc105895c8c16fd", size = 8057213, upload-time = "2025-05-08T19:09:51.489Z" }, - { url = "https://files.pythonhosted.org/packages/f5/bd/af9f655456f60fe1d575f54fb14704ee299b16e999704817a7645dfce6b0/matplotlib-3.10.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0ef061f74cd488586f552d0c336b2f078d43bc00dc473d2c3e7bfee2272f3fa8", size = 8178873, upload-time = "2025-05-08T19:09:53.857Z" }, - { url = 
"https://files.pythonhosted.org/packages/c2/86/e1c86690610661cd716eda5f9d0b35eaf606ae6c9b6736687cfc8f2d0cd8/matplotlib-3.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d96985d14dc5f4a736bbea4b9de9afaa735f8a0fc2ca75be2fa9e96b2097369d", size = 8052205, upload-time = "2025-05-08T19:09:55.684Z" }, - { url = "https://files.pythonhosted.org/packages/54/51/a9f8e49af3883dacddb2da1af5fca1f7468677f1188936452dd9aaaeb9ed/matplotlib-3.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5f0283da91e9522bdba4d6583ed9d5521566f63729ffb68334f86d0bb98049", size = 8465823, upload-time = "2025-05-08T19:09:57.442Z" }, - { url = "https://files.pythonhosted.org/packages/e7/e3/c82963a3b86d6e6d5874cbeaa390166458a7f1961bab9feb14d3d1a10f02/matplotlib-3.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdfa07c0ec58035242bc8b2c8aae37037c9a886370eef6850703d7583e19964b", size = 8606464, upload-time = "2025-05-08T19:09:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/0e/34/24da1027e7fcdd9e82da3194c470143c551852757a4b473a09a012f5b945/matplotlib-3.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c0b9849a17bce080a16ebcb80a7b714b5677d0ec32161a2cc0a8e5a6030ae220", size = 9413103, upload-time = "2025-05-08T19:10:03.208Z" }, - { url = "https://files.pythonhosted.org/packages/a6/da/948a017c3ea13fd4a97afad5fdebe2f5bbc4d28c0654510ce6fd6b06b7bd/matplotlib-3.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:eef6ed6c03717083bc6d69c2d7ee8624205c29a8e6ea5a31cd3492ecdbaee1e1", size = 8065492, upload-time = "2025-05-08T19:10:05.271Z" }, { url = "https://files.pythonhosted.org/packages/eb/43/6b80eb47d1071f234ef0c96ca370c2ca621f91c12045f1401b5c9b28a639/matplotlib-3.10.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0ab1affc11d1f495ab9e6362b8174a25afc19c081ba5b0775ef00533a4236eea", size = 8179689, upload-time = "2025-05-08T19:10:07.602Z" }, { url = "https://files.pythonhosted.org/packages/0f/70/d61a591958325c357204870b5e7b164f93f2a8cca1dc6ce940f563909a13/matplotlib-3.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2a818d8bdcafa7ed2eed74487fdb071c09c1ae24152d403952adad11fa3c65b4", size = 8050466, upload-time = "2025-05-08T19:10:09.383Z" }, { url = "https://files.pythonhosted.org/packages/e7/75/70c9d2306203148cc7902a961240c5927dd8728afedf35e6a77e105a2985/matplotlib-3.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:748ebc3470c253e770b17d8b0557f0aa85cf8c63fd52f1a61af5b27ec0b7ffee", size = 8456252, upload-time = "2025-05-08T19:10:11.958Z" }, @@ -935,9 +948,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ba/c7/473bc559beec08ebee9f86ca77a844b65747e1a6c2691e8c92e40b9f42a8/matplotlib-3.10.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6929fc618cb6db9cb75086f73b3219bbb25920cb24cee2ea7a12b04971a4158", size = 8618082, upload-time = "2025-05-08T19:10:39.892Z" }, { url = "https://files.pythonhosted.org/packages/d8/e9/6ce8edd264c8819e37bbed8172e0ccdc7107fe86999b76ab5752276357a4/matplotlib-3.10.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6c7818292a5cc372a2dc4c795e5c356942eb8350b98ef913f7fda51fe175ac5d", size = 9413699, upload-time = "2025-05-08T19:10:42.376Z" }, { url = "https://files.pythonhosted.org/packages/1b/92/9a45c91089c3cf690b5badd4be81e392ff086ccca8a1d4e3a08463d8a966/matplotlib-3.10.3-cp313-cp313t-win_amd64.whl", hash = "sha256:4f23ffe95c5667ef8a2b56eea9b53db7f43910fa4a2d5472ae0f72b64deab4d5", size = 8139044, upload-time = 
"2025-05-08T19:10:44.551Z" }, - { url = "https://files.pythonhosted.org/packages/3d/d1/f54d43e95384b312ffa4a74a4326c722f3b8187aaaa12e9a84cdf3037131/matplotlib-3.10.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:86ab63d66bbc83fdb6733471d3bff40897c1e9921cba112accd748eee4bce5e4", size = 8162896, upload-time = "2025-05-08T19:10:46.432Z" }, - { url = "https://files.pythonhosted.org/packages/24/a4/fbfc00c2346177c95b353dcf9b5a004106abe8730a62cb6f27e79df0a698/matplotlib-3.10.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:a48f9c08bf7444b5d2391a83e75edb464ccda3c380384b36532a0962593a1751", size = 8039702, upload-time = "2025-05-08T19:10:49.634Z" }, - { url = "https://files.pythonhosted.org/packages/6a/b9/59e120d24a2ec5fc2d30646adb2efb4621aab3c6d83d66fb2a7a182db032/matplotlib-3.10.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb73d8aa75a237457988f9765e4dfe1c0d2453c5ca4eabc897d4309672c8e014", size = 8594298, upload-time = "2025-05-08T19:10:51.738Z" }, ] [[package]] @@ -967,38 +977,6 @@ version = "5.1.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/47/1b/1fc6888c74cbd8abad1292dde2ddfcf8fc059e114c97dd6bf16d12f36293/mmh3-5.1.0.tar.gz", hash = "sha256:136e1e670500f177f49ec106a4ebf0adf20d18d96990cc36ea492c651d2b406c", size = 33728, upload-time = "2025-01-25T08:39:43.386Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/01/9d06468928661765c0fc248a29580c760a4a53a9c6c52cf72528bae3582e/mmh3-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:eaf4ac5c6ee18ca9232238364d7f2a213278ae5ca97897cafaa123fcc7bb8bec", size = 56095, upload-time = "2025-01-25T08:37:53.621Z" }, - { url = "https://files.pythonhosted.org/packages/e4/d7/7b39307fc9db867b2a9a20c58b0de33b778dd6c55e116af8ea031f1433ba/mmh3-5.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:48f9aa8ccb9ad1d577a16104834ac44ff640d8de8c0caed09a2300df7ce8460a", size = 40512, upload-time = "2025-01-25T08:37:54.972Z" }, - { url = "https://files.pythonhosted.org/packages/4f/85/728ca68280d8ccc60c113ad119df70ff1748fbd44c89911fed0501faf0b8/mmh3-5.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d4ba8cac21e1f2d4e436ce03a82a7f87cda80378691f760e9ea55045ec480a3d", size = 40110, upload-time = "2025-01-25T08:37:57.86Z" }, - { url = "https://files.pythonhosted.org/packages/e4/96/beaf0e301472ffa00358bbbf771fe2d9c4d709a2fe30b1d929e569f8cbdf/mmh3-5.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d69281c281cb01994f054d862a6bb02a2e7acfe64917795c58934b0872b9ece4", size = 100151, upload-time = "2025-01-25T08:37:59.609Z" }, - { url = "https://files.pythonhosted.org/packages/c3/ee/9381f825c4e09ffafeffa213c3865c4bf7d39771640de33ab16f6faeb854/mmh3-5.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4d05ed3962312fbda2a1589b97359d2467f677166952f6bd410d8c916a55febf", size = 106312, upload-time = "2025-01-25T08:38:02.102Z" }, - { url = "https://files.pythonhosted.org/packages/67/dc/350a54bea5cf397d357534198ab8119cfd0d8e8bad623b520f9c290af985/mmh3-5.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78ae6a03f4cff4aa92ddd690611168856f8c33a141bd3e5a1e0a85521dc21ea0", size = 104232, upload-time = "2025-01-25T08:38:03.852Z" }, - { url = "https://files.pythonhosted.org/packages/b2/5d/2c6eb4a4ec2f7293b98a9c07cb8c64668330b46ff2b6511244339e69a7af/mmh3-5.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:95f983535b39795d9fb7336438faae117424c6798f763d67c6624f6caf2c4c01", size = 91663, upload-time = "2025-01-25T08:38:06.24Z" }, - { url = "https://files.pythonhosted.org/packages/f1/ac/17030d24196f73ecbab8b5033591e5e0e2beca103181a843a135c78f4fee/mmh3-5.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d46fdd80d4c7ecadd9faa6181e92ccc6fe91c50991c9af0e371fdf8b8a7a6150", size = 99166, upload-time = "2025-01-25T08:38:07.988Z" }, - { url = "https://files.pythonhosted.org/packages/b9/ed/54ddc56603561a10b33da9b12e95a48a271d126f4a4951841bbd13145ebf/mmh3-5.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0f16e976af7365ea3b5c425124b2a7f0147eed97fdbb36d99857f173c8d8e096", size = 101555, upload-time = "2025-01-25T08:38:09.821Z" }, - { url = "https://files.pythonhosted.org/packages/1c/c3/33fb3a940c9b70908a5cc9fcc26534aff8698180f9f63ab6b7cc74da8bcd/mmh3-5.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:6fa97f7d1e1f74ad1565127229d510f3fd65d931fdedd707c1e15100bc9e5ebb", size = 94813, upload-time = "2025-01-25T08:38:11.682Z" }, - { url = "https://files.pythonhosted.org/packages/61/88/c9ff76a23abe34db8eee1a6fa4e449462a16c7eb547546fc5594b0860a72/mmh3-5.1.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4052fa4a8561bd62648e9eb993c8f3af3bdedadf3d9687aa4770d10e3709a80c", size = 109611, upload-time = "2025-01-25T08:38:12.602Z" }, - { url = "https://files.pythonhosted.org/packages/0b/8e/27d04f40e95554ebe782cac7bddda2d158cf3862387298c9c7b254fa7beb/mmh3-5.1.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:3f0e8ae9f961037f812afe3cce7da57abf734285961fffbeff9a4c011b737732", size = 100515, upload-time = "2025-01-25T08:38:16.407Z" }, - { url = "https://files.pythonhosted.org/packages/7b/00/504ca8f462f01048f3c87cd93f2e1f60b93dac2f930cd4ed73532a9337f5/mmh3-5.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:99297f207db967814f1f02135bb7fe7628b9eacb046134a34e1015b26b06edce", size = 100177, upload-time = "2025-01-25T08:38:18.186Z" }, - { url = "https://files.pythonhosted.org/packages/6f/1d/2efc3525fe6fdf8865972fcbb884bd1f4b0f923c19b80891cecf7e239fa5/mmh3-5.1.0-cp310-cp310-win32.whl", hash = "sha256:2e6c8dc3631a5e22007fbdb55e993b2dbce7985c14b25b572dd78403c2e79182", size = 40815, upload-time = "2025-01-25T08:38:19.176Z" }, - { url = "https://files.pythonhosted.org/packages/38/b5/c8fbe707cb0fea77a6d2d58d497bc9b67aff80deb84d20feb34d8fdd8671/mmh3-5.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:e4e8c7ad5a4dddcfde35fd28ef96744c1ee0f9d9570108aa5f7e77cf9cfdf0bf", size = 41479, upload-time = "2025-01-25T08:38:21.098Z" }, - { url = "https://files.pythonhosted.org/packages/a1/f1/663e16134f913fccfbcea5b300fb7dc1860d8f63dc71867b013eebc10aec/mmh3-5.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:45da549269883208912868a07d0364e1418d8292c4259ca11699ba1b2475bd26", size = 38883, upload-time = "2025-01-25T08:38:22.013Z" }, - { url = "https://files.pythonhosted.org/packages/56/09/fda7af7fe65928262098382e3bf55950cfbf67d30bf9e47731bf862161e9/mmh3-5.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0b529dcda3f951ff363a51d5866bc6d63cf57f1e73e8961f864ae5010647079d", size = 56098, upload-time = "2025-01-25T08:38:22.917Z" }, - { url = "https://files.pythonhosted.org/packages/0c/ab/84c7bc3f366d6f3bd8b5d9325a10c367685bc17c26dac4c068e2001a4671/mmh3-5.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db1079b3ace965e562cdfc95847312f9273eb2ad3ebea983435c8423e06acd7", size = 40513, upload-time = "2025-01-25T08:38:25.079Z" }, 
- { url = "https://files.pythonhosted.org/packages/4f/21/25ea58ca4a652bdc83d1528bec31745cce35802381fb4fe3c097905462d2/mmh3-5.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:22d31e3a0ff89b8eb3b826d6fc8e19532998b2aa6b9143698043a1268da413e1", size = 40112, upload-time = "2025-01-25T08:38:25.947Z" }, - { url = "https://files.pythonhosted.org/packages/bd/78/4f12f16ae074ddda6f06745254fdb50f8cf3c85b0bbf7eaca58bed84bf58/mmh3-5.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2139bfbd354cd6cb0afed51c4b504f29bcd687a3b1460b7e89498329cc28a894", size = 102632, upload-time = "2025-01-25T08:38:26.939Z" }, - { url = "https://files.pythonhosted.org/packages/48/11/8f09dc999cf2a09b6138d8d7fc734efb7b7bfdd9adb9383380941caadff0/mmh3-5.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c8105c6a435bc2cd6ea2ef59558ab1a2976fd4a4437026f562856d08996673a", size = 108884, upload-time = "2025-01-25T08:38:29.159Z" }, - { url = "https://files.pythonhosted.org/packages/bd/91/e59a66538a3364176f6c3f7620eee0ab195bfe26f89a95cbcc7a1fb04b28/mmh3-5.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57730067174a7f36fcd6ce012fe359bd5510fdaa5fe067bc94ed03e65dafb769", size = 106835, upload-time = "2025-01-25T08:38:33.04Z" }, - { url = "https://files.pythonhosted.org/packages/25/14/b85836e21ab90e5cddb85fe79c494ebd8f81d96a87a664c488cc9277668b/mmh3-5.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bde80eb196d7fdc765a318604ded74a4378f02c5b46c17aa48a27d742edaded2", size = 93688, upload-time = "2025-01-25T08:38:34.987Z" }, - { url = "https://files.pythonhosted.org/packages/ac/aa/8bc964067df9262740c95e4cde2d19f149f2224f426654e14199a9e47df6/mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9c8eddcb441abddeb419c16c56fd74b3e2df9e57f7aa2903221996718435c7a", size = 101569, upload-time = "2025-01-25T08:38:35.983Z" }, - { url = "https://files.pythonhosted.org/packages/70/b6/1fb163cbf919046a64717466c00edabebece3f95c013853fec76dbf2df92/mmh3-5.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:99e07e4acafbccc7a28c076a847fb060ffc1406036bc2005acb1b2af620e53c3", size = 98483, upload-time = "2025-01-25T08:38:38.198Z" }, - { url = "https://files.pythonhosted.org/packages/70/49/ba64c050dd646060f835f1db6b2cd60a6485f3b0ea04976e7a29ace7312e/mmh3-5.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9e25ba5b530e9a7d65f41a08d48f4b3fedc1e89c26486361166a5544aa4cad33", size = 96496, upload-time = "2025-01-25T08:38:39.257Z" }, - { url = "https://files.pythonhosted.org/packages/9e/07/f2751d6a0b535bb865e1066e9c6b80852571ef8d61bce7eb44c18720fbfc/mmh3-5.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:bb9bf7475b4d99156ce2f0cf277c061a17560c8c10199c910a680869a278ddc7", size = 105109, upload-time = "2025-01-25T08:38:40.395Z" }, - { url = "https://files.pythonhosted.org/packages/b7/02/30360a5a66f7abba44596d747cc1e6fb53136b168eaa335f63454ab7bb79/mmh3-5.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2a1b0878dd281ea3003368ab53ff6f568e175f1b39f281df1da319e58a19c23a", size = 98231, upload-time = "2025-01-25T08:38:42.141Z" }, - { url = "https://files.pythonhosted.org/packages/8c/60/8526b0c750ff4d7ae1266e68b795f14b97758a1d9fcc19f6ecabf9c55656/mmh3-5.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:25f565093ac8b8aefe0f61f8f95c9a9d11dd69e6a9e9832ff0d293511bc36258", size = 97548, upload-time = 
"2025-01-25T08:38:43.402Z" }, - { url = "https://files.pythonhosted.org/packages/6d/4c/26e1222aca65769280d5427a1ce5875ef4213449718c8f03958d0bf91070/mmh3-5.1.0-cp311-cp311-win32.whl", hash = "sha256:1e3554d8792387eac73c99c6eaea0b3f884e7130eb67986e11c403e4f9b6d372", size = 40810, upload-time = "2025-01-25T08:38:45.143Z" }, - { url = "https://files.pythonhosted.org/packages/98/d5/424ba95062d1212ea615dc8debc8d57983f2242d5e6b82e458b89a117a1e/mmh3-5.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:8ad777a48197882492af50bf3098085424993ce850bdda406a358b6ab74be759", size = 41476, upload-time = "2025-01-25T08:38:46.029Z" }, - { url = "https://files.pythonhosted.org/packages/bd/08/0315ccaf087ba55bb19a6dd3b1e8acd491e74ce7f5f9c4aaa06a90d66441/mmh3-5.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f29dc4efd99bdd29fe85ed6c81915b17b2ef2cf853abf7213a48ac6fb3eaabe1", size = 38880, upload-time = "2025-01-25T08:38:47.035Z" }, { url = "https://files.pythonhosted.org/packages/f4/47/e5f452bdf16028bfd2edb4e2e35d0441e4a4740f30e68ccd4cfd2fb2c57e/mmh3-5.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:45712987367cb9235026e3cbf4334670522a97751abfd00b5bc8bfa022c3311d", size = 56152, upload-time = "2025-01-25T08:38:47.902Z" }, { url = "https://files.pythonhosted.org/packages/60/38/2132d537dc7a7fdd8d2e98df90186c7fcdbd3f14f95502a24ba443c92245/mmh3-5.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b1020735eb35086ab24affbea59bb9082f7f6a0ad517cb89f0fc14f16cea4dae", size = 40564, upload-time = "2025-01-25T08:38:48.839Z" }, { url = "https://files.pythonhosted.org/packages/c0/2a/c52cf000581bfb8d94794f58865658e7accf2fa2e90789269d4ae9560b16/mmh3-5.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:babf2a78ce5513d120c358722a2e3aa7762d6071cd10cede026f8b32452be322", size = 40104, upload-time = "2025-01-25T08:38:49.773Z" }, @@ -1033,49 +1011,40 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/71/4ad9a42f2772793a03cb698f0fc42499f04e6e8d2560ba2f7da0fb059a8e/mmh3-5.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:b22fe2e54be81f6c07dcb36b96fa250fb72effe08aa52fbb83eade6e1e2d5fd7", size = 38890, upload-time = "2025-01-25T08:39:25.28Z" }, ] +[[package]] +name = "msgpack" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/45/b1/ea4f68038a18c77c9467400d166d74c4ffa536f34761f7983a104357e614/msgpack-1.1.1.tar.gz", hash = "sha256:77b79ce34a2bdab2594f490c8e80dd62a02d650b91a75159a63ec413b8d104cd", size = 173555, upload-time = "2025-06-13T06:52:51.324Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/26/389b9c593eda2b8551b2e7126ad3a06af6f9b44274eb3a4f054d48ff7e47/msgpack-1.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ae497b11f4c21558d95de9f64fff7053544f4d1a17731c866143ed6bb4591238", size = 82359, upload-time = "2025-06-13T06:52:03.909Z" }, + { url = "https://files.pythonhosted.org/packages/ab/65/7d1de38c8a22cf8b1551469159d4b6cf49be2126adc2482de50976084d78/msgpack-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:33be9ab121df9b6b461ff91baac6f2731f83d9b27ed948c5b9d1978ae28bf157", size = 79172, upload-time = "2025-06-13T06:52:05.246Z" }, + { url = "https://files.pythonhosted.org/packages/0f/bd/cacf208b64d9577a62c74b677e1ada005caa9b69a05a599889d6fc2ab20a/msgpack-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f64ae8fe7ffba251fecb8408540c34ee9df1c26674c50c4544d72dbf792e5ce", size = 425013, upload-time = "2025-06-13T06:52:06.341Z" }, + { url = 
"https://files.pythonhosted.org/packages/4d/ec/fd869e2567cc9c01278a736cfd1697941ba0d4b81a43e0aa2e8d71dab208/msgpack-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a494554874691720ba5891c9b0b39474ba43ffb1aaf32a5dac874effb1619e1a", size = 426905, upload-time = "2025-06-13T06:52:07.501Z" }, + { url = "https://files.pythonhosted.org/packages/55/2a/35860f33229075bce803a5593d046d8b489d7ba2fc85701e714fc1aaf898/msgpack-1.1.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb643284ab0ed26f6957d969fe0dd8bb17beb567beb8998140b5e38a90974f6c", size = 407336, upload-time = "2025-06-13T06:52:09.047Z" }, + { url = "https://files.pythonhosted.org/packages/8c/16/69ed8f3ada150bf92745fb4921bd621fd2cdf5a42e25eb50bcc57a5328f0/msgpack-1.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d275a9e3c81b1093c060c3837e580c37f47c51eca031f7b5fb76f7b8470f5f9b", size = 409485, upload-time = "2025-06-13T06:52:10.382Z" }, + { url = "https://files.pythonhosted.org/packages/c6/b6/0c398039e4c6d0b2e37c61d7e0e9d13439f91f780686deb8ee64ecf1ae71/msgpack-1.1.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4fd6b577e4541676e0cc9ddc1709d25014d3ad9a66caa19962c4f5de30fc09ef", size = 412182, upload-time = "2025-06-13T06:52:11.644Z" }, + { url = "https://files.pythonhosted.org/packages/b8/d0/0cf4a6ecb9bc960d624c93effaeaae75cbf00b3bc4a54f35c8507273cda1/msgpack-1.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb29aaa613c0a1c40d1af111abf025f1732cab333f96f285d6a93b934738a68a", size = 419883, upload-time = "2025-06-13T06:52:12.806Z" }, + { url = "https://files.pythonhosted.org/packages/62/83/9697c211720fa71a2dfb632cad6196a8af3abea56eece220fde4674dc44b/msgpack-1.1.1-cp312-cp312-win32.whl", hash = "sha256:870b9a626280c86cff9c576ec0d9cbcc54a1e5ebda9cd26dab12baf41fee218c", size = 65406, upload-time = "2025-06-13T06:52:14.271Z" }, + { url = "https://files.pythonhosted.org/packages/c0/23/0abb886e80eab08f5e8c485d6f13924028602829f63b8f5fa25a06636628/msgpack-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:5692095123007180dca3e788bb4c399cc26626da51629a31d40207cb262e67f4", size = 72558, upload-time = "2025-06-13T06:52:15.252Z" }, + { url = "https://files.pythonhosted.org/packages/a1/38/561f01cf3577430b59b340b51329803d3a5bf6a45864a55f4ef308ac11e3/msgpack-1.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3765afa6bd4832fc11c3749be4ba4b69a0e8d7b728f78e68120a157a4c5d41f0", size = 81677, upload-time = "2025-06-13T06:52:16.64Z" }, + { url = "https://files.pythonhosted.org/packages/09/48/54a89579ea36b6ae0ee001cba8c61f776451fad3c9306cd80f5b5c55be87/msgpack-1.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8ddb2bcfd1a8b9e431c8d6f4f7db0773084e107730ecf3472f1dfe9ad583f3d9", size = 78603, upload-time = "2025-06-13T06:52:17.843Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/daba2699b308e95ae792cdc2ef092a38eb5ee422f9d2fbd4101526d8a210/msgpack-1.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:196a736f0526a03653d829d7d4c5500a97eea3648aebfd4b6743875f28aa2af8", size = 420504, upload-time = "2025-06-13T06:52:18.982Z" }, + { url = "https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d592d06e3cc2f537ceeeb23d38799c6ad83255289bb84c2e5792e5a8dea268a", size = 423749, upload-time = "2025-06-13T06:52:20.211Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/1b/54c08dd5452427e1179a40b4b607e37e2664bca1c790c60c442c8e972e47/msgpack-1.1.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4df2311b0ce24f06ba253fda361f938dfecd7b961576f9be3f3fbd60e87130ac", size = 404458, upload-time = "2025-06-13T06:52:21.429Z" }, + { url = "https://files.pythonhosted.org/packages/2e/60/6bb17e9ffb080616a51f09928fdd5cac1353c9becc6c4a8abd4e57269a16/msgpack-1.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e4141c5a32b5e37905b5940aacbc59739f036930367d7acce7a64e4dec1f5e0b", size = 405976, upload-time = "2025-06-13T06:52:22.995Z" }, + { url = "https://files.pythonhosted.org/packages/ee/97/88983e266572e8707c1f4b99c8fd04f9eb97b43f2db40e3172d87d8642db/msgpack-1.1.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b1ce7f41670c5a69e1389420436f41385b1aa2504c3b0c30620764b15dded2e7", size = 408607, upload-time = "2025-06-13T06:52:24.152Z" }, + { url = "https://files.pythonhosted.org/packages/bc/66/36c78af2efaffcc15a5a61ae0df53a1d025f2680122e2a9eb8442fed3ae4/msgpack-1.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4147151acabb9caed4e474c3344181e91ff7a388b888f1e19ea04f7e73dc7ad5", size = 424172, upload-time = "2025-06-13T06:52:25.704Z" }, + { url = "https://files.pythonhosted.org/packages/8c/87/a75eb622b555708fe0427fab96056d39d4c9892b0c784b3a721088c7ee37/msgpack-1.1.1-cp313-cp313-win32.whl", hash = "sha256:500e85823a27d6d9bba1d057c871b4210c1dd6fb01fbb764e37e4e8847376323", size = 65347, upload-time = "2025-06-13T06:52:26.846Z" }, + { url = "https://files.pythonhosted.org/packages/ca/91/7dc28d5e2a11a5ad804cf2b7f7a5fcb1eb5a4966d66a5d2b41aee6376543/msgpack-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:6d489fba546295983abd142812bda76b57e33d0b9f5d5b71c09a583285506f69", size = 72341, upload-time = "2025-06-13T06:52:27.835Z" }, +] + [[package]] name = "multidict" version = "6.4.4" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, -] sdist = { url = "https://files.pythonhosted.org/packages/91/2f/a3470242707058fe856fe59241eee5635d79087100b7042a867368863a27/multidict-6.4.4.tar.gz", hash = "sha256:69ee9e6ba214b5245031b76233dd95408a0fd57fdb019ddcc1ead4790932a8e8", size = 90183, upload-time = "2025-05-19T14:16:37.381Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/92/0926a5baafa164b5d0ade3cd7932be39310375d7e25c9d7ceca05cb26a45/multidict-6.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8adee3ac041145ffe4488ea73fa0a622b464cc25340d98be76924d0cda8545ff", size = 66052, upload-time = "2025-05-19T14:13:49.944Z" }, - { url = "https://files.pythonhosted.org/packages/b2/54/8a857ae4f8f643ec444d91f419fdd49cc7a90a2ca0e42d86482b604b63bd/multidict-6.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b61e98c3e2a861035aaccd207da585bdcacef65fe01d7a0d07478efac005e028", size = 38867, upload-time = "2025-05-19T14:13:51.92Z" }, - { url = "https://files.pythonhosted.org/packages/9e/5f/63add9069f945c19bc8b217ea6b0f8a1ad9382eab374bb44fae4354b3baf/multidict-6.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:75493f28dbadecdbb59130e74fe935288813301a8554dc32f0c631b6bdcdf8b0", size = 38138, upload-time = "2025-05-19T14:13:53.778Z" }, - { url = "https://files.pythonhosted.org/packages/97/8b/fbd9c0fc13966efdb4a47f5bcffff67a4f2a3189fbeead5766eaa4250b20/multidict-6.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4ffc3c6a37e048b5395ee235e4a2a0d639c2349dffa32d9367a42fc20d399772", size = 220433, upload-time = "2025-05-19T14:13:55.346Z" }, - { url = "https://files.pythonhosted.org/packages/a9/c4/5132b2d75b3ea2daedb14d10f91028f09f74f5b4d373b242c1b8eec47571/multidict-6.4.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:87cb72263946b301570b0f63855569a24ee8758aaae2cd182aae7d95fbc92ca7", size = 218059, upload-time = "2025-05-19T14:13:56.993Z" }, - { url = "https://files.pythonhosted.org/packages/1a/70/f1e818c7a29b908e2d7b4fafb1d7939a41c64868e79de2982eea0a13193f/multidict-6.4.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bbf7bd39822fd07e3609b6b4467af4c404dd2b88ee314837ad1830a7f4a8299", size = 231120, upload-time = "2025-05-19T14:13:58.333Z" }, - { url = "https://files.pythonhosted.org/packages/b4/7e/95a194d85f27d5ef9cbe48dff9ded722fc6d12fedf641ec6e1e680890be7/multidict-6.4.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1f7cbd4f1f44ddf5fd86a8675b7679176eae770f2fc88115d6dddb6cefb59bc", size = 227457, upload-time = "2025-05-19T14:13:59.663Z" }, - { url = "https://files.pythonhosted.org/packages/25/2b/590ad220968d1babb42f265debe7be5c5c616df6c5688c995a06d8a9b025/multidict-6.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb5ac9e5bfce0e6282e7f59ff7b7b9a74aa8e5c60d38186a4637f5aa764046ad", size = 219111, upload-time = "2025-05-19T14:14:01.019Z" }, - { url = "https://files.pythonhosted.org/packages/e0/f0/b07682b995d3fb5313f339b59d7de02db19ba0c02d1f77c27bdf8212d17c/multidict-6.4.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4efc31dfef8c4eeb95b6b17d799eedad88c4902daba39ce637e23a17ea078915", size = 213012, upload-time = "2025-05-19T14:14:02.396Z" }, - { url = "https://files.pythonhosted.org/packages/24/56/c77b5f36feef2ec92f1119756e468ac9c3eebc35aa8a4c9e51df664cbbc9/multidict-6.4.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9fcad2945b1b91c29ef2b4050f590bfcb68d8ac8e0995a74e659aa57e8d78e01", size = 225408, upload-time = "2025-05-19T14:14:04.826Z" }, - { url = "https://files.pythonhosted.org/packages/cc/b3/e8189b82af9b198b47bc637766208fc917189eea91d674bad417e657bbdf/multidict-6.4.4-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:d877447e7368c7320832acb7159557e49b21ea10ffeb135c1077dbbc0816b598", size = 214396, upload-time = "2025-05-19T14:14:06.187Z" }, - { url = "https://files.pythonhosted.org/packages/20/e0/200d14c84e35ae13ee99fd65dc106e1a1acb87a301f15e906fc7d5b30c17/multidict-6.4.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:33a12ebac9f380714c298cbfd3e5b9c0c4e89c75fe612ae496512ee51028915f", size = 222237, upload-time = "2025-05-19T14:14:07.778Z" }, - { url = "https://files.pythonhosted.org/packages/13/f3/bb3df40045ca8262694a3245298732ff431dc781414a89a6a364ebac6840/multidict-6.4.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:0f14ea68d29b43a9bf37953881b1e3eb75b2739e896ba4a6aa4ad4c5b9ffa145", size = 231425, upload-time = "2025-05-19T14:14:09.516Z" }, - { url = "https://files.pythonhosted.org/packages/85/3b/538563dc18514384dac169bcba938753ad9ab4d4c8d49b55d6ae49fb2579/multidict-6.4.4-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0327ad2c747a6600e4797d115d3c38a220fdb28e54983abe8964fd17e95ae83c", size = 226251, upload-time = "2025-05-19T14:14:10.82Z" }, - { url = 
"https://files.pythonhosted.org/packages/56/79/77e1a65513f09142358f1beb1d4cbc06898590b34a7de2e47023e3c5a3a2/multidict-6.4.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d1a20707492db9719a05fc62ee215fd2c29b22b47c1b1ba347f9abc831e26683", size = 220363, upload-time = "2025-05-19T14:14:12.638Z" }, - { url = "https://files.pythonhosted.org/packages/16/57/67b0516c3e348f8daaa79c369b3de4359a19918320ab82e2e586a1c624ef/multidict-6.4.4-cp310-cp310-win32.whl", hash = "sha256:d83f18315b9fca5db2452d1881ef20f79593c4aa824095b62cb280019ef7aa3d", size = 35175, upload-time = "2025-05-19T14:14:14.805Z" }, - { url = "https://files.pythonhosted.org/packages/86/5a/4ed8fec642d113fa653777cda30ef67aa5c8a38303c091e24c521278a6c6/multidict-6.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:9c17341ee04545fd962ae07330cb5a39977294c883485c8d74634669b1f7fe04", size = 38678, upload-time = "2025-05-19T14:14:16.949Z" }, - { url = "https://files.pythonhosted.org/packages/19/1b/4c6e638195851524a63972c5773c7737bea7e47b1ba402186a37773acee2/multidict-6.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4f5f29794ac0e73d2a06ac03fd18870adc0135a9d384f4a306a951188ed02f95", size = 65515, upload-time = "2025-05-19T14:14:19.767Z" }, - { url = "https://files.pythonhosted.org/packages/25/d5/10e6bca9a44b8af3c7f920743e5fc0c2bcf8c11bf7a295d4cfe00b08fb46/multidict-6.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c04157266344158ebd57b7120d9b0b35812285d26d0e78193e17ef57bfe2979a", size = 38609, upload-time = "2025-05-19T14:14:21.538Z" }, - { url = "https://files.pythonhosted.org/packages/26/b4/91fead447ccff56247edc7f0535fbf140733ae25187a33621771ee598a18/multidict-6.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb61ffd3ab8310d93427e460f565322c44ef12769f51f77277b4abad7b6f7223", size = 37871, upload-time = "2025-05-19T14:14:22.666Z" }, - { url = "https://files.pythonhosted.org/packages/3b/37/cbc977cae59277e99d15bbda84cc53b5e0c4929ffd91d958347200a42ad0/multidict-6.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e0ba18a9afd495f17c351d08ebbc4284e9c9f7971d715f196b79636a4d0de44", size = 226661, upload-time = "2025-05-19T14:14:24.124Z" }, - { url = "https://files.pythonhosted.org/packages/15/cd/7e0b57fbd4dc2fc105169c4ecce5be1a63970f23bb4ec8c721b67e11953d/multidict-6.4.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9faf1b1dcaadf9f900d23a0e6d6c8eadd6a95795a0e57fcca73acce0eb912065", size = 223422, upload-time = "2025-05-19T14:14:25.437Z" }, - { url = "https://files.pythonhosted.org/packages/f1/01/1de268da121bac9f93242e30cd3286f6a819e5f0b8896511162d6ed4bf8d/multidict-6.4.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a4d1cb1327c6082c4fce4e2a438483390964c02213bc6b8d782cf782c9b1471f", size = 235447, upload-time = "2025-05-19T14:14:26.793Z" }, - { url = "https://files.pythonhosted.org/packages/d2/8c/8b9a5e4aaaf4f2de14e86181a3a3d7b105077f668b6a06f043ec794f684c/multidict-6.4.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:941f1bec2f5dbd51feeb40aea654c2747f811ab01bdd3422a48a4e4576b7d76a", size = 231455, upload-time = "2025-05-19T14:14:28.149Z" }, - { url = "https://files.pythonhosted.org/packages/35/db/e1817dcbaa10b319c412769cf999b1016890849245d38905b73e9c286862/multidict-6.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5f8a146184da7ea12910a4cec51ef85e44f6268467fb489c3caf0cd512f29c2", size = 223666, upload-time = "2025-05-19T14:14:29.584Z" }, - { url = 
"https://files.pythonhosted.org/packages/4a/e1/66e8579290ade8a00e0126b3d9a93029033ffd84f0e697d457ed1814d0fc/multidict-6.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:232b7237e57ec3c09be97206bfb83a0aa1c5d7d377faa019c68a210fa35831f1", size = 217392, upload-time = "2025-05-19T14:14:30.961Z" }, - { url = "https://files.pythonhosted.org/packages/7b/6f/f8639326069c24a48c7747c2a5485d37847e142a3f741ff3340c88060a9a/multidict-6.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:55ae0721c1513e5e3210bca4fc98456b980b0c2c016679d3d723119b6b202c42", size = 228969, upload-time = "2025-05-19T14:14:32.672Z" }, - { url = "https://files.pythonhosted.org/packages/d2/c3/3d58182f76b960eeade51c89fcdce450f93379340457a328e132e2f8f9ed/multidict-6.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:51d662c072579f63137919d7bb8fc250655ce79f00c82ecf11cab678f335062e", size = 217433, upload-time = "2025-05-19T14:14:34.016Z" }, - { url = "https://files.pythonhosted.org/packages/e1/4b/f31a562906f3bd375f3d0e83ce314e4a660c01b16c2923e8229b53fba5d7/multidict-6.4.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0e05c39962baa0bb19a6b210e9b1422c35c093b651d64246b6c2e1a7e242d9fd", size = 225418, upload-time = "2025-05-19T14:14:35.376Z" }, - { url = "https://files.pythonhosted.org/packages/99/89/78bb95c89c496d64b5798434a3deee21996114d4d2c28dd65850bf3a691e/multidict-6.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d5b1cc3ab8c31d9ebf0faa6e3540fb91257590da330ffe6d2393d4208e638925", size = 235042, upload-time = "2025-05-19T14:14:36.723Z" }, - { url = "https://files.pythonhosted.org/packages/74/91/8780a6e5885a8770442a8f80db86a0887c4becca0e5a2282ba2cae702bc4/multidict-6.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:93ec84488a384cd7b8a29c2c7f467137d8a73f6fe38bb810ecf29d1ade011a7c", size = 230280, upload-time = "2025-05-19T14:14:38.194Z" }, - { url = "https://files.pythonhosted.org/packages/68/c1/fcf69cabd542eb6f4b892469e033567ee6991d361d77abdc55e3a0f48349/multidict-6.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b308402608493638763abc95f9dc0030bbd6ac6aff784512e8ac3da73a88af08", size = 223322, upload-time = "2025-05-19T14:14:40.015Z" }, - { url = "https://files.pythonhosted.org/packages/b8/85/5b80bf4b83d8141bd763e1d99142a9cdfd0db83f0739b4797172a4508014/multidict-6.4.4-cp311-cp311-win32.whl", hash = "sha256:343892a27d1a04d6ae455ecece12904d242d299ada01633d94c4f431d68a8c49", size = 35070, upload-time = "2025-05-19T14:14:41.904Z" }, - { url = "https://files.pythonhosted.org/packages/09/66/0bed198ffd590ab86e001f7fa46b740d58cf8ff98c2f254e4a36bf8861ad/multidict-6.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:73484a94f55359780c0f458bbd3c39cb9cf9c182552177d2136e828269dee529", size = 38667, upload-time = "2025-05-19T14:14:43.534Z" }, { url = "https://files.pythonhosted.org/packages/d2/b5/5675377da23d60875fe7dae6be841787755878e315e2f517235f22f59e18/multidict-6.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:dc388f75a1c00000824bf28b7633e40854f4127ede80512b44c3cfeeea1839a2", size = 64293, upload-time = "2025-05-19T14:14:44.724Z" }, { url = "https://files.pythonhosted.org/packages/34/a7/be384a482754bb8c95d2bbe91717bf7ccce6dc38c18569997a11f95aa554/multidict-6.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:98af87593a666f739d9dba5d0ae86e01b0e1a9cfcd2e30d2d361fbbbd1a9162d", size = 38096, upload-time = "2025-05-19T14:14:45.95Z" }, { url = 
"https://files.pythonhosted.org/packages/66/6d/d59854bb4352306145bdfd1704d210731c1bb2c890bfee31fb7bbc1c4c7f/multidict-6.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aff4cafea2d120327d55eadd6b7f1136a8e5a0ecf6fb3b6863e8aca32cd8e50a", size = 37214, upload-time = "2025-05-19T14:14:47.158Z" }, @@ -1139,26 +1108,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" }, ] -[[package]] -name = "networkx" -version = "3.4.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.11'", -] -sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263, upload-time = "2024-10-21T12:39:36.247Z" }, -] - [[package]] name = "networkx" version = "3.5" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, @@ -1170,26 +1123,6 @@ version = "2.2.6" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245, upload-time = "2025-05-17T21:27:58.555Z" }, - { url = "https://files.pythonhosted.org/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048, upload-time = "2025-05-17T21:28:21.406Z" }, - { url = "https://files.pythonhosted.org/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542, upload-time = "2025-05-17T21:28:30.931Z" }, - { url = 
"https://files.pythonhosted.org/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301, upload-time = "2025-05-17T21:28:41.613Z" }, - { url = "https://files.pythonhosted.org/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320, upload-time = "2025-05-17T21:29:02.78Z" }, - { url = "https://files.pythonhosted.org/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050, upload-time = "2025-05-17T21:29:27.675Z" }, - { url = "https://files.pythonhosted.org/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034, upload-time = "2025-05-17T21:29:51.102Z" }, - { url = "https://files.pythonhosted.org/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185, upload-time = "2025-05-17T21:30:18.703Z" }, - { url = "https://files.pythonhosted.org/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149, upload-time = "2025-05-17T21:30:29.788Z" }, - { url = "https://files.pythonhosted.org/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620, upload-time = "2025-05-17T21:30:48.994Z" }, - { url = "https://files.pythonhosted.org/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" }, - { url = "https://files.pythonhosted.org/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" }, - { url = "https://files.pythonhosted.org/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" }, - { url = "https://files.pythonhosted.org/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" }, - { url = 
"https://files.pythonhosted.org/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" }, - { url = "https://files.pythonhosted.org/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" }, - { url = "https://files.pythonhosted.org/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" }, - { url = "https://files.pythonhosted.org/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" }, - { url = "https://files.pythonhosted.org/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = "2025-05-17T21:33:50.273Z" }, - { url = "https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" }, { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" }, { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" }, { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" }, @@ -1220,10 +1153,95 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1", size = 18385260, upload-time = "2025-05-17T21:43:05.189Z" }, { url = "https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" }, { url = 
"https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, - { url = "https://files.pythonhosted.org/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391, upload-time = "2025-05-17T21:44:35.948Z" }, - { url = "https://files.pythonhosted.org/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754, upload-time = "2025-05-17T21:44:47.446Z" }, - { url = "https://files.pythonhosted.org/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476, upload-time = "2025-05-17T21:45:11.871Z" }, - { url = "https://files.pythonhosted.org/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666, upload-time = "2025-05-17T21:45:31.426Z" }, +] + +[[package]] +name = "opencensus" +version = "0.11.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "opencensus-context" }, + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/15/a7/a46dcffa1b63084f9f17fe3c8cb20724c4c8f91009fd0b2cfdb27d5d2b35/opencensus-0.11.4.tar.gz", hash = "sha256:cbef87d8b8773064ab60e5c2a1ced58bbaa38a6d052c41aec224958ce544eff2", size = 64966, upload-time = "2024-01-03T18:04:07.085Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl", hash = "sha256:a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864", size = 128225, upload-time = "2024-01-03T18:04:05.127Z" }, +] + +[[package]] +name = "opencensus-context" +version = "0.1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/96/3b6f638f6275a8abbd45e582448723bffa29c1fb426721dedb5c72f7d056/opencensus-context-0.1.3.tar.gz", hash = "sha256:a03108c3c10d8c80bb5ddf5c8a1f033161fa61972a9917f9b9b3a18517f0088c", size = 4066, upload-time = "2022-08-03T22:20:22.359Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl", hash = "sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039", size = 5060, upload-time = "2022-08-03T22:20:20.352Z" }, +] + +[[package]] +name = "opentelemetry-api" +version = "1.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/d2/c782c88b8afbf961d6972428821c302bd1e9e7bc361352172f0ca31296e2/opentelemetry_api-1.36.0.tar.gz", hash = 
"sha256:9a72572b9c416d004d492cbc6e61962c0501eaf945ece9b5a0f56597d8348aa0", size = 64780, upload-time = "2025-07-29T15:12:06.02Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl", hash = "sha256:02f20bcacf666e1333b6b1f04e647dc1d5111f86b8e510238fcc56d7762cda8c", size = 65564, upload-time = "2025-07-29T15:11:47.998Z" }, +] + +[[package]] +name = "opentelemetry-exporter-prometheus" +version = "0.57b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-sdk" }, + { name = "prometheus-client" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/d8/5f04c6d51c0823c3d8ac973a2a38db6fcf2d040ca3f08fc66b3c14b6e164/opentelemetry_exporter_prometheus-0.57b0.tar.gz", hash = "sha256:9eb15bdc189235cf03c3f93abf56f8ff0ab57a493a189263bd7fe77a4249e689", size = 14906, upload-time = "2025-07-29T15:12:09.96Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl", hash = "sha256:c5b893d1cdd593fb022af2c7de3258c2d5a4d04402ae80d9fa35675fed77f05c", size = 12922, upload-time = "2025-07-29T15:11:54.055Z" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/02/f6556142301d136e3b7e95ab8ea6a5d9dc28d879a99f3dd673b5f97dca06/opentelemetry_proto-1.36.0.tar.gz", hash = "sha256:0f10b3c72f74c91e0764a5ec88fd8f1c368ea5d9c64639fb455e2854ef87dd2f", size = 46152, upload-time = "2025-07-29T15:12:15.717Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl", hash = "sha256:151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e", size = 72537, upload-time = "2025-07-29T15:12:02.243Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.36.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4c/85/8567a966b85a2d3f971c4d42f781c305b2b91c043724fa08fd37d158e9dc/opentelemetry_sdk-1.36.0.tar.gz", hash = "sha256:19c8c81599f51b71670661ff7495c905d8fdf6976e41622d5245b791b06fa581", size = 162557, upload-time = "2025-07-29T15:12:16.76Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl", hash = "sha256:19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb", size = 119995, upload-time = "2025-07-29T15:12:03.181Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.57b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/31/67dfa252ee88476a29200b0255bda8dfc2cf07b56ad66dc9a6221f7dc787/opentelemetry_semantic_conventions-0.57b0.tar.gz", hash = "sha256:609a4a79c7891b4620d64c7aac6898f872d790d75f22019913a660756f27ff32", size = 124225, upload-time = "2025-07-29T15:12:17.873Z" } +wheels = [ + { url 
= "https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl", hash = "sha256:757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78", size = 201627, upload-time = "2025-07-29T15:12:04.174Z" }, ] [[package]] @@ -1232,8 +1250,7 @@ source = { editable = "." } dependencies = [ { name = "beartype" }, { name = "matplotlib" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "networkx" }, { name = "pandas" }, { name = "polars" }, { name = "pyarrow" }, @@ -1243,6 +1260,15 @@ dependencies = [ ] [package.optional-dependencies] +all = [ + { name = "ipywidgets" }, + { name = "ray", extra = ["default"] }, + { name = "redis" }, +] +ray = [ + { name = "ipywidgets" }, + { name = "ray", extra = ["default"] }, +] redis = [ { name = "redis" }, ] @@ -1252,11 +1278,13 @@ dev = [ { name = "deltalake" }, { name = "httpie" }, { name = "ipykernel" }, + { name = "ipywidgets" }, { name = "jsonschema" }, { name = "pyarrow-stubs" }, { name = "pyiceberg" }, { name = "pytest" }, { name = "pytest-cov" }, + { name = "ray" }, { name = "redis" }, { name = "ruff" }, { name = "tqdm" }, @@ -1265,28 +1293,34 @@ dev = [ [package.metadata] requires-dist = [ { name = "beartype", specifier = ">=0.21.0" }, + { name = "ipywidgets", marker = "extra == 'ray'", specifier = ">=8.1.7" }, { name = "matplotlib", specifier = ">=3.10.3" }, { name = "networkx" }, + { name = "orcapod", extras = ["ray"], marker = "extra == 'all'" }, + { name = "orcapod", extras = ["redis"], marker = "extra == 'all'" }, { name = "pandas", specifier = ">=2.2.3" }, { name = "polars", specifier = ">=1.31.0" }, { name = "pyarrow", specifier = ">=20.0.0" }, { name = "pyyaml", specifier = ">=6.0.2" }, + { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = ">=2.48.0" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" }, { name = "typing-extensions" }, { name = "xxhash" }, ] -provides-extras = ["redis"] +provides-extras = ["redis", "ray", "all"] [package.metadata.requires-dev] dev = [ { name = "deltalake", specifier = ">=1.0.2" }, { name = "httpie", specifier = ">=3.2.4" }, { name = "ipykernel", specifier = ">=6.29.5" }, + { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "jsonschema", specifier = ">=4.25.0" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-cov", specifier = ">=6.1.1" }, + { name = "ray", specifier = ">=2.48.0" }, { name = "redis", specifier = ">=6.2.0" }, { name = "ruff", specifier = ">=0.11.11" }, { name = "tqdm", specifier = ">=4.67.1" }, @@ -1313,20 +1347,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213, upload-time = "2024-09-20T13:10:04.827Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/aa/70/c853aec59839bceed032d52010ff5f1b8d87dc3114b762e4ba2727661a3b/pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5", size = 12580827, 
upload-time = "2024-09-20T13:08:42.347Z" }, - { url = "https://files.pythonhosted.org/packages/99/f2/c4527768739ffa4469b2b4fff05aa3768a478aed89a2f271a79a40eee984/pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348", size = 11303897, upload-time = "2024-09-20T13:08:45.807Z" }, - { url = "https://files.pythonhosted.org/packages/ed/12/86c1747ea27989d7a4064f806ce2bae2c6d575b950be087837bdfcabacc9/pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed", size = 66480908, upload-time = "2024-09-20T18:37:13.513Z" }, - { url = "https://files.pythonhosted.org/packages/44/50/7db2cd5e6373ae796f0ddad3675268c8d59fb6076e66f0c339d61cea886b/pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57", size = 13064210, upload-time = "2024-09-20T13:08:48.325Z" }, - { url = "https://files.pythonhosted.org/packages/61/61/a89015a6d5536cb0d6c3ba02cebed51a95538cf83472975275e28ebf7d0c/pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42", size = 16754292, upload-time = "2024-09-20T19:01:54.443Z" }, - { url = "https://files.pythonhosted.org/packages/ce/0d/4cc7b69ce37fac07645a94e1d4b0880b15999494372c1523508511b09e40/pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f", size = 14416379, upload-time = "2024-09-20T13:08:50.882Z" }, - { url = "https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645", size = 11598471, upload-time = "2024-09-20T13:08:53.332Z" }, - { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222, upload-time = "2024-09-20T13:08:56.254Z" }, - { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274, upload-time = "2024-09-20T13:08:58.645Z" }, - { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836, upload-time = "2024-09-20T19:01:57.571Z" }, - { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505, upload-time = "2024-09-20T13:09:01.501Z" }, - { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420, upload-time = 
"2024-09-20T19:02:00.678Z" }, - { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457, upload-time = "2024-09-20T13:09:04.105Z" }, - { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166, upload-time = "2024-09-20T13:09:06.917Z" }, { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893, upload-time = "2024-09-20T13:09:09.655Z" }, { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475, upload-time = "2024-09-20T13:09:14.718Z" }, { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645, upload-time = "2024-09-20T19:02:03.88Z" }, @@ -1376,28 +1396,6 @@ version = "11.2.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/af/cb/bb5c01fcd2a69335b86c22142b2bccfc3464087efb7fd382eee5ffc7fdf7/pillow-11.2.1.tar.gz", hash = "sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6", size = 47026707, upload-time = "2025-04-12T17:50:03.289Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/8b/b158ad57ed44d3cc54db8d68ad7c0a58b8fc0e4c7a3f995f9d62d5b464a1/pillow-11.2.1-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:d57a75d53922fc20c165016a20d9c44f73305e67c351bbc60d1adaf662e74047", size = 3198442, upload-time = "2025-04-12T17:47:10.666Z" }, - { url = "https://files.pythonhosted.org/packages/b1/f8/bb5d956142f86c2d6cc36704943fa761f2d2e4c48b7436fd0a85c20f1713/pillow-11.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:127bf6ac4a5b58b3d32fc8289656f77f80567d65660bc46f72c0d77e6600cc95", size = 3030553, upload-time = "2025-04-12T17:47:13.153Z" }, - { url = "https://files.pythonhosted.org/packages/22/7f/0e413bb3e2aa797b9ca2c5c38cb2e2e45d88654e5b12da91ad446964cfae/pillow-11.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4ba4be812c7a40280629e55ae0b14a0aafa150dd6451297562e1764808bbe61", size = 4405503, upload-time = "2025-04-12T17:47:15.36Z" }, - { url = "https://files.pythonhosted.org/packages/f3/b4/cc647f4d13f3eb837d3065824aa58b9bcf10821f029dc79955ee43f793bd/pillow-11.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8bd62331e5032bc396a93609982a9ab6b411c05078a52f5fe3cc59234a3abd1", size = 4490648, upload-time = "2025-04-12T17:47:17.37Z" }, - { url = "https://files.pythonhosted.org/packages/c2/6f/240b772a3b35cdd7384166461567aa6713799b4e78d180c555bd284844ea/pillow-11.2.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:562d11134c97a62fe3af29581f083033179f7ff435f78392565a1ad2d1c2c45c", size = 4508937, 
upload-time = "2025-04-12T17:47:19.066Z" }, - { url = "https://files.pythonhosted.org/packages/f3/5e/7ca9c815ade5fdca18853db86d812f2f188212792780208bdb37a0a6aef4/pillow-11.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c97209e85b5be259994eb5b69ff50c5d20cca0f458ef9abd835e262d9d88b39d", size = 4599802, upload-time = "2025-04-12T17:47:21.404Z" }, - { url = "https://files.pythonhosted.org/packages/02/81/c3d9d38ce0c4878a77245d4cf2c46d45a4ad0f93000227910a46caff52f3/pillow-11.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0c3e6d0f59171dfa2e25d7116217543310908dfa2770aa64b8f87605f8cacc97", size = 4576717, upload-time = "2025-04-12T17:47:23.571Z" }, - { url = "https://files.pythonhosted.org/packages/42/49/52b719b89ac7da3185b8d29c94d0e6aec8140059e3d8adcaa46da3751180/pillow-11.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cc1c3bc53befb6096b84165956e886b1729634a799e9d6329a0c512ab651e579", size = 4654874, upload-time = "2025-04-12T17:47:25.783Z" }, - { url = "https://files.pythonhosted.org/packages/5b/0b/ede75063ba6023798267023dc0d0401f13695d228194d2242d5a7ba2f964/pillow-11.2.1-cp310-cp310-win32.whl", hash = "sha256:312c77b7f07ab2139924d2639860e084ec2a13e72af54d4f08ac843a5fc9c79d", size = 2331717, upload-time = "2025-04-12T17:47:28.922Z" }, - { url = "https://files.pythonhosted.org/packages/ed/3c/9831da3edea527c2ed9a09f31a2c04e77cd705847f13b69ca60269eec370/pillow-11.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:9bc7ae48b8057a611e5fe9f853baa88093b9a76303937449397899385da06fad", size = 2676204, upload-time = "2025-04-12T17:47:31.283Z" }, - { url = "https://files.pythonhosted.org/packages/01/97/1f66ff8a1503d8cbfc5bae4dc99d54c6ec1e22ad2b946241365320caabc2/pillow-11.2.1-cp310-cp310-win_arm64.whl", hash = "sha256:2728567e249cdd939f6cc3d1f049595c66e4187f3c34078cbc0a7d21c47482d2", size = 2414767, upload-time = "2025-04-12T17:47:34.655Z" }, - { url = "https://files.pythonhosted.org/packages/68/08/3fbf4b98924c73037a8e8b4c2c774784805e0fb4ebca6c5bb60795c40125/pillow-11.2.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:35ca289f712ccfc699508c4658a1d14652e8033e9b69839edf83cbdd0ba39e70", size = 3198450, upload-time = "2025-04-12T17:47:37.135Z" }, - { url = "https://files.pythonhosted.org/packages/84/92/6505b1af3d2849d5e714fc75ba9e69b7255c05ee42383a35a4d58f576b16/pillow-11.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0409af9f829f87a2dfb7e259f78f317a5351f2045158be321fd135973fff7bf", size = 3030550, upload-time = "2025-04-12T17:47:39.345Z" }, - { url = "https://files.pythonhosted.org/packages/3c/8c/ac2f99d2a70ff966bc7eb13dacacfaab57c0549b2ffb351b6537c7840b12/pillow-11.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4e5c5edee874dce4f653dbe59db7c73a600119fbea8d31f53423586ee2aafd7", size = 4415018, upload-time = "2025-04-12T17:47:41.128Z" }, - { url = "https://files.pythonhosted.org/packages/1f/e3/0a58b5d838687f40891fff9cbaf8669f90c96b64dc8f91f87894413856c6/pillow-11.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b93a07e76d13bff9444f1a029e0af2964e654bfc2e2c2d46bfd080df5ad5f3d8", size = 4498006, upload-time = "2025-04-12T17:47:42.912Z" }, - { url = "https://files.pythonhosted.org/packages/21/f5/6ba14718135f08fbfa33308efe027dd02b781d3f1d5c471444a395933aac/pillow-11.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:e6def7eed9e7fa90fde255afaf08060dc4b343bbe524a8f69bdd2a2f0018f600", size = 4517773, upload-time = "2025-04-12T17:47:44.611Z" }, - { url = 
"https://files.pythonhosted.org/packages/20/f2/805ad600fc59ebe4f1ba6129cd3a75fb0da126975c8579b8f57abeb61e80/pillow-11.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8f4f3724c068be008c08257207210c138d5f3731af6c155a81c2b09a9eb3a788", size = 4607069, upload-time = "2025-04-12T17:47:46.46Z" }, - { url = "https://files.pythonhosted.org/packages/71/6b/4ef8a288b4bb2e0180cba13ca0a519fa27aa982875882392b65131401099/pillow-11.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a0a6709b47019dff32e678bc12c63008311b82b9327613f534e496dacaefb71e", size = 4583460, upload-time = "2025-04-12T17:47:49.255Z" }, - { url = "https://files.pythonhosted.org/packages/62/ae/f29c705a09cbc9e2a456590816e5c234382ae5d32584f451c3eb41a62062/pillow-11.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f6b0c664ccb879109ee3ca702a9272d877f4fcd21e5eb63c26422fd6e415365e", size = 4661304, upload-time = "2025-04-12T17:47:51.067Z" }, - { url = "https://files.pythonhosted.org/packages/6e/1a/c8217b6f2f73794a5e219fbad087701f412337ae6dbb956db37d69a9bc43/pillow-11.2.1-cp311-cp311-win32.whl", hash = "sha256:cc5d875d56e49f112b6def6813c4e3d3036d269c008bf8aef72cd08d20ca6df6", size = 2331809, upload-time = "2025-04-12T17:47:54.425Z" }, - { url = "https://files.pythonhosted.org/packages/e2/72/25a8f40170dc262e86e90f37cb72cb3de5e307f75bf4b02535a61afcd519/pillow-11.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:0f5c7eda47bf8e3c8a283762cab94e496ba977a420868cb819159980b6709193", size = 2676338, upload-time = "2025-04-12T17:47:56.535Z" }, - { url = "https://files.pythonhosted.org/packages/06/9e/76825e39efee61efea258b479391ca77d64dbd9e5804e4ad0fa453b4ba55/pillow-11.2.1-cp311-cp311-win_arm64.whl", hash = "sha256:4d375eb838755f2528ac8cbc926c3e31cc49ca4ad0cf79cff48b20e30634a4a7", size = 2414918, upload-time = "2025-04-12T17:47:58.217Z" }, { url = "https://files.pythonhosted.org/packages/c7/40/052610b15a1b8961f52537cc8326ca6a881408bc2bdad0d852edeb6ed33b/pillow-11.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f", size = 3190185, upload-time = "2025-04-12T17:48:00.417Z" }, { url = "https://files.pythonhosted.org/packages/e5/7e/b86dbd35a5f938632093dc40d1682874c33dcfe832558fc80ca56bfcb774/pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b", size = 3030306, upload-time = "2025-04-12T17:48:02.391Z" }, { url = "https://files.pythonhosted.org/packages/a4/5c/467a161f9ed53e5eab51a42923c33051bf8d1a2af4626ac04f5166e58e0c/pillow-11.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d", size = 4416121, upload-time = "2025-04-12T17:48:04.554Z" }, @@ -1431,20 +1429,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/92/1ca0c3f09233bd7decf8f7105a1c4e3162fb9142128c74adad0fb361b7eb/pillow-11.2.1-cp313-cp313t-win32.whl", hash = "sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd", size = 2335774, upload-time = "2025-04-12T17:49:04.889Z" }, { url = "https://files.pythonhosted.org/packages/a5/ac/77525347cb43b83ae905ffe257bbe2cc6fd23acb9796639a1f56aa59d191/pillow-11.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e", size = 2681895, upload-time = "2025-04-12T17:49:06.635Z" }, { url = 
"https://files.pythonhosted.org/packages/67/32/32dc030cfa91ca0fc52baebbba2e009bb001122a1daa8b6a79ad830b38d3/pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681", size = 2417234, upload-time = "2025-04-12T17:49:08.399Z" }, - { url = "https://files.pythonhosted.org/packages/33/49/c8c21e4255b4f4a2c0c68ac18125d7f5460b109acc6dfdef1a24f9b960ef/pillow-11.2.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:9b7b0d4fd2635f54ad82785d56bc0d94f147096493a79985d0ab57aedd563156", size = 3181727, upload-time = "2025-04-12T17:49:31.898Z" }, - { url = "https://files.pythonhosted.org/packages/6d/f1/f7255c0838f8c1ef6d55b625cfb286835c17e8136ce4351c5577d02c443b/pillow-11.2.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:aa442755e31c64037aa7c1cb186e0b369f8416c567381852c63444dd666fb772", size = 2999833, upload-time = "2025-04-12T17:49:34.2Z" }, - { url = "https://files.pythonhosted.org/packages/e2/57/9968114457bd131063da98d87790d080366218f64fa2943b65ac6739abb3/pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0d3348c95b766f54b76116d53d4cb171b52992a1027e7ca50c81b43b9d9e363", size = 3437472, upload-time = "2025-04-12T17:49:36.294Z" }, - { url = "https://files.pythonhosted.org/packages/b2/1b/e35d8a158e21372ecc48aac9c453518cfe23907bb82f950d6e1c72811eb0/pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85d27ea4c889342f7e35f6d56e7e1cb345632ad592e8c51b693d7b7556043ce0", size = 3459976, upload-time = "2025-04-12T17:49:38.988Z" }, - { url = "https://files.pythonhosted.org/packages/26/da/2c11d03b765efff0ccc473f1c4186dc2770110464f2177efaed9cf6fae01/pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bf2c33d6791c598142f00c9c4c7d47f6476731c31081331664eb26d6ab583e01", size = 3527133, upload-time = "2025-04-12T17:49:40.985Z" }, - { url = "https://files.pythonhosted.org/packages/79/1a/4e85bd7cadf78412c2a3069249a09c32ef3323650fd3005c97cca7aa21df/pillow-11.2.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e616e7154c37669fc1dfc14584f11e284e05d1c650e1c0f972f281c4ccc53193", size = 3571555, upload-time = "2025-04-12T17:49:42.964Z" }, - { url = "https://files.pythonhosted.org/packages/69/03/239939915216de1e95e0ce2334bf17a7870ae185eb390fab6d706aadbfc0/pillow-11.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:39ad2e0f424394e3aebc40168845fee52df1394a4673a6ee512d840d14ab3013", size = 2674713, upload-time = "2025-04-12T17:49:44.944Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ad/2613c04633c7257d9481ab21d6b5364b59fc5d75faafd7cb8693523945a3/pillow-11.2.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:80f1df8dbe9572b4b7abdfa17eb5d78dd620b1d55d9e25f834efdbee872d3aed", size = 3181734, upload-time = "2025-04-12T17:49:46.789Z" }, - { url = "https://files.pythonhosted.org/packages/a4/fd/dcdda4471ed667de57bb5405bb42d751e6cfdd4011a12c248b455c778e03/pillow-11.2.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ea926cfbc3957090becbcbbb65ad177161a2ff2ad578b5a6ec9bb1e1cd78753c", size = 2999841, upload-time = "2025-04-12T17:49:48.812Z" }, - { url = "https://files.pythonhosted.org/packages/ac/89/8a2536e95e77432833f0db6fd72a8d310c8e4272a04461fb833eb021bf94/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:738db0e0941ca0376804d4de6a782c005245264edaa253ffce24e5a15cbdc7bd", size = 3437470, upload-time = "2025-04-12T17:49:50.831Z" }, - { url = 
"https://files.pythonhosted.org/packages/9d/8f/abd47b73c60712f88e9eda32baced7bfc3e9bd6a7619bb64b93acff28c3e/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9db98ab6565c69082ec9b0d4e40dd9f6181dab0dd236d26f7a50b8b9bfbd5076", size = 3460013, upload-time = "2025-04-12T17:49:53.278Z" }, - { url = "https://files.pythonhosted.org/packages/f6/20/5c0a0aa83b213b7a07ec01e71a3d6ea2cf4ad1d2c686cc0168173b6089e7/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:036e53f4170e270ddb8797d4c590e6dd14d28e15c7da375c18978045f7e6c37b", size = 3527165, upload-time = "2025-04-12T17:49:55.164Z" }, - { url = "https://files.pythonhosted.org/packages/58/0e/2abab98a72202d91146abc839e10c14f7cf36166f12838ea0c4db3ca6ecb/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14f73f7c291279bd65fda51ee87affd7c1e097709f7fdd0188957a16c264601f", size = 3571586, upload-time = "2025-04-12T17:49:57.171Z" }, - { url = "https://files.pythonhosted.org/packages/21/2c/5e05f58658cf49b6667762cca03d6e7d85cededde2caf2ab37b81f80e574/pillow-11.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:208653868d5c9ecc2b327f9b9ef34e0e42a4cdd172c2988fd81d62d2bc9bc044", size = 2674751, upload-time = "2025-04-12T17:49:59.628Z" }, ] [[package]] @@ -1488,6 +1472,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/4b/0673a68ac4d6527fac951970e929c3b4440c654f994f0c957bd5556deb38/polars-1.31.0-cp39-abi3-win_arm64.whl", hash = "sha256:62ef23bb9d10dca4c2b945979f9a50812ac4ace4ed9e158a6b5d32a7322e6f75", size = 31469078, upload-time = "2025-06-18T11:59:59.242Z" }, ] +[[package]] +name = "prometheus-client" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/cf/40dde0a2be27cc1eb41e333d1a674a74ce8b8b0457269cc640fd42b07cf7/prometheus_client-0.22.1.tar.gz", hash = "sha256:190f1331e783cf21eb60bca559354e0a4d4378facecf78f5428c39b675d20d28", size = 69746, upload-time = "2025-06-02T14:29:01.152Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl", hash = "sha256:cca895342e308174341b2cbf99a56bef291fbc0ef7b9e5412a0f26d653ba7094", size = 58694, upload-time = "2025-06-02T14:29:00.068Z" }, +] + [[package]] name = "prompt-toolkit" version = "3.0.51" @@ -1500,6 +1493,89 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl", hash = "sha256:52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07", size = 387810, upload-time = "2025-04-15T09:18:44.753Z" }, ] +[[package]] +name = "propcache" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/16/43264e4a779dd8588c21a70f0709665ee8f611211bdd2c87d952cfa7c776/propcache-0.3.2.tar.gz", hash = "sha256:20d7d62e4e7ef05f221e0db2856b979540686342e7dd9973b815599c7057e168", size = 44139, upload-time = "2025-06-09T22:56:06.081Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/42/9ca01b0a6f48e81615dca4765a8f1dd2c057e0540f6116a27dc5ee01dfb6/propcache-0.3.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8de106b6c84506b31c27168582cd3cb3000a6412c16df14a8628e5871ff83c10", size = 73674, upload-time = "2025-06-09T22:54:30.551Z" }, + { url = 
"https://files.pythonhosted.org/packages/af/6e/21293133beb550f9c901bbece755d582bfaf2176bee4774000bd4dd41884/propcache-0.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:28710b0d3975117239c76600ea351934ac7b5ff56e60953474342608dbbb6154", size = 43570, upload-time = "2025-06-09T22:54:32.296Z" }, + { url = "https://files.pythonhosted.org/packages/0c/c8/0393a0a3a2b8760eb3bde3c147f62b20044f0ddac81e9d6ed7318ec0d852/propcache-0.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce26862344bdf836650ed2487c3d724b00fbfec4233a1013f597b78c1cb73615", size = 43094, upload-time = "2025-06-09T22:54:33.929Z" }, + { url = "https://files.pythonhosted.org/packages/37/2c/489afe311a690399d04a3e03b069225670c1d489eb7b044a566511c1c498/propcache-0.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bca54bd347a253af2cf4544bbec232ab982f4868de0dd684246b67a51bc6b1db", size = 226958, upload-time = "2025-06-09T22:54:35.186Z" }, + { url = "https://files.pythonhosted.org/packages/9d/ca/63b520d2f3d418c968bf596839ae26cf7f87bead026b6192d4da6a08c467/propcache-0.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55780d5e9a2ddc59711d727226bb1ba83a22dd32f64ee15594b9392b1f544eb1", size = 234894, upload-time = "2025-06-09T22:54:36.708Z" }, + { url = "https://files.pythonhosted.org/packages/11/60/1d0ed6fff455a028d678df30cc28dcee7af77fa2b0e6962ce1df95c9a2a9/propcache-0.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:035e631be25d6975ed87ab23153db6a73426a48db688070d925aa27e996fe93c", size = 233672, upload-time = "2025-06-09T22:54:38.062Z" }, + { url = "https://files.pythonhosted.org/packages/37/7c/54fd5301ef38505ab235d98827207176a5c9b2aa61939b10a460ca53e123/propcache-0.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee6f22b6eaa39297c751d0e80c0d3a454f112f5c6481214fcf4c092074cecd67", size = 224395, upload-time = "2025-06-09T22:54:39.634Z" }, + { url = "https://files.pythonhosted.org/packages/ee/1a/89a40e0846f5de05fdc6779883bf46ba980e6df4d2ff8fb02643de126592/propcache-0.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ca3aee1aa955438c4dba34fc20a9f390e4c79967257d830f137bd5a8a32ed3b", size = 212510, upload-time = "2025-06-09T22:54:41.565Z" }, + { url = "https://files.pythonhosted.org/packages/5e/33/ca98368586c9566a6b8d5ef66e30484f8da84c0aac3f2d9aec6d31a11bd5/propcache-0.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7a4f30862869fa2b68380d677cc1c5fcf1e0f2b9ea0cf665812895c75d0ca3b8", size = 222949, upload-time = "2025-06-09T22:54:43.038Z" }, + { url = "https://files.pythonhosted.org/packages/ba/11/ace870d0aafe443b33b2f0b7efdb872b7c3abd505bfb4890716ad7865e9d/propcache-0.3.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b77ec3c257d7816d9f3700013639db7491a434644c906a2578a11daf13176251", size = 217258, upload-time = "2025-06-09T22:54:44.376Z" }, + { url = "https://files.pythonhosted.org/packages/5b/d2/86fd6f7adffcfc74b42c10a6b7db721d1d9ca1055c45d39a1a8f2a740a21/propcache-0.3.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cab90ac9d3f14b2d5050928483d3d3b8fb6b4018893fc75710e6aa361ecb2474", size = 213036, upload-time = "2025-06-09T22:54:46.243Z" }, + { url = "https://files.pythonhosted.org/packages/07/94/2d7d1e328f45ff34a0a284cf5a2847013701e24c2a53117e7c280a4316b3/propcache-0.3.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0b504d29f3c47cf6b9e936c1852246c83d450e8e063d50562115a6be6d3a2535", size = 227684, upload-time = 
"2025-06-09T22:54:47.63Z" }, + { url = "https://files.pythonhosted.org/packages/b7/05/37ae63a0087677e90b1d14710e532ff104d44bc1efa3b3970fff99b891dc/propcache-0.3.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:ce2ac2675a6aa41ddb2a0c9cbff53780a617ac3d43e620f8fd77ba1c84dcfc06", size = 234562, upload-time = "2025-06-09T22:54:48.982Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7c/3f539fcae630408d0bd8bf3208b9a647ccad10976eda62402a80adf8fc34/propcache-0.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b4239611205294cc433845b914131b2a1f03500ff3c1ed093ed216b82621e1", size = 222142, upload-time = "2025-06-09T22:54:50.424Z" }, + { url = "https://files.pythonhosted.org/packages/7c/d2/34b9eac8c35f79f8a962546b3e97e9d4b990c420ee66ac8255d5d9611648/propcache-0.3.2-cp312-cp312-win32.whl", hash = "sha256:df4a81b9b53449ebc90cc4deefb052c1dd934ba85012aa912c7ea7b7e38b60c1", size = 37711, upload-time = "2025-06-09T22:54:52.072Z" }, + { url = "https://files.pythonhosted.org/packages/19/61/d582be5d226cf79071681d1b46b848d6cb03d7b70af7063e33a2787eaa03/propcache-0.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:7046e79b989d7fe457bb755844019e10f693752d169076138abf17f31380800c", size = 41479, upload-time = "2025-06-09T22:54:53.234Z" }, + { url = "https://files.pythonhosted.org/packages/dc/d1/8c747fafa558c603c4ca19d8e20b288aa0c7cda74e9402f50f31eb65267e/propcache-0.3.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ca592ed634a73ca002967458187109265e980422116c0a107cf93d81f95af945", size = 71286, upload-time = "2025-06-09T22:54:54.369Z" }, + { url = "https://files.pythonhosted.org/packages/61/99/d606cb7986b60d89c36de8a85d58764323b3a5ff07770a99d8e993b3fa73/propcache-0.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9ecb0aad4020e275652ba3975740f241bd12a61f1a784df044cf7477a02bc252", size = 42425, upload-time = "2025-06-09T22:54:55.642Z" }, + { url = "https://files.pythonhosted.org/packages/8c/96/ef98f91bbb42b79e9bb82bdd348b255eb9d65f14dbbe3b1594644c4073f7/propcache-0.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7f08f1cc28bd2eade7a8a3d2954ccc673bb02062e3e7da09bc75d843386b342f", size = 41846, upload-time = "2025-06-09T22:54:57.246Z" }, + { url = "https://files.pythonhosted.org/packages/5b/ad/3f0f9a705fb630d175146cd7b1d2bf5555c9beaed54e94132b21aac098a6/propcache-0.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a342c834734edb4be5ecb1e9fb48cb64b1e2320fccbd8c54bf8da8f2a84c33", size = 208871, upload-time = "2025-06-09T22:54:58.975Z" }, + { url = "https://files.pythonhosted.org/packages/3a/38/2085cda93d2c8b6ec3e92af2c89489a36a5886b712a34ab25de9fbca7992/propcache-0.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a544caaae1ac73f1fecfae70ded3e93728831affebd017d53449e3ac052ac1e", size = 215720, upload-time = "2025-06-09T22:55:00.471Z" }, + { url = "https://files.pythonhosted.org/packages/61/c1/d72ea2dc83ac7f2c8e182786ab0fc2c7bd123a1ff9b7975bee671866fe5f/propcache-0.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310d11aa44635298397db47a3ebce7db99a4cc4b9bbdfcf6c98a60c8d5261cf1", size = 215203, upload-time = "2025-06-09T22:55:01.834Z" }, + { url = "https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c1396592321ac83157ac03a2023aa6cc4a3cc3cfdecb71090054c09e5a7cce3", size = 206365, upload-time = "2025-06-09T22:55:03.199Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/73/88549128bb89e66d2aff242488f62869014ae092db63ccea53c1cc75a81d/propcache-0.3.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cabf5b5902272565e78197edb682017d21cf3b550ba0460ee473753f28d23c1", size = 196016, upload-time = "2025-06-09T22:55:04.518Z" }, + { url = "https://files.pythonhosted.org/packages/b9/3f/3bdd14e737d145114a5eb83cb172903afba7242f67c5877f9909a20d948d/propcache-0.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0a2f2235ac46a7aa25bdeb03a9e7060f6ecbd213b1f9101c43b3090ffb971ef6", size = 205596, upload-time = "2025-06-09T22:55:05.942Z" }, + { url = "https://files.pythonhosted.org/packages/0f/ca/2f4aa819c357d3107c3763d7ef42c03980f9ed5c48c82e01e25945d437c1/propcache-0.3.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:92b69e12e34869a6970fd2f3da91669899994b47c98f5d430b781c26f1d9f387", size = 200977, upload-time = "2025-06-09T22:55:07.792Z" }, + { url = "https://files.pythonhosted.org/packages/cd/4a/e65276c7477533c59085251ae88505caf6831c0e85ff8b2e31ebcbb949b1/propcache-0.3.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:54e02207c79968ebbdffc169591009f4474dde3b4679e16634d34c9363ff56b4", size = 197220, upload-time = "2025-06-09T22:55:09.173Z" }, + { url = "https://files.pythonhosted.org/packages/7c/54/fc7152e517cf5578278b242396ce4d4b36795423988ef39bb8cd5bf274c8/propcache-0.3.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4adfb44cb588001f68c5466579d3f1157ca07f7504fc91ec87862e2b8e556b88", size = 210642, upload-time = "2025-06-09T22:55:10.62Z" }, + { url = "https://files.pythonhosted.org/packages/b9/80/abeb4a896d2767bf5f1ea7b92eb7be6a5330645bd7fb844049c0e4045d9d/propcache-0.3.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fd3e6019dc1261cd0291ee8919dd91fbab7b169bb76aeef6c716833a3f65d206", size = 212789, upload-time = "2025-06-09T22:55:12.029Z" }, + { url = "https://files.pythonhosted.org/packages/b3/db/ea12a49aa7b2b6d68a5da8293dcf50068d48d088100ac016ad92a6a780e6/propcache-0.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4c181cad81158d71c41a2bce88edce078458e2dd5ffee7eddd6b05da85079f43", size = 205880, upload-time = "2025-06-09T22:55:13.45Z" }, + { url = "https://files.pythonhosted.org/packages/d1/e5/9076a0bbbfb65d1198007059c65639dfd56266cf8e477a9707e4b1999ff4/propcache-0.3.2-cp313-cp313-win32.whl", hash = "sha256:8a08154613f2249519e549de2330cf8e2071c2887309a7b07fb56098f5170a02", size = 37220, upload-time = "2025-06-09T22:55:15.284Z" }, + { url = "https://files.pythonhosted.org/packages/d3/f5/b369e026b09a26cd77aa88d8fffd69141d2ae00a2abaaf5380d2603f4b7f/propcache-0.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e41671f1594fc4ab0a6dec1351864713cb3a279910ae8b58f884a88a0a632c05", size = 40678, upload-time = "2025-06-09T22:55:16.445Z" }, + { url = "https://files.pythonhosted.org/packages/a4/3a/6ece377b55544941a08d03581c7bc400a3c8cd3c2865900a68d5de79e21f/propcache-0.3.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:9a3cf035bbaf035f109987d9d55dc90e4b0e36e04bbbb95af3055ef17194057b", size = 76560, upload-time = "2025-06-09T22:55:17.598Z" }, + { url = "https://files.pythonhosted.org/packages/0c/da/64a2bb16418740fa634b0e9c3d29edff1db07f56d3546ca2d86ddf0305e1/propcache-0.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:156c03d07dc1323d8dacaa221fbe028c5c70d16709cdd63502778e6c3ccca1b0", size = 44676, upload-time = "2025-06-09T22:55:18.922Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/7b/f025e06ea51cb72c52fb87e9b395cced02786610b60a3ed51da8af017170/propcache-0.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74413c0ba02ba86f55cf60d18daab219f7e531620c15f1e23d95563f505efe7e", size = 44701, upload-time = "2025-06-09T22:55:20.106Z" }, + { url = "https://files.pythonhosted.org/packages/a4/00/faa1b1b7c3b74fc277f8642f32a4c72ba1d7b2de36d7cdfb676db7f4303e/propcache-0.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f066b437bb3fa39c58ff97ab2ca351db465157d68ed0440abecb21715eb24b28", size = 276934, upload-time = "2025-06-09T22:55:21.5Z" }, + { url = "https://files.pythonhosted.org/packages/74/ab/935beb6f1756e0476a4d5938ff44bf0d13a055fed880caf93859b4f1baf4/propcache-0.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f1304b085c83067914721e7e9d9917d41ad87696bf70f0bc7dee450e9c71ad0a", size = 278316, upload-time = "2025-06-09T22:55:22.918Z" }, + { url = "https://files.pythonhosted.org/packages/f8/9d/994a5c1ce4389610838d1caec74bdf0e98b306c70314d46dbe4fcf21a3e2/propcache-0.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab50cef01b372763a13333b4e54021bdcb291fc9a8e2ccb9c2df98be51bcde6c", size = 282619, upload-time = "2025-06-09T22:55:24.651Z" }, + { url = "https://files.pythonhosted.org/packages/2b/00/a10afce3d1ed0287cef2e09506d3be9822513f2c1e96457ee369adb9a6cd/propcache-0.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fad3b2a085ec259ad2c2842666b2a0a49dea8463579c606426128925af1ed725", size = 265896, upload-time = "2025-06-09T22:55:26.049Z" }, + { url = "https://files.pythonhosted.org/packages/2e/a8/2aa6716ffa566ca57c749edb909ad27884680887d68517e4be41b02299f3/propcache-0.3.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:261fa020c1c14deafd54c76b014956e2f86991af198c51139faf41c4d5e83892", size = 252111, upload-time = "2025-06-09T22:55:27.381Z" }, + { url = "https://files.pythonhosted.org/packages/36/4f/345ca9183b85ac29c8694b0941f7484bf419c7f0fea2d1e386b4f7893eed/propcache-0.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:46d7f8aa79c927e5f987ee3a80205c987717d3659f035c85cf0c3680526bdb44", size = 268334, upload-time = "2025-06-09T22:55:28.747Z" }, + { url = "https://files.pythonhosted.org/packages/3e/ca/fcd54f78b59e3f97b3b9715501e3147f5340167733d27db423aa321e7148/propcache-0.3.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:6d8f3f0eebf73e3c0ff0e7853f68be638b4043c65a70517bb575eff54edd8dbe", size = 255026, upload-time = "2025-06-09T22:55:30.184Z" }, + { url = "https://files.pythonhosted.org/packages/8b/95/8e6a6bbbd78ac89c30c225210a5c687790e532ba4088afb8c0445b77ef37/propcache-0.3.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:03c89c1b14a5452cf15403e291c0ccd7751d5b9736ecb2c5bab977ad6c5bcd81", size = 250724, upload-time = "2025-06-09T22:55:31.646Z" }, + { url = "https://files.pythonhosted.org/packages/ee/b0/0dd03616142baba28e8b2d14ce5df6631b4673850a3d4f9c0f9dd714a404/propcache-0.3.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:0cc17efde71e12bbaad086d679ce575268d70bc123a5a71ea7ad76f70ba30bba", size = 268868, upload-time = "2025-06-09T22:55:33.209Z" }, + { url = "https://files.pythonhosted.org/packages/c5/98/2c12407a7e4fbacd94ddd32f3b1e3d5231e77c30ef7162b12a60e2dd5ce3/propcache-0.3.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:acdf05d00696bc0447e278bb53cb04ca72354e562cf88ea6f9107df8e7fd9770", size = 271322, 
upload-time = "2025-06-09T22:55:35.065Z" }, + { url = "https://files.pythonhosted.org/packages/35/91/9cb56efbb428b006bb85db28591e40b7736847b8331d43fe335acf95f6c8/propcache-0.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4445542398bd0b5d32df908031cb1b30d43ac848e20470a878b770ec2dcc6330", size = 265778, upload-time = "2025-06-09T22:55:36.45Z" }, + { url = "https://files.pythonhosted.org/packages/9a/4c/b0fe775a2bdd01e176b14b574be679d84fc83958335790f7c9a686c1f468/propcache-0.3.2-cp313-cp313t-win32.whl", hash = "sha256:f86e5d7cd03afb3a1db8e9f9f6eff15794e79e791350ac48a8c924e6f439f394", size = 41175, upload-time = "2025-06-09T22:55:38.436Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ff/47f08595e3d9b5e149c150f88d9714574f1a7cbd89fe2817158a952674bf/propcache-0.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9704bedf6e7cbe3c65eca4379a9b53ee6a83749f047808cbb5044d40d7d72198", size = 44857, upload-time = "2025-06-09T22:55:39.687Z" }, + { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, +] + +[[package]] +name = "proto-plus" +version = "1.26.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload-time = "2025-03-10T15:54:38.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload-time = "2025-03-10T15:54:37.335Z" }, +] + +[[package]] +name = "protobuf" +version = "6.31.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/52/f3/b9655a711b32c19720253f6f06326faf90580834e2e83f840472d752bc8b/protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a", size = 441797, upload-time = "2025-05-28T19:25:54.947Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f3/6f/6ab8e4bf962fd5570d3deaa2d5c38f0a363f57b4501047b5ebeb83ab1125/protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9", size = 423603, upload-time = "2025-05-28T19:25:41.198Z" }, + { url = "https://files.pythonhosted.org/packages/44/3a/b15c4347dd4bf3a1b0ee882f384623e2063bb5cf9fa9d57990a4f7df2fb6/protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447", size = 435283, upload-time = "2025-05-28T19:25:44.275Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c9/b9689a2a250264a84e66c46d8862ba788ee7a641cdca39bccf64f59284b7/protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402", size = 425604, upload-time = "2025-05-28T19:25:45.702Z" }, + { url = "https://files.pythonhosted.org/packages/76/a1/7a5a94032c83375e4fe7e7f56e3976ea6ac90c5e85fac8576409e25c39c3/protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = 
"sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39", size = 322115, upload-time = "2025-05-28T19:25:47.128Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6", size = 321070, upload-time = "2025-05-28T19:25:50.036Z" }, + { url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload-time = "2025-05-28T19:25:53.926Z" }, +] + [[package]] name = "psutil" version = "7.0.0" @@ -1533,30 +1609,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, ] +[[package]] +name = "py-spy" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/e2/ff811a367028b87e86714945bb9ecb5c1cc69114a8039a67b3a862cef921/py_spy-0.4.1.tar.gz", hash = "sha256:e53aa53daa2e47c2eef97dd2455b47bb3a7e7f962796a86cc3e7dbde8e6f4db4", size = 244726, upload-time = "2025-07-31T19:33:25.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/e3/3a32500d845bdd94f6a2b4ed6244982f42ec2bc64602ea8fcfe900678ae7/py_spy-0.4.1-py2.py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:809094208c6256c8f4ccadd31e9a513fe2429253f48e20066879239ba12cd8cc", size = 3682508, upload-time = "2025-07-31T19:33:13.753Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/e4d280e9e0bec71d39fc646654097027d4bbe8e04af18fb68e49afcff404/py_spy-0.4.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:1fb8bf71ab8df95a95cc387deed6552934c50feef2cf6456bc06692a5508fd0c", size = 1796395, upload-time = "2025-07-31T19:33:15.325Z" }, + { url = "https://files.pythonhosted.org/packages/df/79/9ed50bb0a9de63ed023aa2db8b6265b04a7760d98c61eb54def6a5fddb68/py_spy-0.4.1-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee776b9d512a011d1ad3907ed53ae32ce2f3d9ff3e1782236554e22103b5c084", size = 2034938, upload-time = "2025-07-31T19:33:17.194Z" }, + { url = "https://files.pythonhosted.org/packages/53/a5/36862e3eea59f729dfb70ee6f9e14b051d8ddce1aa7e70e0b81d9fe18536/py_spy-0.4.1-py2.py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:532d3525538254d1859b49de1fbe9744df6b8865657c9f0e444bf36ce3f19226", size = 2658968, upload-time = "2025-07-31T19:33:18.916Z" }, + { url = "https://files.pythonhosted.org/packages/08/f8/9ea0b586b065a623f591e5e7961282ec944b5fbbdca33186c7c0296645b3/py_spy-0.4.1-py2.py3-none-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4972c21890b6814017e39ac233c22572c4a61fd874524ebc5ccab0f2237aee0a", size = 2147541, upload-time = "2025-07-31T19:33:20.565Z" }, + { url = "https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6a80ec05eb8a6883863a367c6a4d4f2d57de68466f7956b6367d4edd5c61bb29", size = 2763338, upload-time = "2025-07-31T19:33:22.202Z" }, + { url = 
"https://files.pythonhosted.org/packages/e1/da/fcc9a9fcd4ca946ff402cff20348e838b051d69f50f5d1f5dca4cd3c5eb8/py_spy-0.4.1-py2.py3-none-win_amd64.whl", hash = "sha256:d92e522bd40e9bf7d87c204033ce5bb5c828fca45fa28d970f58d71128069fdc", size = 1818784, upload-time = "2025-07-31T19:33:23.802Z" }, +] + [[package]] name = "pyarrow" version = "20.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5b/23/77094eb8ee0dbe88441689cb6afc40ac312a1e15d3a7acc0586999518222/pyarrow-20.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c7dd06fd7d7b410ca5dc839cc9d485d2bc4ae5240851bcd45d85105cc90a47d7", size = 30832591, upload-time = "2025-04-27T12:27:27.89Z" }, - { url = "https://files.pythonhosted.org/packages/c3/d5/48cc573aff00d62913701d9fac478518f693b30c25f2c157550b0b2565cb/pyarrow-20.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d5382de8dc34c943249b01c19110783d0d64b207167c728461add1ecc2db88e4", size = 32273686, upload-time = "2025-04-27T12:27:36.816Z" }, - { url = "https://files.pythonhosted.org/packages/37/df/4099b69a432b5cb412dd18adc2629975544d656df3d7fda6d73c5dba935d/pyarrow-20.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6415a0d0174487456ddc9beaead703d0ded5966129fa4fd3114d76b5d1c5ceae", size = 41337051, upload-time = "2025-04-27T12:27:44.4Z" }, - { url = "https://files.pythonhosted.org/packages/4c/27/99922a9ac1c9226f346e3a1e15e63dee6f623ed757ff2893f9d6994a69d3/pyarrow-20.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15aa1b3b2587e74328a730457068dc6c89e6dcbf438d4369f572af9d320a25ee", size = 42404659, upload-time = "2025-04-27T12:27:51.715Z" }, - { url = "https://files.pythonhosted.org/packages/21/d1/71d91b2791b829c9e98f1e0d85be66ed93aff399f80abb99678511847eaa/pyarrow-20.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5605919fbe67a7948c1f03b9f3727d82846c053cd2ce9303ace791855923fd20", size = 40695446, upload-time = "2025-04-27T12:27:59.643Z" }, - { url = "https://files.pythonhosted.org/packages/f1/ca/ae10fba419a6e94329707487835ec721f5a95f3ac9168500bcf7aa3813c7/pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a5704f29a74b81673d266e5ec1fe376f060627c2e42c5c7651288ed4b0db29e9", size = 42278528, upload-time = "2025-04-27T12:28:07.297Z" }, - { url = "https://files.pythonhosted.org/packages/7a/a6/aba40a2bf01b5d00cf9cd16d427a5da1fad0fb69b514ce8c8292ab80e968/pyarrow-20.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:00138f79ee1b5aca81e2bdedb91e3739b987245e11fa3c826f9e57c5d102fb75", size = 42918162, upload-time = "2025-04-27T12:28:15.716Z" }, - { url = "https://files.pythonhosted.org/packages/93/6b/98b39650cd64f32bf2ec6d627a9bd24fcb3e4e6ea1873c5e1ea8a83b1a18/pyarrow-20.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f2d67ac28f57a362f1a2c1e6fa98bfe2f03230f7e15927aecd067433b1e70ce8", size = 44550319, upload-time = "2025-04-27T12:28:27.026Z" }, - { url = "https://files.pythonhosted.org/packages/ab/32/340238be1eb5037e7b5de7e640ee22334417239bc347eadefaf8c373936d/pyarrow-20.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:4a8b029a07956b8d7bd742ffca25374dd3f634b35e46cc7a7c3fa4c75b297191", size = 25770759, upload-time = 
"2025-04-27T12:28:33.702Z" }, - { url = "https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0", size = 30856035, upload-time = "2025-04-27T12:28:40.78Z" }, - { url = "https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb", size = 32309552, upload-time = "2025-04-27T12:28:47.051Z" }, - { url = "https://files.pythonhosted.org/packages/44/fb/dfb2dfdd3e488bb14f822d7335653092dde150cffc2da97de6e7500681f9/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232", size = 41334704, upload-time = "2025-04-27T12:28:55.064Z" }, - { url = "https://files.pythonhosted.org/packages/58/0d/08a95878d38808051a953e887332d4a76bc06c6ee04351918ee1155407eb/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f", size = 42399836, upload-time = "2025-04-27T12:29:02.13Z" }, - { url = "https://files.pythonhosted.org/packages/f3/cd/efa271234dfe38f0271561086eedcad7bc0f2ddd1efba423916ff0883684/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab", size = 40711789, upload-time = "2025-04-27T12:29:09.951Z" }, - { url = "https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62", size = 42301124, upload-time = "2025-04-27T12:29:17.187Z" }, - { url = "https://files.pythonhosted.org/packages/4f/92/692c562be4504c262089e86757a9048739fe1acb4024f92d39615e7bab3f/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c", size = 42916060, upload-time = "2025-04-27T12:29:24.253Z" }, - { url = "https://files.pythonhosted.org/packages/a4/ec/9f5c7e7c828d8e0a3c7ef50ee62eca38a7de2fa6eb1b8fa43685c9414fef/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3", size = 44547640, upload-time = "2025-04-27T12:29:32.782Z" }, - { url = "https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc", size = 25781491, upload-time = "2025-04-27T12:29:38.464Z" }, { url = "https://files.pythonhosted.org/packages/a1/d6/0c10e0d54f6c13eb464ee9b67a68b8c71bcf2f67760ef5b6fbcddd2ab05f/pyarrow-20.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba", size = 30815067, upload-time = "2025-04-27T12:29:44.384Z" }, { url = "https://files.pythonhosted.org/packages/7e/e2/04e9874abe4094a06fd8b0cbb0f1312d8dd7d707f144c2ec1e5e8f452ffa/pyarrow-20.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781", size = 32297128, upload-time = "2025-04-27T12:29:52.038Z" }, { url = 
"https://files.pythonhosted.org/packages/31/fd/c565e5dcc906a3b471a83273039cb75cb79aad4a2d4a12f76cc5ae90a4b8/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199", size = 41334890, upload-time = "2025-04-27T12:29:59.452Z" }, @@ -1598,6 +1671,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ca/a1/d0c333111d801c77a83a32f793222c4b9aef7de0fdb2ceb73a1980a6c98b/pyarrow_stubs-20.0.0.20250716-py3-none-any.whl", hash = "sha256:8ecfdd215af468d6b993e2290da7f3d51a32991c1d230b90682f7ee4bc5ee7cd", size = 235661, upload-time = "2025-07-16T02:28:53.394Z" }, ] +[[package]] +name = "pyasn1" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload-time = "2024-09-10T22:41:42.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload-time = "2024-09-11T16:00:36.122Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + [[package]] name = "pycparser" version = "2.22" @@ -1631,33 +1725,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = "sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/92/b31726561b5dae176c2d2c2dc43a9c5bfba5d32f96f8b4c0a600dd492447/pydantic_core-2.33.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2b3d326aaef0c0399d9afffeb6367d5e26ddc24d351dbc9c636840ac355dc5d8", size = 2028817, upload-time = "2025-04-23T18:30:43.919Z" }, - { url = "https://files.pythonhosted.org/packages/a3/44/3f0b95fafdaca04a483c4e685fe437c6891001bf3ce8b2fded82b9ea3aa1/pydantic_core-2.33.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e5b2671f05ba48b94cb90ce55d8bdcaaedb8ba00cc5359f6810fc918713983d", size = 1861357, upload-time = "2025-04-23T18:30:46.372Z" }, - { url = "https://files.pythonhosted.org/packages/30/97/e8f13b55766234caae05372826e8e4b3b96e7b248be3157f53237682e43c/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0069c9acc3f3981b9ff4cdfaf088e98d83440a4c7ea1bc07460af3d4dc22e72d", size = 1898011, upload-time = "2025-04-23T18:30:47.591Z" }, - { url = 
"https://files.pythonhosted.org/packages/9b/a3/99c48cf7bafc991cc3ee66fd544c0aae8dc907b752f1dad2d79b1b5a471f/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d53b22f2032c42eaaf025f7c40c2e3b94568ae077a606f006d206a463bc69572", size = 1982730, upload-time = "2025-04-23T18:30:49.328Z" }, - { url = "https://files.pythonhosted.org/packages/de/8e/a5b882ec4307010a840fb8b58bd9bf65d1840c92eae7534c7441709bf54b/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0405262705a123b7ce9f0b92f123334d67b70fd1f20a9372b907ce1080c7ba02", size = 2136178, upload-time = "2025-04-23T18:30:50.907Z" }, - { url = "https://files.pythonhosted.org/packages/e4/bb/71e35fc3ed05af6834e890edb75968e2802fe98778971ab5cba20a162315/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b25d91e288e2c4e0662b8038a28c6a07eaac3e196cfc4ff69de4ea3db992a1b", size = 2736462, upload-time = "2025-04-23T18:30:52.083Z" }, - { url = "https://files.pythonhosted.org/packages/31/0d/c8f7593e6bc7066289bbc366f2235701dcbebcd1ff0ef8e64f6f239fb47d/pydantic_core-2.33.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bdfe4b3789761f3bcb4b1ddf33355a71079858958e3a552f16d5af19768fef2", size = 2005652, upload-time = "2025-04-23T18:30:53.389Z" }, - { url = "https://files.pythonhosted.org/packages/d2/7a/996d8bd75f3eda405e3dd219ff5ff0a283cd8e34add39d8ef9157e722867/pydantic_core-2.33.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:efec8db3266b76ef9607c2c4c419bdb06bf335ae433b80816089ea7585816f6a", size = 2113306, upload-time = "2025-04-23T18:30:54.661Z" }, - { url = "https://files.pythonhosted.org/packages/ff/84/daf2a6fb2db40ffda6578a7e8c5a6e9c8affb251a05c233ae37098118788/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:031c57d67ca86902726e0fae2214ce6770bbe2f710dc33063187a68744a5ecac", size = 2073720, upload-time = "2025-04-23T18:30:56.11Z" }, - { url = "https://files.pythonhosted.org/packages/77/fb/2258da019f4825128445ae79456a5499c032b55849dbd5bed78c95ccf163/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:f8de619080e944347f5f20de29a975c2d815d9ddd8be9b9b7268e2e3ef68605a", size = 2244915, upload-time = "2025-04-23T18:30:57.501Z" }, - { url = "https://files.pythonhosted.org/packages/d8/7a/925ff73756031289468326e355b6fa8316960d0d65f8b5d6b3a3e7866de7/pydantic_core-2.33.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:73662edf539e72a9440129f231ed3757faab89630d291b784ca99237fb94db2b", size = 2241884, upload-time = "2025-04-23T18:30:58.867Z" }, - { url = "https://files.pythonhosted.org/packages/0b/b0/249ee6d2646f1cdadcb813805fe76265745c4010cf20a8eba7b0e639d9b2/pydantic_core-2.33.2-cp310-cp310-win32.whl", hash = "sha256:0a39979dcbb70998b0e505fb1556a1d550a0781463ce84ebf915ba293ccb7e22", size = 1910496, upload-time = "2025-04-23T18:31:00.078Z" }, - { url = "https://files.pythonhosted.org/packages/66/ff/172ba8f12a42d4b552917aa65d1f2328990d3ccfc01d5b7c943ec084299f/pydantic_core-2.33.2-cp310-cp310-win_amd64.whl", hash = "sha256:b0379a2b24882fef529ec3b4987cb5d003b9cda32256024e6fe1586ac45fc640", size = 1955019, upload-time = "2025-04-23T18:31:01.335Z" }, - { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = 
"2025-04-23T18:31:03.106Z" }, - { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" }, - { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" }, - { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" }, - { url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" }, - { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" }, - { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = "2025-04-23T18:31:13.536Z" }, - { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" }, - { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" }, - { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" }, - { url = "https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" }, - { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = 
"sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" }, - { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" }, - { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" }, { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, @@ -1689,24 +1756,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, - { url = "https://files.pythonhosted.org/packages/30/68/373d55e58b7e83ce371691f6eaa7175e3a24b956c44628eb25d7da007917/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5c4aa4e82353f65e548c476b37e64189783aa5384903bfea4f41580f255fddfa", size = 2023982, upload-time = "2025-04-23T18:32:53.14Z" }, - { url = "https://files.pythonhosted.org/packages/a4/16/145f54ac08c96a63d8ed6442f9dec17b2773d19920b627b18d4f10a061ea/pydantic_core-2.33.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d946c8bf0d5c24bf4fe333af284c59a19358aa3ec18cb3dc4370080da1e8ad29", size = 1858412, upload-time = "2025-04-23T18:32:55.52Z" }, - { url = 
"https://files.pythonhosted.org/packages/41/b1/c6dc6c3e2de4516c0bb2c46f6a373b91b5660312342a0cf5826e38ad82fa/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87b31b6846e361ef83fedb187bb5b4372d0da3f7e28d85415efa92d6125d6e6d", size = 1892749, upload-time = "2025-04-23T18:32:57.546Z" }, - { url = "https://files.pythonhosted.org/packages/12/73/8cd57e20afba760b21b742106f9dbdfa6697f1570b189c7457a1af4cd8a0/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa9d91b338f2df0508606f7009fde642391425189bba6d8c653afd80fd6bb64e", size = 2067527, upload-time = "2025-04-23T18:32:59.771Z" }, - { url = "https://files.pythonhosted.org/packages/e3/d5/0bb5d988cc019b3cba4a78f2d4b3854427fc47ee8ec8e9eaabf787da239c/pydantic_core-2.33.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2058a32994f1fde4ca0480ab9d1e75a0e8c87c22b53a3ae66554f9af78f2fe8c", size = 2108225, upload-time = "2025-04-23T18:33:04.51Z" }, - { url = "https://files.pythonhosted.org/packages/f1/c5/00c02d1571913d496aabf146106ad8239dc132485ee22efe08085084ff7c/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:0e03262ab796d986f978f79c943fc5f620381be7287148b8010b4097f79a39ec", size = 2069490, upload-time = "2025-04-23T18:33:06.391Z" }, - { url = "https://files.pythonhosted.org/packages/22/a8/dccc38768274d3ed3a59b5d06f59ccb845778687652daa71df0cab4040d7/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:1a8695a8d00c73e50bff9dfda4d540b7dee29ff9b8053e38380426a85ef10052", size = 2237525, upload-time = "2025-04-23T18:33:08.44Z" }, - { url = "https://files.pythonhosted.org/packages/d4/e7/4f98c0b125dda7cf7ccd14ba936218397b44f50a56dd8c16a3091df116c3/pydantic_core-2.33.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:fa754d1850735a0b0e03bcffd9d4b4343eb417e47196e4485d9cca326073a42c", size = 2238446, upload-time = "2025-04-23T18:33:10.313Z" }, - { url = "https://files.pythonhosted.org/packages/ce/91/2ec36480fdb0b783cd9ef6795753c1dea13882f2e68e73bce76ae8c21e6a/pydantic_core-2.33.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a11c8d26a50bfab49002947d3d237abe4d9e4b5bdc8846a63537b6488e197808", size = 2066678, upload-time = "2025-04-23T18:33:12.224Z" }, - { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" }, - { url = "https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" }, - { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" }, - { url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" }, - { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" }, - { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" }, - { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, - { url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, - { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" }, ] [[package]] @@ -1737,25 +1786,11 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/bd/6a/6c1ac381ff0b8e03a9abc2f05722f6002d7452a2c05118697b3f3910e171/pyiceberg-0.9.1.tar.gz", hash = "sha256:3634134ce33859a441768b39df179b2c6f3de2bbbf506622884f553b013ee799", size = 617629, upload-time = "2025-04-30T14:59:34.306Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/5d/bb10c86b85895d4ba471b8a0e187031d4aaa82592a639242b83dd9354861/pyiceberg-0.9.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a183d9217eb82159c01b23c683057f96c8b2375f592b921721d1c157895e2df", size = 527097, upload-time = "2025-04-30T14:58:52.39Z" }, - { url = "https://files.pythonhosted.org/packages/ec/b9/1d6f0d334bc51cd64a58b7320d521e54af3810a6bd748fe2e89db1ad8d5f/pyiceberg-0.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:57030bb15c397b0379242907c5611f5b4338fb799e972353fd0edafde6cfd2ef", size = 523267, upload-time = "2025-04-30T14:58:53.978Z" }, - { url = "https://files.pythonhosted.org/packages/02/f5/bd43a9c1d2cd3aeb987cbf2b7f25e2b10306fa81522ea00df250fb23cc84/pyiceberg-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ba4cd9a8f6a04cfbc68e0c83f2db3ffd14244da8601a142cc05965d4b343645", size = 838616, upload-time = "2025-04-30T14:58:55.252Z" }, - { url = "https://files.pythonhosted.org/packages/d0/01/c68f9e03413dc983ddadc2c471038af2ff792449fc451731f58a958a7696/pyiceberg-0.9.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d5a48c6a2016d0dcde8c9079cc5e6b6d2e2ac663eddfe4697e7ea03a0edc40b7", size = 838290, upload-time = "2025-04-30T14:58:56.412Z" }, - { url = 
"https://files.pythonhosted.org/packages/ab/80/b7cba54a33b8b7be3655ff656d6bb8594fec0316eec5cafa231ec7f6ff74/pyiceberg-0.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:8bebfa5a804a95a9f3d98d88cbeb37430b09add04592238bba2a2b2e0466d60d", size = 523612, upload-time = "2025-04-30T14:58:59.507Z" }, - { url = "https://files.pythonhosted.org/packages/f6/75/c8b4ebba7d345b5e736ebf4976121b97dd7091dcad401a17ca57152704c5/pyiceberg-0.9.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e75c502dd56ac3d77036ce8a3b2566348da5ff4367c7c671981616ef6dcc883", size = 566274, upload-time = "2025-04-30T14:59:00.626Z" }, - { url = "https://files.pythonhosted.org/packages/e0/a0/9494c7930e5e4dc951d95abba584d8ffdb7403368398796ede21ff25c26f/pyiceberg-0.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0a8189c9b3ba81dd12493d6bb874a656a4d4909904552b97a629d1d43b3a0e90", size = 560157, upload-time = "2025-04-30T14:59:02.082Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d4/351776b1ae83de187d7cf37b100f4f124c7a71e35337182d3aef308156d1/pyiceberg-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c03065d5c5b704444ab8fb18cdd232ec43994db95b9e53444008ebc2cf9dc2c", size = 1052290, upload-time = "2025-04-30T14:59:03.232Z" }, - { url = "https://files.pythonhosted.org/packages/40/17/d8fea681afb52f20bf6a640f9044dcf621a47165f67cc5320bf3c6e82e4e/pyiceberg-0.9.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:93f2586a5da737de6e4643bf096a01772f068d1eedb7ffde6b36c60b6b9e6bd3", size = 1047503, upload-time = "2025-04-30T14:59:04.38Z" }, - { url = "https://files.pythonhosted.org/packages/d0/e0/d173fc2aa8dc252d7aac71703ba2c0491e4988b3a160cf5abb531cfb9086/pyiceberg-0.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:94e45c10051110ba7a43b85a1f0a680b4a31d1d6cee593c8e62e14d22d18c47d", size = 559491, upload-time = "2025-04-30T14:59:05.615Z" }, { url = "https://files.pythonhosted.org/packages/52/26/77983c2884b4a5f13f8a35e5c5e762ae699f6c511efd16730ab883000c1b/pyiceberg-0.9.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b8a958e3bbe919026533cee1f0fb6b7040928fce8d42c2ecea228de7c17578fa", size = 605755, upload-time = "2025-04-30T14:59:07.087Z" }, { url = "https://files.pythonhosted.org/packages/6d/67/e6ea7fcc43aebc85aea5a67a69d01c9015283478061c3121b6b8aa158ce4/pyiceberg-0.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7e956b35c6822600c45fd8f3ea8cfea328cc406fefa534afeb6fdb325d05406", size = 597325, upload-time = "2025-04-30T14:59:08.644Z" }, { url = "https://files.pythonhosted.org/packages/7f/cf/178a9f63fac1bfdd13bc85169e7ab903955d082e2cd80507b1921a6f64dc/pyiceberg-0.9.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e4e585164d7d86f5c9a609a1bc2abeae2f0ea0680a11a2064d3a945866b5311", size = 1277399, upload-time = "2025-04-30T14:59:10.193Z" }, { url = "https://files.pythonhosted.org/packages/d1/6b/78d1739eb1d5b18529ee438aed75dac3e0b246f5e4d800931f9d1e37cda2/pyiceberg-0.9.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fee08dac30e8524526f7d18468f9670f8606905b850b261314c597c6633f3b4", size = 1269083, upload-time = "2025-04-30T14:59:11.964Z" }, { url = "https://files.pythonhosted.org/packages/67/69/c0087d19c8d8e8530acee3ba485d54aedeebf2963784a16692ca4b439566/pyiceberg-0.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:124793c54a0c2fb5ac4ab19c38da116c068e277c85cbaa7e4064e635a70b595e", size = 595512, upload-time = "2025-04-30T14:59:14.464Z" }, - { url = 
"https://files.pythonhosted.org/packages/aa/62/0153ed3a39d6f4b3235d430123703d4684eec7ba780404bbc118ace7406a/pyiceberg-0.9.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:77aec1c77d675603e0c5358e74adcae8d13b323753d702011be3f309d26af355", size = 668261, upload-time = "2025-04-30T14:59:21.751Z" }, - { url = "https://files.pythonhosted.org/packages/24/bd/c4cec142686dd8124032c69b6b02ba3703abc114ce787d0f02088b1f43d8/pyiceberg-0.9.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:cf567438bf6267bbb67fdfdfc72ac500d523725fca9a6a38f93e8acd4146190e", size = 657439, upload-time = "2025-04-30T14:59:23.304Z" }, - { url = "https://files.pythonhosted.org/packages/ae/74/bbfc70bb1857f9d55d06fee1330a0236876b8ae4aa6fc5d815e2c4fef4f7/pyiceberg-0.9.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5992db7c00d789a33ff117700d453126803e769507a5edeb79bb6510ff72fc00", size = 1352983, upload-time = "2025-04-30T14:59:25.023Z" }, - { url = "https://files.pythonhosted.org/packages/90/20/e33e1716d1368b2471b80d9f1e338110f1e781b34ebffc5e320523102ffc/pyiceberg-0.9.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e460fca26162a3822c0e8d50b49c80928a0e35cb41698748d7a26f8c016215", size = 657563, upload-time = "2025-04-30T14:59:27.004Z" }, ] [[package]] @@ -1782,11 +1817,9 @@ version = "8.3.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "iniconfig" }, { name = "packaging" }, { name = "pluggy" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" } wheels = [ @@ -1798,7 +1831,7 @@ name = "pytest-cov" version = "6.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "coverage", extra = ["toml"] }, + { name = "coverage" }, { name = "pytest" }, ] sdist = { url = "https://files.pythonhosted.org/packages/25/69/5f1e57f6c5a39f81411b550027bf72842c4567ff5fd572bed1edc9e4b5d9/pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a", size = 66857, upload-time = "2025-04-05T14:07:51.592Z" } @@ -1832,12 +1865,6 @@ name = "pywin32" version = "310" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/95/da/a5f38fffbba2fb99aa4aa905480ac4b8e83ca486659ac8c95bce47fb5276/pywin32-310-cp310-cp310-win32.whl", hash = "sha256:6dd97011efc8bf51d6793a82292419eba2c71cf8e7250cfac03bba284454abc1", size = 8848240, upload-time = "2025-03-17T00:55:46.783Z" }, - { url = "https://files.pythonhosted.org/packages/aa/fe/d873a773324fa565619ba555a82c9dabd677301720f3660a731a5d07e49a/pywin32-310-cp310-cp310-win_amd64.whl", hash = "sha256:c3e78706e4229b915a0821941a84e7ef420bf2b77e08c9dae3c76fd03fd2ae3d", size = 9601854, upload-time = "2025-03-17T00:55:48.783Z" }, - { url = "https://files.pythonhosted.org/packages/3c/84/1a8e3d7a15490d28a5d816efa229ecb4999cdc51a7c30dd8914f669093b8/pywin32-310-cp310-cp310-win_arm64.whl", hash = "sha256:33babed0cf0c92a6f94cc6cc13546ab24ee13e3e800e61ed87609ab91e4c8213", size = 8522963, upload-time = "2025-03-17T00:55:50.969Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/b1/68aa2986129fb1011dabbe95f0136f44509afaf072b12b8f815905a39f33/pywin32-310-cp311-cp311-win32.whl", hash = "sha256:1e765f9564e83011a63321bb9d27ec456a0ed90d3732c4b2e312b855365ed8bd", size = 8784284, upload-time = "2025-03-17T00:55:53.124Z" }, - { url = "https://files.pythonhosted.org/packages/b3/bd/d1592635992dd8db5bb8ace0551bc3a769de1ac8850200cfa517e72739fb/pywin32-310-cp311-cp311-win_amd64.whl", hash = "sha256:126298077a9d7c95c53823934f000599f66ec9296b09167810eb24875f32689c", size = 9520748, upload-time = "2025-03-17T00:55:55.203Z" }, - { url = "https://files.pythonhosted.org/packages/90/b1/ac8b1ffce6603849eb45a91cf126c0fa5431f186c2e768bf56889c46f51c/pywin32-310-cp311-cp311-win_arm64.whl", hash = "sha256:19ec5fc9b1d51c4350be7bb00760ffce46e6c95eaf2f0b2f1150657b1a43c582", size = 8455941, upload-time = "2025-03-17T00:55:57.048Z" }, { url = "https://files.pythonhosted.org/packages/6b/ec/4fdbe47932f671d6e348474ea35ed94227fb5df56a7c30cbbb42cd396ed0/pywin32-310-cp312-cp312-win32.whl", hash = "sha256:8a75a5cc3893e83a108c05d82198880704c44bbaee4d06e442e471d3c9ea4f3d", size = 8796239, upload-time = "2025-03-17T00:55:58.807Z" }, { url = "https://files.pythonhosted.org/packages/e3/e5/b0627f8bb84e06991bea89ad8153a9e50ace40b2e1195d68e9dff6b03d0f/pywin32-310-cp312-cp312-win_amd64.whl", hash = "sha256:bf5c397c9a9a19a6f62f3fb821fbf36cac08f03770056711f765ec1503972060", size = 9503839, upload-time = "2025-03-17T00:56:00.8Z" }, { url = "https://files.pythonhosted.org/packages/1f/32/9ccf53748df72301a89713936645a664ec001abd35ecc8578beda593d37d/pywin32-310-cp312-cp312-win_arm64.whl", hash = "sha256:2349cc906eae872d0663d4d6290d13b90621eaf78964bb1578632ff20e152966", size = 8459470, upload-time = "2025-03-17T00:56:02.601Z" }, @@ -1852,24 +1879,6 @@ version = "6.0.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/95/a3fac87cb7158e231b5a6012e438c647e1a87f09f8e0d123acec8ab8bf71/PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086", size = 184199, upload-time = "2024-08-06T20:31:40.178Z" }, - { url = "https://files.pythonhosted.org/packages/c7/7a/68bd47624dab8fd4afbfd3c48e3b79efe09098ae941de5b58abcbadff5cb/PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf", size = 171758, upload-time = "2024-08-06T20:31:42.173Z" }, - { url = "https://files.pythonhosted.org/packages/49/ee/14c54df452143b9ee9f0f29074d7ca5516a36edb0b4cc40c3f280131656f/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237", size = 718463, upload-time = "2024-08-06T20:31:44.263Z" }, - { url = "https://files.pythonhosted.org/packages/4d/61/de363a97476e766574650d742205be468921a7b532aa2499fcd886b62530/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b", size = 719280, upload-time = "2024-08-06T20:31:50.199Z" }, - { url = 
"https://files.pythonhosted.org/packages/6b/4e/1523cb902fd98355e2e9ea5e5eb237cbc5f3ad5f3075fa65087aa0ecb669/PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed", size = 751239, upload-time = "2024-08-06T20:31:52.292Z" }, - { url = "https://files.pythonhosted.org/packages/b7/33/5504b3a9a4464893c32f118a9cc045190a91637b119a9c881da1cf6b7a72/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180", size = 695802, upload-time = "2024-08-06T20:31:53.836Z" }, - { url = "https://files.pythonhosted.org/packages/5c/20/8347dcabd41ef3a3cdc4f7b7a2aff3d06598c8779faa189cdbf878b626a4/PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68", size = 720527, upload-time = "2024-08-06T20:31:55.565Z" }, - { url = "https://files.pythonhosted.org/packages/be/aa/5afe99233fb360d0ff37377145a949ae258aaab831bde4792b32650a4378/PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99", size = 144052, upload-time = "2024-08-06T20:31:56.914Z" }, - { url = "https://files.pythonhosted.org/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e", size = 161774, upload-time = "2024-08-06T20:31:58.304Z" }, - { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612, upload-time = "2024-08-06T20:32:03.408Z" }, - { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040, upload-time = "2024-08-06T20:32:04.926Z" }, - { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829, upload-time = "2024-08-06T20:32:06.459Z" }, - { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167, upload-time = "2024-08-06T20:32:08.338Z" }, - { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952, upload-time = "2024-08-06T20:32:14.124Z" }, - { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301, upload-time = "2024-08-06T20:32:16.17Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638, upload-time = "2024-08-06T20:32:18.555Z" }, - { url = "https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850, upload-time = "2024-08-06T20:32:19.889Z" }, - { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980, upload-time = "2024-08-06T20:32:21.273Z" }, { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" }, { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" }, { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" }, @@ -1899,28 +1908,6 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/b1/11/b9213d25230ac18a71b39b3723494e57adebe36e066397b961657b3b41c1/pyzmq-26.4.0.tar.gz", hash = "sha256:4bd13f85f80962f91a651a7356fe0472791a5f7a92f227822b5acf44795c626d", size = 278293, upload-time = "2025-04-04T12:05:44.049Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/b8/af1d814ffc3ff9730f9a970cbf216b6f078e5d251a25ef5201d7bc32a37c/pyzmq-26.4.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:0329bdf83e170ac133f44a233fc651f6ed66ef8e66693b5af7d54f45d1ef5918", size = 1339238, upload-time = "2025-04-04T12:03:07.022Z" }, - { url = "https://files.pythonhosted.org/packages/ee/e4/5aafed4886c264f2ea6064601ad39c5fc4e9b6539c6ebe598a859832eeee/pyzmq-26.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:398a825d2dea96227cf6460ce0a174cf7657d6f6827807d4d1ae9d0f9ae64315", size = 672848, upload-time = "2025-04-04T12:03:08.591Z" }, - { url = "https://files.pythonhosted.org/packages/79/39/026bf49c721cb42f1ef3ae0ee3d348212a7621d2adb739ba97599b6e4d50/pyzmq-26.4.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6d52d62edc96787f5c1dfa6c6ccff9b581cfae5a70d94ec4c8da157656c73b5b", size = 911299, upload-time = "2025-04-04T12:03:10Z" }, - { url = "https://files.pythonhosted.org/packages/03/23/b41f936a9403b8f92325c823c0f264c6102a0687a99c820f1aaeb99c1def/pyzmq-26.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1410c3a3705db68d11eb2424d75894d41cff2f64d948ffe245dd97a9debfebf4", size = 867920, upload-time = "2025-04-04T12:03:11.311Z" }, - { url = 
"https://files.pythonhosted.org/packages/c1/3e/2de5928cdadc2105e7c8f890cc5f404136b41ce5b6eae5902167f1d5641c/pyzmq-26.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:7dacb06a9c83b007cc01e8e5277f94c95c453c5851aac5e83efe93e72226353f", size = 862514, upload-time = "2025-04-04T12:03:13.013Z" }, - { url = "https://files.pythonhosted.org/packages/ce/57/109569514dd32e05a61d4382bc88980c95bfd2f02e58fea47ec0ccd96de1/pyzmq-26.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6bab961c8c9b3a4dc94d26e9b2cdf84de9918931d01d6ff38c721a83ab3c0ef5", size = 1204494, upload-time = "2025-04-04T12:03:14.795Z" }, - { url = "https://files.pythonhosted.org/packages/aa/02/dc51068ff2ca70350d1151833643a598625feac7b632372d229ceb4de3e1/pyzmq-26.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a5c09413b924d96af2aa8b57e76b9b0058284d60e2fc3730ce0f979031d162a", size = 1514525, upload-time = "2025-04-04T12:03:16.246Z" }, - { url = "https://files.pythonhosted.org/packages/48/2a/a7d81873fff0645eb60afaec2b7c78a85a377af8f1d911aff045d8955bc7/pyzmq-26.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7d489ac234d38e57f458fdbd12a996bfe990ac028feaf6f3c1e81ff766513d3b", size = 1414659, upload-time = "2025-04-04T12:03:17.652Z" }, - { url = "https://files.pythonhosted.org/packages/ef/ea/813af9c42ae21845c1ccfe495bd29c067622a621e85d7cda6bc437de8101/pyzmq-26.4.0-cp310-cp310-win32.whl", hash = "sha256:dea1c8db78fb1b4b7dc9f8e213d0af3fc8ecd2c51a1d5a3ca1cde1bda034a980", size = 580348, upload-time = "2025-04-04T12:03:19.384Z" }, - { url = "https://files.pythonhosted.org/packages/20/68/318666a89a565252c81d3fed7f3b4c54bd80fd55c6095988dfa2cd04a62b/pyzmq-26.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:fa59e1f5a224b5e04dc6c101d7186058efa68288c2d714aa12d27603ae93318b", size = 643838, upload-time = "2025-04-04T12:03:20.795Z" }, - { url = "https://files.pythonhosted.org/packages/91/f8/fb1a15b5f4ecd3e588bfde40c17d32ed84b735195b5c7d1d7ce88301a16f/pyzmq-26.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:a651fe2f447672f4a815e22e74630b6b1ec3a1ab670c95e5e5e28dcd4e69bbb5", size = 559565, upload-time = "2025-04-04T12:03:22.676Z" }, - { url = "https://files.pythonhosted.org/packages/32/6d/234e3b0aa82fd0290b1896e9992f56bdddf1f97266110be54d0177a9d2d9/pyzmq-26.4.0-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:bfcf82644c9b45ddd7cd2a041f3ff8dce4a0904429b74d73a439e8cab1bd9e54", size = 1339723, upload-time = "2025-04-04T12:03:24.358Z" }, - { url = "https://files.pythonhosted.org/packages/4f/11/6d561efe29ad83f7149a7cd48e498e539ed09019c6cd7ecc73f4cc725028/pyzmq-26.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9bcae3979b2654d5289d3490742378b2f3ce804b0b5fd42036074e2bf35b030", size = 672645, upload-time = "2025-04-04T12:03:25.693Z" }, - { url = "https://files.pythonhosted.org/packages/19/fd/81bfe3e23f418644660bad1a90f0d22f0b3eebe33dd65a79385530bceb3d/pyzmq-26.4.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ccdff8ac4246b6fb60dcf3982dfaeeff5dd04f36051fe0632748fc0aa0679c01", size = 910133, upload-time = "2025-04-04T12:03:27.625Z" }, - { url = "https://files.pythonhosted.org/packages/97/68/321b9c775595ea3df832a9516252b653fe32818db66fdc8fa31c9b9fce37/pyzmq-26.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4550af385b442dc2d55ab7717837812799d3674cb12f9a3aa897611839c18e9e", size = 867428, upload-time = "2025-04-04T12:03:29.004Z" }, - { url = 
"https://files.pythonhosted.org/packages/4e/6e/159cbf2055ef36aa2aa297e01b24523176e5b48ead283c23a94179fb2ba2/pyzmq-26.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f7ffe9db1187a253fca95191854b3fda24696f086e8789d1d449308a34b88", size = 862409, upload-time = "2025-04-04T12:03:31.032Z" }, - { url = "https://files.pythonhosted.org/packages/05/1c/45fb8db7be5a7d0cadea1070a9cbded5199a2d578de2208197e592f219bd/pyzmq-26.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3709c9ff7ba61589b7372923fd82b99a81932b592a5c7f1a24147c91da9a68d6", size = 1205007, upload-time = "2025-04-04T12:03:32.687Z" }, - { url = "https://files.pythonhosted.org/packages/f8/fa/658c7f583af6498b463f2fa600f34e298e1b330886f82f1feba0dc2dd6c3/pyzmq-26.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f8f3c30fb2d26ae5ce36b59768ba60fb72507ea9efc72f8f69fa088450cff1df", size = 1514599, upload-time = "2025-04-04T12:03:34.084Z" }, - { url = "https://files.pythonhosted.org/packages/4d/d7/44d641522353ce0a2bbd150379cb5ec32f7120944e6bfba4846586945658/pyzmq-26.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:382a4a48c8080e273427fc692037e3f7d2851959ffe40864f2db32646eeb3cef", size = 1414546, upload-time = "2025-04-04T12:03:35.478Z" }, - { url = "https://files.pythonhosted.org/packages/72/76/c8ed7263218b3d1e9bce07b9058502024188bd52cc0b0a267a9513b431fc/pyzmq-26.4.0-cp311-cp311-win32.whl", hash = "sha256:d56aad0517d4c09e3b4f15adebba8f6372c5102c27742a5bdbfc74a7dceb8fca", size = 579247, upload-time = "2025-04-04T12:03:36.846Z" }, - { url = "https://files.pythonhosted.org/packages/c3/d0/2d9abfa2571a0b1a67c0ada79a8aa1ba1cce57992d80f771abcdf99bb32c/pyzmq-26.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:963977ac8baed7058c1e126014f3fe58b3773f45c78cce7af5c26c09b6823896", size = 644727, upload-time = "2025-04-04T12:03:38.578Z" }, - { url = "https://files.pythonhosted.org/packages/0d/d1/c8ad82393be6ccedfc3c9f3adb07f8f3976e3c4802640fe3f71441941e70/pyzmq-26.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:c0c8e8cadc81e44cc5088fcd53b9b3b4ce9344815f6c4a03aec653509296fae3", size = 559942, upload-time = "2025-04-04T12:03:40.143Z" }, { url = "https://files.pythonhosted.org/packages/10/44/a778555ebfdf6c7fc00816aad12d185d10a74d975800341b1bc36bad1187/pyzmq-26.4.0-cp312-cp312-macosx_10_15_universal2.whl", hash = "sha256:5227cb8da4b6f68acfd48d20c588197fd67745c278827d5238c707daf579227b", size = 1341586, upload-time = "2025-04-04T12:03:41.954Z" }, { url = "https://files.pythonhosted.org/packages/9c/4f/f3a58dc69ac757e5103be3bd41fb78721a5e17da7cc617ddb56d973a365c/pyzmq-26.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1c07a7fa7f7ba86554a2b1bef198c9fed570c08ee062fd2fd6a4dcacd45f905", size = 665880, upload-time = "2025-04-04T12:03:43.45Z" }, { url = "https://files.pythonhosted.org/packages/fe/45/50230bcfb3ae5cb98bee683b6edeba1919f2565d7cc1851d3c38e2260795/pyzmq-26.4.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae775fa83f52f52de73183f7ef5395186f7105d5ed65b1ae65ba27cb1260de2b", size = 902216, upload-time = "2025-04-04T12:03:45.572Z" }, @@ -1951,25 +1938,56 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/bc/f88b0bad0f7a7f500547d71e99f10336f2314e525d4ebf576a1ea4a1d903/pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:b30f862f6768b17040929a68432c8a8be77780317f45a353cb17e423127d250c", size = 1189183, upload-time = "2025-04-04T12:04:27.035Z" }, { url = 
"https://files.pythonhosted.org/packages/d9/8c/db446a3dd9cf894406dec2e61eeffaa3c07c3abb783deaebb9812c4af6a5/pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_i686.whl", hash = "sha256:c80fcd3504232f13617c6ab501124d373e4895424e65de8b72042333316f64a8", size = 1495501, upload-time = "2025-04-04T12:04:28.833Z" }, { url = "https://files.pythonhosted.org/packages/05/4c/bf3cad0d64c3214ac881299c4562b815f05d503bccc513e3fd4fdc6f67e4/pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:26a2a7451606b87f67cdeca2c2789d86f605da08b4bd616b1a9981605ca3a364", size = 1395540, upload-time = "2025-04-04T12:04:30.562Z" }, - { url = "https://files.pythonhosted.org/packages/47/03/96004704a84095f493be8d2b476641f5c967b269390173f85488a53c1c13/pyzmq-26.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:98d948288ce893a2edc5ec3c438fe8de2daa5bbbd6e2e865ec5f966e237084ba", size = 834408, upload-time = "2025-04-04T12:05:04.569Z" }, - { url = "https://files.pythonhosted.org/packages/e4/7f/68d8f3034a20505db7551cb2260248be28ca66d537a1ac9a257913d778e4/pyzmq-26.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9f34f5c9e0203ece706a1003f1492a56c06c0632d86cb77bcfe77b56aacf27b", size = 569580, upload-time = "2025-04-04T12:05:06.283Z" }, - { url = "https://files.pythonhosted.org/packages/9b/a6/2b0d6801ec33f2b2a19dd8d02e0a1e8701000fec72926e6787363567d30c/pyzmq-26.4.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80c9b48aef586ff8b698359ce22f9508937c799cc1d2c9c2f7c95996f2300c94", size = 798250, upload-time = "2025-04-04T12:05:07.88Z" }, - { url = "https://files.pythonhosted.org/packages/96/2a/0322b3437de977dcac8a755d6d7ce6ec5238de78e2e2d9353730b297cf12/pyzmq-26.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f2a5b74009fd50b53b26f65daff23e9853e79aa86e0aa08a53a7628d92d44a", size = 756758, upload-time = "2025-04-04T12:05:09.483Z" }, - { url = "https://files.pythonhosted.org/packages/c2/33/43704f066369416d65549ccee366cc19153911bec0154da7c6b41fca7e78/pyzmq-26.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:61c5f93d7622d84cb3092d7f6398ffc77654c346545313a3737e266fc11a3beb", size = 555371, upload-time = "2025-04-04T12:05:11.062Z" }, - { url = "https://files.pythonhosted.org/packages/04/52/a70fcd5592715702248306d8e1729c10742c2eac44529984413b05c68658/pyzmq-26.4.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4478b14cb54a805088299c25a79f27eaf530564a7a4f72bf432a040042b554eb", size = 834405, upload-time = "2025-04-04T12:05:13.3Z" }, - { url = "https://files.pythonhosted.org/packages/25/f9/1a03f1accff16b3af1a6fa22cbf7ced074776abbf688b2e9cb4629700c62/pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a28ac29c60e4ba84b5f58605ace8ad495414a724fe7aceb7cf06cd0598d04e1", size = 569578, upload-time = "2025-04-04T12:05:15.36Z" }, - { url = "https://files.pythonhosted.org/packages/76/0c/3a633acd762aa6655fcb71fa841907eae0ab1e8582ff494b137266de341d/pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43b03c1ceea27c6520124f4fb2ba9c647409b9abdf9a62388117148a90419494", size = 798248, upload-time = "2025-04-04T12:05:17.376Z" }, - { url = "https://files.pythonhosted.org/packages/cd/cc/6c99c84aa60ac1cc56747bed6be8ce6305b9b861d7475772e7a25ce019d3/pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7731abd23a782851426d4e37deb2057bf9410848a4459b5ede4fe89342e687a9", size = 756757, upload-time = 
"2025-04-04T12:05:19.19Z" }, - { url = "https://files.pythonhosted.org/packages/13/9c/d8073bd898eb896e94c679abe82e47506e2b750eb261cf6010ced869797c/pyzmq-26.4.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a222ad02fbe80166b0526c038776e8042cd4e5f0dec1489a006a1df47e9040e0", size = 555371, upload-time = "2025-04-04T12:05:20.702Z" }, ] [[package]] -name = "redis" -version = "6.2.0" +name = "ray" +version = "2.48.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "async-timeout", marker = "python_full_version < '3.11.3'" }, + { name = "click" }, + { name = "filelock" }, + { name = "jsonschema" }, + { name = "msgpack" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "pyyaml" }, + { name = "requests" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/53/0d105e1baa6c8c9582f90154ba3f0ca08d58129384ea2707b2e59449b03b/ray-2.48.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:8de799f3b0896f48d306d5e4a04fc6037a08c495d45f9c79935344e5693e3cf8", size = 67302857, upload-time = "2025-07-18T22:33:06.414Z" }, + { url = "https://files.pythonhosted.org/packages/df/c5/7de1e9d92a45b1805fe828dcbd18b4c5a1f35ab3cad9134efeb20a3ab3e5/ray-2.48.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5a6f57126eac9dd3286289e07e91e87b054792f9698b6f7ccab88b624816b542", size = 69823198, upload-time = "2025-07-18T22:33:12.494Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a6/e7c969bd371c65b7c233d86f23610489e15164ee7eadb3eb78f9d55eda4d/ray-2.48.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:f1cf33d260316f92f77558185f1c36fc35506d76ee7fdfed9f5b70f9c4bdba7f", size = 69151702, upload-time = "2025-07-18T22:33:18.655Z" }, + { url = "https://files.pythonhosted.org/packages/61/02/1894be2ab930b599de0f1f77f785b86c78bda4873c6c2dd65d1de5b40837/ray-2.48.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:a42ed3b640f4b599a3fc8067c83ee60497c0f03d070d7a7df02a388fa17a546b", size = 70124265, upload-time = "2025-07-18T22:33:25.155Z" }, + { url = "https://files.pythonhosted.org/packages/79/8c/d3653d17337fc787af108411d9c9a38333c9fbdf247283ee56dd096d3360/ray-2.48.0-cp312-cp312-win_amd64.whl", hash = "sha256:e15fdffa6b60d5729f6025691396b8a01dc3461ba19dc92bba354ec1813ed6b1", size = 26745570, upload-time = "2025-07-18T22:33:31.328Z" }, + { url = "https://files.pythonhosted.org/packages/d9/7f/0dc9f5464181ecad93ec2d6f106084d46e5c5ec9a8718c1ba60610ea65fe/ray-2.48.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a7a6d830d9dc5ae8bb156fcde9a1adab7f4edb004f03918a724d885eceb8264d", size = 67250116, upload-time = "2025-07-18T22:33:36.572Z" }, + { url = "https://files.pythonhosted.org/packages/22/ef/bf5dc762663475fc40680f44df716c553f5d619c6648c8b43ccde00f13ce/ray-2.48.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:5742b72a514afe5d60f41330200cd508376e16c650f6962e62337aa482d6a0c6", size = 69763475, upload-time = "2025-07-18T22:33:42.297Z" }, + { url = "https://files.pythonhosted.org/packages/f3/7c/498ceb9684971cb5c9722a2c8400919cd886473b77416c23c23e4e7ddc67/ray-2.48.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:622e6bcdb78d98040d87bea94e65d0bb6ccc0ae1b43294c6bd69f542bf28e092", size = 69062026, upload-time = "2025-07-18T22:33:48.058Z" }, + { url = "https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:25e4b79fcc8f849d72db1acc4f03f37008c5c0b745df63d8a30cd35676b6545e", size = 70039793, upload-time = "2025-07-18T22:33:54.072Z" }, +] 
+ +[package.optional-dependencies] +default = [ + { name = "aiohttp" }, + { name = "aiohttp-cors" }, + { name = "colorful" }, + { name = "grpcio" }, + { name = "opencensus" }, + { name = "opentelemetry-exporter-prometheus" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "prometheus-client" }, + { name = "py-spy" }, + { name = "pydantic" }, + { name = "requests" }, + { name = "smart-open" }, + { name = "virtualenv" }, ] + +[[package]] +name = "redis" +version = "6.2.0" +source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/ea/9a/0551e01ba52b944f97480721656578c8a7c46b51b99d66814f85fe3a4f3e/redis-6.2.0.tar.gz", hash = "sha256:e821f129b75dde6cb99dd35e5c76e8c49512a5a0d8dfdc560b2fbd44b85ca977", size = 4639129, upload-time = "2025-05-28T05:01:18.91Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/13/67/e60968d3b0e077495a8fee89cf3f2373db98e528288a48f1ee44967f6e8c/redis-6.2.0-py3-none-any.whl", hash = "sha256:c8ddf316ee0aab65f04a11229e94a64b2618451dab7a67cb2f77eb799d872d5e", size = 278659, upload-time = "2025-05-28T05:01:16.955Z" }, @@ -2028,7 +2046,6 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "markdown-it-py" }, { name = "pygments" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149, upload-time = "2024-11-01T16:43:57.873Z" } wheels = [ @@ -2041,33 +2058,6 @@ version = "0.26.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a5/aa/4456d84bbb54adc6a916fb10c9b374f78ac840337644e4a5eda229c81275/rpds_py-0.26.0.tar.gz", hash = "sha256:20dae58a859b0906f0685642e591056f1e787f3a8b39c8e8749a45dc7d26bdb0", size = 27385, upload-time = "2025-07-01T15:57:13.958Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b9/31/1459645f036c3dfeacef89e8e5825e430c77dde8489f3b99eaafcd4a60f5/rpds_py-0.26.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4c70c70f9169692b36307a95f3d8c0a9fcd79f7b4a383aad5eaa0e9718b79b37", size = 372466, upload-time = "2025-07-01T15:53:40.55Z" }, - { url = "https://files.pythonhosted.org/packages/dd/ff/3d0727f35836cc8773d3eeb9a46c40cc405854e36a8d2e951f3a8391c976/rpds_py-0.26.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:777c62479d12395bfb932944e61e915741e364c843afc3196b694db3d669fcd0", size = 357825, upload-time = "2025-07-01T15:53:42.247Z" }, - { url = "https://files.pythonhosted.org/packages/bf/ce/badc5e06120a54099ae287fa96d82cbb650a5f85cf247ffe19c7b157fd1f/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec671691e72dff75817386aa02d81e708b5a7ec0dec6669ec05213ff6b77e1bd", size = 381530, upload-time = "2025-07-01T15:53:43.585Z" }, - { url = "https://files.pythonhosted.org/packages/1e/a5/fa5d96a66c95d06c62d7a30707b6a4cfec696ab8ae280ee7be14e961e118/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6a1cb5d6ce81379401bbb7f6dbe3d56de537fb8235979843f0d53bc2e9815a79", size = 396933, upload-time = "2025-07-01T15:53:45.78Z" }, - { url = "https://files.pythonhosted.org/packages/00/a7/7049d66750f18605c591a9db47d4a059e112a0c9ff8de8daf8fa0f446bba/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:4f789e32fa1fb6a7bf890e0124e7b42d1e60d28ebff57fe806719abb75f0e9a3", size = 513973, upload-time = "2025-07-01T15:53:47.085Z" }, - { url = "https://files.pythonhosted.org/packages/0e/f1/528d02c7d6b29d29fac8fd784b354d3571cc2153f33f842599ef0cf20dd2/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c55b0a669976cf258afd718de3d9ad1b7d1fe0a91cd1ab36f38b03d4d4aeaaf", size = 402293, upload-time = "2025-07-01T15:53:48.117Z" }, - { url = "https://files.pythonhosted.org/packages/15/93/fde36cd6e4685df2cd08508f6c45a841e82f5bb98c8d5ecf05649522acb5/rpds_py-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c70d9ec912802ecfd6cd390dadb34a9578b04f9bcb8e863d0a7598ba5e9e7ccc", size = 383787, upload-time = "2025-07-01T15:53:50.874Z" }, - { url = "https://files.pythonhosted.org/packages/69/f2/5007553aaba1dcae5d663143683c3dfd03d9395289f495f0aebc93e90f24/rpds_py-0.26.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3021933c2cb7def39d927b9862292e0f4c75a13d7de70eb0ab06efed4c508c19", size = 416312, upload-time = "2025-07-01T15:53:52.046Z" }, - { url = "https://files.pythonhosted.org/packages/8f/a7/ce52c75c1e624a79e48a69e611f1c08844564e44c85db2b6f711d76d10ce/rpds_py-0.26.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8a7898b6ca3b7d6659e55cdac825a2e58c638cbf335cde41f4619e290dd0ad11", size = 558403, upload-time = "2025-07-01T15:53:53.192Z" }, - { url = "https://files.pythonhosted.org/packages/79/d5/e119db99341cc75b538bf4cb80504129fa22ce216672fb2c28e4a101f4d9/rpds_py-0.26.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:12bff2ad9447188377f1b2794772f91fe68bb4bbfa5a39d7941fbebdbf8c500f", size = 588323, upload-time = "2025-07-01T15:53:54.336Z" }, - { url = "https://files.pythonhosted.org/packages/93/94/d28272a0b02f5fe24c78c20e13bbcb95f03dc1451b68e7830ca040c60bd6/rpds_py-0.26.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:191aa858f7d4902e975d4cf2f2d9243816c91e9605070aeb09c0a800d187e323", size = 554541, upload-time = "2025-07-01T15:53:55.469Z" }, - { url = "https://files.pythonhosted.org/packages/93/e0/8c41166602f1b791da892d976057eba30685486d2e2c061ce234679c922b/rpds_py-0.26.0-cp310-cp310-win32.whl", hash = "sha256:b37a04d9f52cb76b6b78f35109b513f6519efb481d8ca4c321f6a3b9580b3f45", size = 220442, upload-time = "2025-07-01T15:53:56.524Z" }, - { url = "https://files.pythonhosted.org/packages/87/f0/509736bb752a7ab50fb0270c2a4134d671a7b3038030837e5536c3de0e0b/rpds_py-0.26.0-cp310-cp310-win_amd64.whl", hash = "sha256:38721d4c9edd3eb6670437d8d5e2070063f305bfa2d5aa4278c51cedcd508a84", size = 231314, upload-time = "2025-07-01T15:53:57.842Z" }, - { url = "https://files.pythonhosted.org/packages/09/4c/4ee8f7e512030ff79fda1df3243c88d70fc874634e2dbe5df13ba4210078/rpds_py-0.26.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9e8cb77286025bdb21be2941d64ac6ca016130bfdcd228739e8ab137eb4406ed", size = 372610, upload-time = "2025-07-01T15:53:58.844Z" }, - { url = "https://files.pythonhosted.org/packages/fa/9d/3dc16be00f14fc1f03c71b1d67c8df98263ab2710a2fbd65a6193214a527/rpds_py-0.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e09330b21d98adc8ccb2dbb9fc6cb434e8908d4c119aeaa772cb1caab5440a0", size = 358032, upload-time = "2025-07-01T15:53:59.985Z" }, - { url = "https://files.pythonhosted.org/packages/e7/5a/7f1bf8f045da2866324a08ae80af63e64e7bfaf83bd31f865a7b91a58601/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:2c9c1b92b774b2e68d11193dc39620d62fd8ab33f0a3c77ecdabe19c179cdbc1", size = 381525, upload-time = "2025-07-01T15:54:01.162Z" }, - { url = "https://files.pythonhosted.org/packages/45/8a/04479398c755a066ace10e3d158866beb600867cacae194c50ffa783abd0/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:824e6d3503ab990d7090768e4dfd9e840837bae057f212ff9f4f05ec6d1975e7", size = 397089, upload-time = "2025-07-01T15:54:02.319Z" }, - { url = "https://files.pythonhosted.org/packages/72/88/9203f47268db488a1b6d469d69c12201ede776bb728b9d9f29dbfd7df406/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ad7fd2258228bf288f2331f0a6148ad0186b2e3643055ed0db30990e59817a6", size = 514255, upload-time = "2025-07-01T15:54:03.38Z" }, - { url = "https://files.pythonhosted.org/packages/f5/b4/01ce5d1e853ddf81fbbd4311ab1eff0b3cf162d559288d10fd127e2588b5/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0dc23bbb3e06ec1ea72d515fb572c1fea59695aefbffb106501138762e1e915e", size = 402283, upload-time = "2025-07-01T15:54:04.923Z" }, - { url = "https://files.pythonhosted.org/packages/34/a2/004c99936997bfc644d590a9defd9e9c93f8286568f9c16cdaf3e14429a7/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d80bf832ac7b1920ee29a426cdca335f96a2b5caa839811803e999b41ba9030d", size = 383881, upload-time = "2025-07-01T15:54:06.482Z" }, - { url = "https://files.pythonhosted.org/packages/05/1b/ef5fba4a8f81ce04c427bfd96223f92f05e6cd72291ce9d7523db3b03a6c/rpds_py-0.26.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0919f38f5542c0a87e7b4afcafab6fd2c15386632d249e9a087498571250abe3", size = 415822, upload-time = "2025-07-01T15:54:07.605Z" }, - { url = "https://files.pythonhosted.org/packages/16/80/5c54195aec456b292f7bd8aa61741c8232964063fd8a75fdde9c1e982328/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d422b945683e409000c888e384546dbab9009bb92f7c0b456e217988cf316107", size = 558347, upload-time = "2025-07-01T15:54:08.591Z" }, - { url = "https://files.pythonhosted.org/packages/f2/1c/1845c1b1fd6d827187c43afe1841d91678d7241cbdb5420a4c6de180a538/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:77a7711fa562ba2da1aa757e11024ad6d93bad6ad7ede5afb9af144623e5f76a", size = 587956, upload-time = "2025-07-01T15:54:09.963Z" }, - { url = "https://files.pythonhosted.org/packages/2e/ff/9e979329dd131aa73a438c077252ddabd7df6d1a7ad7b9aacf6261f10faa/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238e8c8610cb7c29460e37184f6799547f7e09e6a9bdbdab4e8edb90986a2318", size = 554363, upload-time = "2025-07-01T15:54:11.073Z" }, - { url = "https://files.pythonhosted.org/packages/00/8b/d78cfe034b71ffbe72873a136e71acc7a831a03e37771cfe59f33f6de8a2/rpds_py-0.26.0-cp311-cp311-win32.whl", hash = "sha256:893b022bfbdf26d7bedb083efeea624e8550ca6eb98bf7fea30211ce95b9201a", size = 220123, upload-time = "2025-07-01T15:54:12.382Z" }, - { url = "https://files.pythonhosted.org/packages/94/c1/3c8c94c7dd3905dbfde768381ce98778500a80db9924731d87ddcdb117e9/rpds_py-0.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:87a5531de9f71aceb8af041d72fc4cab4943648d91875ed56d2e629bef6d4c03", size = 231732, upload-time = "2025-07-01T15:54:13.434Z" }, - { url = "https://files.pythonhosted.org/packages/67/93/e936fbed1b734eabf36ccb5d93c6a2e9246fbb13c1da011624b7286fae3e/rpds_py-0.26.0-cp311-cp311-win_arm64.whl", hash = 
"sha256:de2713f48c1ad57f89ac25b3cb7daed2156d8e822cf0eca9b96a6f990718cc41", size = 221917, upload-time = "2025-07-01T15:54:14.559Z" }, { url = "https://files.pythonhosted.org/packages/ea/86/90eb87c6f87085868bd077c7a9938006eb1ce19ed4d06944a90d3560fce2/rpds_py-0.26.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:894514d47e012e794f1350f076c427d2347ebf82f9b958d554d12819849a369d", size = 363933, upload-time = "2025-07-01T15:54:15.734Z" }, { url = "https://files.pythonhosted.org/packages/63/78/4469f24d34636242c924626082b9586f064ada0b5dbb1e9d096ee7a8e0c6/rpds_py-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc921b96fa95a097add244da36a1d9e4f3039160d1d30f1b35837bf108c21136", size = 350447, upload-time = "2025-07-01T15:54:16.922Z" }, { url = "https://files.pythonhosted.org/packages/ad/91/c448ed45efdfdade82348d5e7995e15612754826ea640afc20915119734f/rpds_py-0.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e1157659470aa42a75448b6e943c895be8c70531c43cb78b9ba990778955582", size = 384711, upload-time = "2025-07-01T15:54:18.101Z" }, @@ -2136,29 +2126,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/d9/3f0f105420fecd18551b678c9a6ce60bd23986098b252a56d35781b3e7e9/rpds_py-0.26.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c1851f429b822831bd2edcbe0cfd12ee9ea77868f8d3daf267b189371671c80e", size = 554886, upload-time = "2025-07-01T15:55:52.541Z" }, { url = "https://files.pythonhosted.org/packages/6b/c5/347c056a90dc8dd9bc240a08c527315008e1b5042e7a4cf4ac027be9d38a/rpds_py-0.26.0-cp314-cp314t-win32.whl", hash = "sha256:7bdb17009696214c3b66bb3590c6d62e14ac5935e53e929bcdbc5a495987a84f", size = 219027, upload-time = "2025-07-01T15:55:53.874Z" }, { url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload-time = "2025-07-01T15:55:55.167Z" }, - { url = "https://files.pythonhosted.org/packages/ef/9a/1f033b0b31253d03d785b0cd905bc127e555ab496ea6b4c7c2e1f951f2fd/rpds_py-0.26.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3c0909c5234543ada2515c05dc08595b08d621ba919629e94427e8e03539c958", size = 373226, upload-time = "2025-07-01T15:56:16.578Z" }, - { url = "https://files.pythonhosted.org/packages/58/29/5f88023fd6aaaa8ca3c4a6357ebb23f6f07da6079093ccf27c99efce87db/rpds_py-0.26.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c1fb0cda2abcc0ac62f64e2ea4b4e64c57dfd6b885e693095460c61bde7bb18e", size = 359230, upload-time = "2025-07-01T15:56:17.978Z" }, - { url = "https://files.pythonhosted.org/packages/6c/6c/13eaebd28b439da6964dde22712b52e53fe2824af0223b8e403249d10405/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d142d2d6cf9b31c12aa4878d82ed3b2324226270b89b676ac62ccd7df52d08", size = 382363, upload-time = "2025-07-01T15:56:19.977Z" }, - { url = "https://files.pythonhosted.org/packages/55/fc/3bb9c486b06da19448646f96147796de23c5811ef77cbfc26f17307b6a9d/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a547e21c5610b7e9093d870be50682a6a6cf180d6da0f42c47c306073bfdbbf6", size = 397146, upload-time = "2025-07-01T15:56:21.39Z" }, - { url = 
"https://files.pythonhosted.org/packages/15/18/9d1b79eb4d18e64ba8bba9e7dec6f9d6920b639f22f07ee9368ca35d4673/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35e9a70a0f335371275cdcd08bc5b8051ac494dd58bff3bbfb421038220dc871", size = 514804, upload-time = "2025-07-01T15:56:22.78Z" }, - { url = "https://files.pythonhosted.org/packages/4f/5a/175ad7191bdbcd28785204621b225ad70e85cdfd1e09cc414cb554633b21/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0dfa6115c6def37905344d56fb54c03afc49104e2ca473d5dedec0f6606913b4", size = 402820, upload-time = "2025-07-01T15:56:24.584Z" }, - { url = "https://files.pythonhosted.org/packages/11/45/6a67ecf6d61c4d4aff4bc056e864eec4b2447787e11d1c2c9a0242c6e92a/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:313cfcd6af1a55a286a3c9a25f64af6d0e46cf60bc5798f1db152d97a216ff6f", size = 384567, upload-time = "2025-07-01T15:56:26.064Z" }, - { url = "https://files.pythonhosted.org/packages/a1/ba/16589da828732b46454c61858950a78fe4c931ea4bf95f17432ffe64b241/rpds_py-0.26.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f7bf2496fa563c046d05e4d232d7b7fd61346e2402052064b773e5c378bf6f73", size = 416520, upload-time = "2025-07-01T15:56:27.608Z" }, - { url = "https://files.pythonhosted.org/packages/81/4b/00092999fc7c0c266045e984d56b7314734cc400a6c6dc4d61a35f135a9d/rpds_py-0.26.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:aa81873e2c8c5aa616ab8e017a481a96742fdf9313c40f14338ca7dbf50cb55f", size = 559362, upload-time = "2025-07-01T15:56:29.078Z" }, - { url = "https://files.pythonhosted.org/packages/96/0c/43737053cde1f93ac4945157f7be1428724ab943e2132a0d235a7e161d4e/rpds_py-0.26.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:68ffcf982715f5b5b7686bdd349ff75d422e8f22551000c24b30eaa1b7f7ae84", size = 588113, upload-time = "2025-07-01T15:56:30.485Z" }, - { url = "https://files.pythonhosted.org/packages/46/46/8e38f6161466e60a997ed7e9951ae5de131dedc3cf778ad35994b4af823d/rpds_py-0.26.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:6188de70e190847bb6db3dc3981cbadff87d27d6fe9b4f0e18726d55795cee9b", size = 555429, upload-time = "2025-07-01T15:56:31.956Z" }, - { url = "https://files.pythonhosted.org/packages/2c/ac/65da605e9f1dd643ebe615d5bbd11b6efa1d69644fc4bf623ea5ae385a82/rpds_py-0.26.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1c962145c7473723df9722ba4c058de12eb5ebedcb4e27e7d902920aa3831ee8", size = 231950, upload-time = "2025-07-01T15:56:33.337Z" }, - { url = "https://files.pythonhosted.org/packages/51/f2/b5c85b758a00c513bb0389f8fc8e61eb5423050c91c958cdd21843faa3e6/rpds_py-0.26.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f61a9326f80ca59214d1cceb0a09bb2ece5b2563d4e0cd37bfd5515c28510674", size = 373505, upload-time = "2025-07-01T15:56:34.716Z" }, - { url = "https://files.pythonhosted.org/packages/23/e0/25db45e391251118e915e541995bb5f5ac5691a3b98fb233020ba53afc9b/rpds_py-0.26.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:183f857a53bcf4b1b42ef0f57ca553ab56bdd170e49d8091e96c51c3d69ca696", size = 359468, upload-time = "2025-07-01T15:56:36.219Z" }, - { url = "https://files.pythonhosted.org/packages/0b/73/dd5ee6075bb6491be3a646b301dfd814f9486d924137a5098e61f0487e16/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:941c1cfdf4799d623cf3aa1d326a6b4fdb7a5799ee2687f3516738216d2262fb", size = 382680, 
upload-time = "2025-07-01T15:56:37.644Z" }, - { url = "https://files.pythonhosted.org/packages/2f/10/84b522ff58763a5c443f5bcedc1820240e454ce4e620e88520f04589e2ea/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72a8d9564a717ee291f554eeb4bfeafe2309d5ec0aa6c475170bdab0f9ee8e88", size = 397035, upload-time = "2025-07-01T15:56:39.241Z" }, - { url = "https://files.pythonhosted.org/packages/06/ea/8667604229a10a520fcbf78b30ccc278977dcc0627beb7ea2c96b3becef0/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:511d15193cbe013619dd05414c35a7dedf2088fcee93c6bbb7c77859765bd4e8", size = 514922, upload-time = "2025-07-01T15:56:40.645Z" }, - { url = "https://files.pythonhosted.org/packages/24/e6/9ed5b625c0661c4882fc8cdf302bf8e96c73c40de99c31e0b95ed37d508c/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aea1f9741b603a8d8fedb0ed5502c2bc0accbc51f43e2ad1337fe7259c2b77a5", size = 402822, upload-time = "2025-07-01T15:56:42.137Z" }, - { url = "https://files.pythonhosted.org/packages/8a/58/212c7b6fd51946047fb45d3733da27e2fa8f7384a13457c874186af691b1/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4019a9d473c708cf2f16415688ef0b4639e07abaa569d72f74745bbeffafa2c7", size = 384336, upload-time = "2025-07-01T15:56:44.239Z" }, - { url = "https://files.pythonhosted.org/packages/aa/f5/a40ba78748ae8ebf4934d4b88e77b98497378bc2c24ba55ebe87a4e87057/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:093d63b4b0f52d98ebae33b8c50900d3d67e0666094b1be7a12fffd7f65de74b", size = 416871, upload-time = "2025-07-01T15:56:46.284Z" }, - { url = "https://files.pythonhosted.org/packages/d5/a6/33b1fc0c9f7dcfcfc4a4353daa6308b3ece22496ceece348b3e7a7559a09/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:2abe21d8ba64cded53a2a677e149ceb76dcf44284202d737178afe7ba540c1eb", size = 559439, upload-time = "2025-07-01T15:56:48.549Z" }, - { url = "https://files.pythonhosted.org/packages/71/2d/ceb3f9c12f8cfa56d34995097f6cd99da1325642c60d1b6680dd9df03ed8/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:4feb7511c29f8442cbbc28149a92093d32e815a28aa2c50d333826ad2a20fdf0", size = 588380, upload-time = "2025-07-01T15:56:50.086Z" }, - { url = "https://files.pythonhosted.org/packages/c8/ed/9de62c2150ca8e2e5858acf3f4f4d0d180a38feef9fdab4078bea63d8dba/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e99685fc95d386da368013e7fb4269dd39c30d99f812a8372d62f244f662709c", size = 555334, upload-time = "2025-07-01T15:56:51.703Z" }, +] + +[[package]] +name = "rsa" +version = "4.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, ] [[package]] @@ -2204,6 +2183,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "smart-open" +version = "7.3.0.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/2b/5e7234c68ed5bc872ad6ae77b8a421c2ed70dcb1190b44dc1abdeed5e347/smart_open-7.3.0.post1.tar.gz", hash = "sha256:ce6a3d9bc1afbf6234ad13c010b77f8cd36d24636811e3c52c3b5160f5214d1e", size = 51557, upload-time = "2025-07-03T10:06:31.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl", hash = "sha256:c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4", size = 61946, upload-time = "2025-07-03T10:06:29.599Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -2248,45 +2239,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, ] -[[package]] -name = "tomli" -version = "2.2.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175, upload-time = "2024-11-27T22:38:36.873Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077, upload-time = "2024-11-27T22:37:54.956Z" }, - { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429, upload-time = "2024-11-27T22:37:56.698Z" }, - { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067, upload-time = "2024-11-27T22:37:57.63Z" }, - { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030, upload-time = "2024-11-27T22:37:59.344Z" }, - { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898, upload-time = "2024-11-27T22:38:00.429Z" }, - { url = 
"https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894, upload-time = "2024-11-27T22:38:02.094Z" }, - { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319, upload-time = "2024-11-27T22:38:03.206Z" }, - { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273, upload-time = "2024-11-27T22:38:04.217Z" }, - { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310, upload-time = "2024-11-27T22:38:05.908Z" }, - { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309, upload-time = "2024-11-27T22:38:06.812Z" }, - { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762, upload-time = "2024-11-27T22:38:07.731Z" }, - { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453, upload-time = "2024-11-27T22:38:09.384Z" }, - { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486, upload-time = "2024-11-27T22:38:10.329Z" }, - { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349, upload-time = "2024-11-27T22:38:11.443Z" }, - { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159, upload-time = "2024-11-27T22:38:13.099Z" }, - { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243, upload-time = "2024-11-27T22:38:14.766Z" }, - { url = 
"https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645, upload-time = "2024-11-27T22:38:15.843Z" }, - { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584, upload-time = "2024-11-27T22:38:17.645Z" }, - { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875, upload-time = "2024-11-27T22:38:19.159Z" }, - { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418, upload-time = "2024-11-27T22:38:20.064Z" }, - { url = "https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708, upload-time = "2024-11-27T22:38:21.659Z" }, - { url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582, upload-time = "2024-11-27T22:38:22.693Z" }, - { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543, upload-time = "2024-11-27T22:38:24.367Z" }, - { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691, upload-time = "2024-11-27T22:38:26.081Z" }, - { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170, upload-time = "2024-11-27T22:38:27.921Z" }, - { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530, upload-time = "2024-11-27T22:38:29.591Z" }, - { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666, upload-time = "2024-11-27T22:38:30.639Z" }, - { url = 
"https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954, upload-time = "2024-11-27T22:38:31.702Z" }, - { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724, upload-time = "2024-11-27T22:38:32.837Z" }, - { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383, upload-time = "2024-11-27T22:38:34.455Z" }, - { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, -] - [[package]] name = "tornado" version = "6.5.1" @@ -2366,6 +2318,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680, upload-time = "2025-04-10T15:23:37.377Z" }, ] +[[package]] +name = "virtualenv" +version = "20.33.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "distlib" }, + { name = "filelock" }, + { name = "platformdirs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/db/2e/8a70dcbe8bf15213a08f9b0325ede04faca5d362922ae0d62ef0fa4b069d/virtualenv-20.33.0.tar.gz", hash = "sha256:47e0c0d2ef1801fce721708ccdf2a28b9403fa2307c3268aebd03225976f61d2", size = 6082069, upload-time = "2025-08-03T08:09:19.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/87/b22cf40cdf7e2b2bf83f38a94d2c90c5ad6c304896e5a12d0c08a602eb59/virtualenv-20.33.0-py3-none-any.whl", hash = "sha256:106b6baa8ab1b526d5a9b71165c85c456fbd49b16976c88e2bc9352ee3bc5d3f", size = 6060205, upload-time = "2025-08-03T08:09:16.674Z" }, +] + [[package]] name = "wcwidth" version = "0.2.13" @@ -2375,34 +2341,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166, upload-time = "2024-01-06T02:10:55.763Z" }, ] +[[package]] +name = "widgetsnbextension" +version = "4.0.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/53/2e0253c5efd69c9656b1843892052a31c36d37ad42812b5da45c62191f7e/widgetsnbextension-4.0.14.tar.gz", hash = "sha256:a3629b04e3edb893212df862038c7232f62973373869db5084aed739b437b5af", size = 1097428, upload-time = "2025-04-10T13:01:25.628Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl", hash = "sha256:4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575", size = 2196503, upload-time = "2025-04-10T13:01:23.086Z" }, +] + [[package]] name = "wrapt" version = "1.17.2" source = { registry 
= "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/c3/fc/e91cc220803d7bc4db93fb02facd8461c37364151b8494762cc88b0fbcef/wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3", size = 55531, upload-time = "2025-01-14T10:35:45.465Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/d1/1daec934997e8b160040c78d7b31789f19b122110a75eca3d4e8da0049e1/wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984", size = 53307, upload-time = "2025-01-14T10:33:13.616Z" }, - { url = "https://files.pythonhosted.org/packages/1b/7b/13369d42651b809389c1a7153baa01d9700430576c81a2f5c5e460df0ed9/wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22", size = 38486, upload-time = "2025-01-14T10:33:15.947Z" }, - { url = "https://files.pythonhosted.org/packages/62/bf/e0105016f907c30b4bd9e377867c48c34dc9c6c0c104556c9c9126bd89ed/wrapt-1.17.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:80dd7db6a7cb57ffbc279c4394246414ec99537ae81ffd702443335a61dbf3a7", size = 38777, upload-time = "2025-01-14T10:33:17.462Z" }, - { url = "https://files.pythonhosted.org/packages/27/70/0f6e0679845cbf8b165e027d43402a55494779295c4b08414097b258ac87/wrapt-1.17.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a6e821770cf99cc586d33833b2ff32faebdbe886bd6322395606cf55153246c", size = 83314, upload-time = "2025-01-14T10:33:21.282Z" }, - { url = "https://files.pythonhosted.org/packages/0f/77/0576d841bf84af8579124a93d216f55d6f74374e4445264cb378a6ed33eb/wrapt-1.17.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b60fb58b90c6d63779cb0c0c54eeb38941bae3ecf7a73c764c52c88c2dcb9d72", size = 74947, upload-time = "2025-01-14T10:33:24.414Z" }, - { url = "https://files.pythonhosted.org/packages/90/ec/00759565518f268ed707dcc40f7eeec38637d46b098a1f5143bff488fe97/wrapt-1.17.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b870b5df5b71d8c3359d21be8f0d6c485fa0ebdb6477dda51a1ea54a9b558061", size = 82778, upload-time = "2025-01-14T10:33:26.152Z" }, - { url = "https://files.pythonhosted.org/packages/f8/5a/7cffd26b1c607b0b0c8a9ca9d75757ad7620c9c0a9b4a25d3f8a1480fafc/wrapt-1.17.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4011d137b9955791f9084749cba9a367c68d50ab8d11d64c50ba1688c9b457f2", size = 81716, upload-time = "2025-01-14T10:33:27.372Z" }, - { url = "https://files.pythonhosted.org/packages/7e/09/dccf68fa98e862df7e6a60a61d43d644b7d095a5fc36dbb591bbd4a1c7b2/wrapt-1.17.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:1473400e5b2733e58b396a04eb7f35f541e1fb976d0c0724d0223dd607e0f74c", size = 74548, upload-time = "2025-01-14T10:33:28.52Z" }, - { url = "https://files.pythonhosted.org/packages/b7/8e/067021fa3c8814952c5e228d916963c1115b983e21393289de15128e867e/wrapt-1.17.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3cedbfa9c940fdad3e6e941db7138e26ce8aad38ab5fe9dcfadfed9db7a54e62", size = 81334, upload-time = "2025-01-14T10:33:29.643Z" }, - { url = "https://files.pythonhosted.org/packages/4b/0d/9d4b5219ae4393f718699ca1c05f5ebc0c40d076f7e65fd48f5f693294fb/wrapt-1.17.2-cp310-cp310-win32.whl", hash = "sha256:582530701bff1dec6779efa00c516496968edd851fba224fbd86e46cc6b73563", size = 36427, upload-time = "2025-01-14T10:33:30.832Z" }, - { url = 
"https://files.pythonhosted.org/packages/72/6a/c5a83e8f61aec1e1aeef939807602fb880e5872371e95df2137142f5c58e/wrapt-1.17.2-cp310-cp310-win_amd64.whl", hash = "sha256:58705da316756681ad3c9c73fd15499aa4d8c69f9fd38dc8a35e06c12468582f", size = 38774, upload-time = "2025-01-14T10:33:32.897Z" }, - { url = "https://files.pythonhosted.org/packages/cd/f7/a2aab2cbc7a665efab072344a8949a71081eed1d2f451f7f7d2b966594a2/wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58", size = 53308, upload-time = "2025-01-14T10:33:33.992Z" }, - { url = "https://files.pythonhosted.org/packages/50/ff/149aba8365fdacef52b31a258c4dc1c57c79759c335eff0b3316a2664a64/wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda", size = 38488, upload-time = "2025-01-14T10:33:35.264Z" }, - { url = "https://files.pythonhosted.org/packages/65/46/5a917ce85b5c3b490d35c02bf71aedaa9f2f63f2d15d9949cc4ba56e8ba9/wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438", size = 38776, upload-time = "2025-01-14T10:33:38.28Z" }, - { url = "https://files.pythonhosted.org/packages/ca/74/336c918d2915a4943501c77566db41d1bd6e9f4dbc317f356b9a244dfe83/wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a", size = 83776, upload-time = "2025-01-14T10:33:40.678Z" }, - { url = "https://files.pythonhosted.org/packages/09/99/c0c844a5ccde0fe5761d4305485297f91d67cf2a1a824c5f282e661ec7ff/wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000", size = 75420, upload-time = "2025-01-14T10:33:41.868Z" }, - { url = "https://files.pythonhosted.org/packages/b4/b0/9fc566b0fe08b282c850063591a756057c3247b2362b9286429ec5bf1721/wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6", size = 83199, upload-time = "2025-01-14T10:33:43.598Z" }, - { url = "https://files.pythonhosted.org/packages/9d/4b/71996e62d543b0a0bd95dda485219856def3347e3e9380cc0d6cf10cfb2f/wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b", size = 82307, upload-time = "2025-01-14T10:33:48.499Z" }, - { url = "https://files.pythonhosted.org/packages/39/35/0282c0d8789c0dc9bcc738911776c762a701f95cfe113fb8f0b40e45c2b9/wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662", size = 75025, upload-time = "2025-01-14T10:33:51.191Z" }, - { url = "https://files.pythonhosted.org/packages/4f/6d/90c9fd2c3c6fee181feecb620d95105370198b6b98a0770cba090441a828/wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72", size = 81879, upload-time = "2025-01-14T10:33:52.328Z" }, - { url = "https://files.pythonhosted.org/packages/8f/fa/9fb6e594f2ce03ef03eddbdb5f4f90acb1452221a5351116c7c4708ac865/wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317", size = 36419, upload-time = "2025-01-14T10:33:53.551Z" }, - { url = 
"https://files.pythonhosted.org/packages/47/f8/fb1773491a253cbc123c5d5dc15c86041f746ed30416535f2a8df1f4a392/wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3", size = 38773, upload-time = "2025-01-14T10:33:56.323Z" }, { url = "https://files.pythonhosted.org/packages/a1/bd/ab55f849fd1f9a58ed7ea47f5559ff09741b25f00c191231f9f059c83949/wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925", size = 53799, upload-time = "2025-01-14T10:33:57.4Z" }, { url = "https://files.pythonhosted.org/packages/53/18/75ddc64c3f63988f5a1d7e10fb204ffe5762bc663f8023f18ecaf31a332e/wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392", size = 38821, upload-time = "2025-01-14T10:33:59.334Z" }, { url = "https://files.pythonhosted.org/packages/48/2a/97928387d6ed1c1ebbfd4efc4133a0633546bec8481a2dd5ec961313a1c7/wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40", size = 38919, upload-time = "2025-01-14T10:34:04.093Z" }, @@ -2445,36 +2398,6 @@ version = "3.5.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241, upload-time = "2024-08-17T09:20:38.972Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/8a/0e9feca390d512d293afd844d31670e25608c4a901e10202aa98785eab09/xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212", size = 31970, upload-time = "2024-08-17T09:17:35.675Z" }, - { url = "https://files.pythonhosted.org/packages/16/e6/be5aa49580cd064a18200ab78e29b88b1127e1a8c7955eb8ecf81f2626eb/xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520", size = 30801, upload-time = "2024-08-17T09:17:37.353Z" }, - { url = "https://files.pythonhosted.org/packages/20/ee/b8a99ebbc6d1113b3a3f09e747fa318c3cde5b04bd9c197688fadf0eeae8/xxhash-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5d3e570ef46adaf93fc81b44aca6002b5a4d8ca11bd0580c07eac537f36680", size = 220927, upload-time = "2024-08-17T09:17:38.835Z" }, - { url = "https://files.pythonhosted.org/packages/58/62/15d10582ef159283a5c2b47f6d799fc3303fe3911d5bb0bcc820e1ef7ff4/xxhash-3.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7cb29a034301e2982df8b1fe6328a84f4b676106a13e9135a0d7e0c3e9f806da", size = 200360, upload-time = "2024-08-17T09:17:40.851Z" }, - { url = "https://files.pythonhosted.org/packages/23/41/61202663ea9b1bd8e53673b8ec9e2619989353dba8cfb68e59a9cbd9ffe3/xxhash-3.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d0d307d27099bb0cbeea7260eb39ed4fdb99c5542e21e94bb6fd29e49c57a23", size = 428528, upload-time = "2024-08-17T09:17:42.545Z" }, - { url = "https://files.pythonhosted.org/packages/f2/07/d9a3059f702dec5b3b703737afb6dda32f304f6e9da181a229dafd052c29/xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0342aafd421795d740e514bc9858ebddfc705a75a8c5046ac56d85fe97bf196", size = 194149, upload-time = "2024-08-17T09:17:44.361Z" }, - { url 
= "https://files.pythonhosted.org/packages/eb/58/27caadf78226ecf1d62dbd0c01d152ed381c14c1ee4ad01f0d460fc40eac/xxhash-3.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3dbbd9892c5ebffeca1ed620cf0ade13eb55a0d8c84e0751a6653adc6ac40d0c", size = 207703, upload-time = "2024-08-17T09:17:46.656Z" }, - { url = "https://files.pythonhosted.org/packages/b1/08/32d558ce23e1e068453c39aed7b3c1cdc690c177873ec0ca3a90d5808765/xxhash-3.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4cc2d67fdb4d057730c75a64c5923abfa17775ae234a71b0200346bfb0a7f482", size = 216255, upload-time = "2024-08-17T09:17:48.031Z" }, - { url = "https://files.pythonhosted.org/packages/3f/d4/2b971e2d2b0a61045f842b622ef11e94096cf1f12cd448b6fd426e80e0e2/xxhash-3.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ec28adb204b759306a3d64358a5e5c07d7b1dd0ccbce04aa76cb9377b7b70296", size = 202744, upload-time = "2024-08-17T09:17:50.045Z" }, - { url = "https://files.pythonhosted.org/packages/19/ae/6a6438864a8c4c39915d7b65effd85392ebe22710412902487e51769146d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:1328f6d8cca2b86acb14104e381225a3d7b42c92c4b86ceae814e5c400dbb415", size = 210115, upload-time = "2024-08-17T09:17:51.834Z" }, - { url = "https://files.pythonhosted.org/packages/48/7d/b3c27c27d1fc868094d02fe4498ccce8cec9fcc591825c01d6bcb0b4fc49/xxhash-3.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8d47ebd9f5d9607fd039c1fbf4994e3b071ea23eff42f4ecef246ab2b7334198", size = 414247, upload-time = "2024-08-17T09:17:53.094Z" }, - { url = "https://files.pythonhosted.org/packages/a1/05/918f9e7d2fbbd334b829997045d341d6239b563c44e683b9a7ef8fe50f5d/xxhash-3.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b96d559e0fcddd3343c510a0fe2b127fbff16bf346dd76280b82292567523442", size = 191419, upload-time = "2024-08-17T09:17:54.906Z" }, - { url = "https://files.pythonhosted.org/packages/08/29/dfe393805b2f86bfc47c290b275f0b7c189dc2f4e136fd4754f32eb18a8d/xxhash-3.5.0-cp310-cp310-win32.whl", hash = "sha256:61c722ed8d49ac9bc26c7071eeaa1f6ff24053d553146d5df031802deffd03da", size = 30114, upload-time = "2024-08-17T09:17:56.566Z" }, - { url = "https://files.pythonhosted.org/packages/7b/d7/aa0b22c4ebb7c3ccb993d4c565132abc641cd11164f8952d89eb6a501909/xxhash-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:9bed5144c6923cc902cd14bb8963f2d5e034def4486ab0bbe1f58f03f042f9a9", size = 30003, upload-time = "2024-08-17T09:17:57.596Z" }, - { url = "https://files.pythonhosted.org/packages/69/12/f969b81541ee91b55f1ce469d7ab55079593c80d04fd01691b550e535000/xxhash-3.5.0-cp310-cp310-win_arm64.whl", hash = "sha256:893074d651cf25c1cc14e3bea4fceefd67f2921b1bb8e40fcfeba56820de80c6", size = 26773, upload-time = "2024-08-17T09:17:59.169Z" }, - { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969, upload-time = "2024-08-17T09:18:00.852Z" }, - { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800, upload-time = "2024-08-17T09:18:01.863Z" }, - { url = 
"https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566, upload-time = "2024-08-17T09:18:03.461Z" }, - { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214, upload-time = "2024-08-17T09:18:05.616Z" }, - { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433, upload-time = "2024-08-17T09:18:06.957Z" }, - { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822, upload-time = "2024-08-17T09:18:08.331Z" }, - { url = "https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538, upload-time = "2024-08-17T09:18:10.332Z" }, - { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953, upload-time = "2024-08-17T09:18:11.707Z" }, - { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594, upload-time = "2024-08-17T09:18:13.799Z" }, - { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971, upload-time = "2024-08-17T09:18:15.824Z" }, - { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050, upload-time = "2024-08-17T09:18:17.142Z" }, - { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216, upload-time = "2024-08-17T09:18:18.779Z" }, - { url = "https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120, upload-time = "2024-08-17T09:18:20.009Z" }, - { url = 
"https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003, upload-time = "2024-08-17T09:18:21.052Z" }, - { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777, upload-time = "2024-08-17T09:18:22.809Z" }, { url = "https://files.pythonhosted.org/packages/07/0e/1bfce2502c57d7e2e787600b31c83535af83746885aa1a5f153d8c8059d6/xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00", size = 31969, upload-time = "2024-08-17T09:18:24.025Z" }, { url = "https://files.pythonhosted.org/packages/3f/d6/8ca450d6fe5b71ce521b4e5db69622383d039e2b253e9b2f24f93265b52c/xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9", size = 30787, upload-time = "2024-08-17T09:18:25.318Z" }, { url = "https://files.pythonhosted.org/packages/5b/84/de7c89bc6ef63d750159086a6ada6416cc4349eab23f76ab870407178b93/xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84", size = 220959, upload-time = "2024-08-17T09:18:26.518Z" }, @@ -2505,9 +2428,78 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/6d/c61e0668943a034abc3a569cdc5aeae37d686d9da7e39cf2ed621d533e36/xxhash-3.5.0-cp313-cp313-win32.whl", hash = "sha256:53a068fe70301ec30d868ece566ac90d873e3bb059cf83c32e76012c889b8637", size = 30172, upload-time = "2024-08-17T09:19:04.355Z" }, { url = "https://files.pythonhosted.org/packages/96/14/8416dce965f35e3d24722cdf79361ae154fa23e2ab730e5323aa98d7919e/xxhash-3.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:80babcc30e7a1a484eab952d76a4f4673ff601f54d5142c26826502740e70b43", size = 30041, upload-time = "2024-08-17T09:19:05.435Z" }, { url = "https://files.pythonhosted.org/packages/27/ee/518b72faa2073f5aa8e3262408d284892cb79cf2754ba0c3a5870645ef73/xxhash-3.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:4811336f1ce11cac89dcbd18f3a25c527c16311709a89313c3acaf771def2d4b", size = 26801, upload-time = "2024-08-17T09:19:06.547Z" }, - { url = "https://files.pythonhosted.org/packages/ab/9a/233606bada5bd6f50b2b72c45de3d9868ad551e83893d2ac86dc7bb8553a/xxhash-3.5.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:2014c5b3ff15e64feecb6b713af12093f75b7926049e26a580e94dcad3c73d8c", size = 29732, upload-time = "2024-08-17T09:20:11.175Z" }, - { url = "https://files.pythonhosted.org/packages/0c/67/f75276ca39e2c6604e3bee6c84e9db8a56a4973fde9bf35989787cf6e8aa/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fab81ef75003eda96239a23eda4e4543cedc22e34c373edcaf744e721a163986", size = 36214, upload-time = "2024-08-17T09:20:12.335Z" }, - { url = "https://files.pythonhosted.org/packages/0f/f8/f6c61fd794229cc3848d144f73754a0c107854372d7261419dcbbd286299/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e2febf914ace002132aa09169cc572e0d8959d0f305f93d5828c4836f9bc5a6", size = 32020, upload-time = "2024-08-17T09:20:13.537Z" }, - { url = 
"https://files.pythonhosted.org/packages/79/d3/c029c99801526f859e6b38d34ab87c08993bf3dcea34b11275775001638a/xxhash-3.5.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5d3a10609c51da2a1c0ea0293fc3968ca0a18bd73838455b5bca3069d7f8e32b", size = 40515, upload-time = "2024-08-17T09:20:14.669Z" }, - { url = "https://files.pythonhosted.org/packages/62/e3/bef7b82c1997579c94de9ac5ea7626d01ae5858aa22bf4fcb38bf220cb3e/xxhash-3.5.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5a74f23335b9689b66eb6dbe2a931a88fcd7a4c2cc4b1cb0edba8ce381c7a1da", size = 30064, upload-time = "2024-08-17T09:20:15.925Z" }, +] + +[[package]] +name = "yarl" +version = "1.20.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5f/9a/cb7fad7d73c69f296eda6815e4a2c7ed53fc70c2f136479a91c8e5fbdb6d/yarl-1.20.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdcc4cd244e58593a4379fe60fdee5ac0331f8eb70320a24d591a3be197b94a9", size = 133667, upload-time = "2025-06-10T00:43:44.369Z" }, + { url = "https://files.pythonhosted.org/packages/67/38/688577a1cb1e656e3971fb66a3492501c5a5df56d99722e57c98249e5b8a/yarl-1.20.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b29a2c385a5f5b9c7d9347e5812b6f7ab267193c62d282a540b4fc528c8a9d2a", size = 91025, upload-time = "2025-06-10T00:43:46.295Z" }, + { url = "https://files.pythonhosted.org/packages/50/ec/72991ae51febeb11a42813fc259f0d4c8e0507f2b74b5514618d8b640365/yarl-1.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1112ae8154186dfe2de4732197f59c05a83dc814849a5ced892b708033f40dc2", size = 89709, upload-time = "2025-06-10T00:43:48.22Z" }, + { url = "https://files.pythonhosted.org/packages/99/da/4d798025490e89426e9f976702e5f9482005c548c579bdae792a4c37769e/yarl-1.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90bbd29c4fe234233f7fa2b9b121fb63c321830e5d05b45153a2ca68f7d310ee", size = 352287, upload-time = "2025-06-10T00:43:49.924Z" }, + { url = "https://files.pythonhosted.org/packages/1a/26/54a15c6a567aac1c61b18aa0f4b8aa2e285a52d547d1be8bf48abe2b3991/yarl-1.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:680e19c7ce3710ac4cd964e90dad99bf9b5029372ba0c7cbfcd55e54d90ea819", size = 345429, upload-time = "2025-06-10T00:43:51.7Z" }, + { url = "https://files.pythonhosted.org/packages/d6/95/9dcf2386cb875b234353b93ec43e40219e14900e046bf6ac118f94b1e353/yarl-1.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a979218c1fdb4246a05efc2cc23859d47c89af463a90b99b7c56094daf25a16", size = 365429, upload-time = "2025-06-10T00:43:53.494Z" }, + { url = "https://files.pythonhosted.org/packages/91/b2/33a8750f6a4bc224242a635f5f2cff6d6ad5ba651f6edcccf721992c21a0/yarl-1.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:255b468adf57b4a7b65d8aad5b5138dce6a0752c139965711bdcb81bc370e1b6", size = 363862, upload-time = "2025-06-10T00:43:55.766Z" }, + { url = 
"https://files.pythonhosted.org/packages/98/28/3ab7acc5b51f4434b181b0cee8f1f4b77a65919700a355fb3617f9488874/yarl-1.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a97d67108e79cfe22e2b430d80d7571ae57d19f17cda8bb967057ca8a7bf5bfd", size = 355616, upload-time = "2025-06-10T00:43:58.056Z" }, + { url = "https://files.pythonhosted.org/packages/36/a3/f666894aa947a371724ec7cd2e5daa78ee8a777b21509b4252dd7bd15e29/yarl-1.20.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8570d998db4ddbfb9a590b185a0a33dbf8aafb831d07a5257b4ec9948df9cb0a", size = 339954, upload-time = "2025-06-10T00:43:59.773Z" }, + { url = "https://files.pythonhosted.org/packages/f1/81/5f466427e09773c04219d3450d7a1256138a010b6c9f0af2d48565e9ad13/yarl-1.20.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:97c75596019baae7c71ccf1d8cc4738bc08134060d0adfcbe5642f778d1dca38", size = 365575, upload-time = "2025-06-10T00:44:02.051Z" }, + { url = "https://files.pythonhosted.org/packages/2e/e3/e4b0ad8403e97e6c9972dd587388940a032f030ebec196ab81a3b8e94d31/yarl-1.20.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:1c48912653e63aef91ff988c5432832692ac5a1d8f0fb8a33091520b5bbe19ef", size = 365061, upload-time = "2025-06-10T00:44:04.196Z" }, + { url = "https://files.pythonhosted.org/packages/ac/99/b8a142e79eb86c926f9f06452eb13ecb1bb5713bd01dc0038faf5452e544/yarl-1.20.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4c3ae28f3ae1563c50f3d37f064ddb1511ecc1d5584e88c6b7c63cf7702a6d5f", size = 364142, upload-time = "2025-06-10T00:44:06.527Z" }, + { url = "https://files.pythonhosted.org/packages/34/f2/08ed34a4a506d82a1a3e5bab99ccd930a040f9b6449e9fd050320e45845c/yarl-1.20.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c5e9642f27036283550f5f57dc6156c51084b458570b9d0d96100c8bebb186a8", size = 381894, upload-time = "2025-06-10T00:44:08.379Z" }, + { url = "https://files.pythonhosted.org/packages/92/f8/9a3fbf0968eac704f681726eff595dce9b49c8a25cd92bf83df209668285/yarl-1.20.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2c26b0c49220d5799f7b22c6838409ee9bc58ee5c95361a4d7831f03cc225b5a", size = 383378, upload-time = "2025-06-10T00:44:10.51Z" }, + { url = "https://files.pythonhosted.org/packages/af/85/9363f77bdfa1e4d690957cd39d192c4cacd1c58965df0470a4905253b54f/yarl-1.20.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564ab3d517e3d01c408c67f2e5247aad4019dcf1969982aba3974b4093279004", size = 374069, upload-time = "2025-06-10T00:44:12.834Z" }, + { url = "https://files.pythonhosted.org/packages/35/99/9918c8739ba271dcd935400cff8b32e3cd319eaf02fcd023d5dcd487a7c8/yarl-1.20.1-cp312-cp312-win32.whl", hash = "sha256:daea0d313868da1cf2fac6b2d3a25c6e3a9e879483244be38c8e6a41f1d876a5", size = 81249, upload-time = "2025-06-10T00:44:14.731Z" }, + { url = "https://files.pythonhosted.org/packages/eb/83/5d9092950565481b413b31a23e75dd3418ff0a277d6e0abf3729d4d1ce25/yarl-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:48ea7d7f9be0487339828a4de0360d7ce0efc06524a48e1810f945c45b813698", size = 86710, upload-time = "2025-06-10T00:44:16.716Z" }, + { url = "https://files.pythonhosted.org/packages/8a/e1/2411b6d7f769a07687acee88a062af5833cf1966b7266f3d8dfb3d3dc7d3/yarl-1.20.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:0b5ff0fbb7c9f1b1b5ab53330acbfc5247893069e7716840c8e7d5bb7355038a", size = 131811, upload-time = "2025-06-10T00:44:18.933Z" }, + { url = 
"https://files.pythonhosted.org/packages/b2/27/584394e1cb76fb771371770eccad35de400e7b434ce3142c2dd27392c968/yarl-1.20.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:14f326acd845c2b2e2eb38fb1346c94f7f3b01a4f5c788f8144f9b630bfff9a3", size = 90078, upload-time = "2025-06-10T00:44:20.635Z" }, + { url = "https://files.pythonhosted.org/packages/bf/9a/3246ae92d4049099f52d9b0fe3486e3b500e29b7ea872d0f152966fc209d/yarl-1.20.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f60e4ad5db23f0b96e49c018596707c3ae89f5d0bd97f0ad3684bcbad899f1e7", size = 88748, upload-time = "2025-06-10T00:44:22.34Z" }, + { url = "https://files.pythonhosted.org/packages/a3/25/35afe384e31115a1a801fbcf84012d7a066d89035befae7c5d4284df1e03/yarl-1.20.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:49bdd1b8e00ce57e68ba51916e4bb04461746e794e7c4d4bbc42ba2f18297691", size = 349595, upload-time = "2025-06-10T00:44:24.314Z" }, + { url = "https://files.pythonhosted.org/packages/28/2d/8aca6cb2cabc8f12efcb82749b9cefecbccfc7b0384e56cd71058ccee433/yarl-1.20.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:66252d780b45189975abfed839616e8fd2dbacbdc262105ad7742c6ae58f3e31", size = 342616, upload-time = "2025-06-10T00:44:26.167Z" }, + { url = "https://files.pythonhosted.org/packages/0b/e9/1312633d16b31acf0098d30440ca855e3492d66623dafb8e25b03d00c3da/yarl-1.20.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59174e7332f5d153d8f7452a102b103e2e74035ad085f404df2e40e663a22b28", size = 361324, upload-time = "2025-06-10T00:44:27.915Z" }, + { url = "https://files.pythonhosted.org/packages/bc/a0/688cc99463f12f7669eec7c8acc71ef56a1521b99eab7cd3abb75af887b0/yarl-1.20.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3968ec7d92a0c0f9ac34d5ecfd03869ec0cab0697c91a45db3fbbd95fe1b653", size = 359676, upload-time = "2025-06-10T00:44:30.041Z" }, + { url = "https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1a4fbb50e14396ba3d375f68bfe02215d8e7bc3ec49da8341fe3157f59d2ff5", size = 352614, upload-time = "2025-06-10T00:44:32.171Z" }, + { url = "https://files.pythonhosted.org/packages/b1/91/31163295e82b8d5485d31d9cf7754d973d41915cadce070491778d9c9825/yarl-1.20.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11a62c839c3a8eac2410e951301309426f368388ff2f33799052787035793b02", size = 336766, upload-time = "2025-06-10T00:44:34.494Z" }, + { url = "https://files.pythonhosted.org/packages/b4/8e/c41a5bc482121f51c083c4c2bcd16b9e01e1cf8729e380273a952513a21f/yarl-1.20.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:041eaa14f73ff5a8986b4388ac6bb43a77f2ea09bf1913df7a35d4646db69e53", size = 364615, upload-time = "2025-06-10T00:44:36.856Z" }, + { url = "https://files.pythonhosted.org/packages/e3/5b/61a3b054238d33d70ea06ebba7e58597891b71c699e247df35cc984ab393/yarl-1.20.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:377fae2fef158e8fd9d60b4c8751387b8d1fb121d3d0b8e9b0be07d1b41e83dc", size = 360982, upload-time = "2025-06-10T00:44:39.141Z" }, + { url = "https://files.pythonhosted.org/packages/df/a3/6a72fb83f8d478cb201d14927bc8040af901811a88e0ff2da7842dd0ed19/yarl-1.20.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1c92f4390e407513f619d49319023664643d3339bd5e5a56a3bebe01bc67ec04", size = 369792, upload-time = 
"2025-06-10T00:44:40.934Z" }, + { url = "https://files.pythonhosted.org/packages/7c/af/4cc3c36dfc7c077f8dedb561eb21f69e1e9f2456b91b593882b0b18c19dc/yarl-1.20.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:d25ddcf954df1754ab0f86bb696af765c5bfaba39b74095f27eececa049ef9a4", size = 382049, upload-time = "2025-06-10T00:44:42.854Z" }, + { url = "https://files.pythonhosted.org/packages/19/3a/e54e2c4752160115183a66dc9ee75a153f81f3ab2ba4bf79c3c53b33de34/yarl-1.20.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:909313577e9619dcff8c31a0ea2aa0a2a828341d92673015456b3ae492e7317b", size = 384774, upload-time = "2025-06-10T00:44:45.275Z" }, + { url = "https://files.pythonhosted.org/packages/9c/20/200ae86dabfca89060ec6447649f219b4cbd94531e425e50d57e5f5ac330/yarl-1.20.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:793fd0580cb9664548c6b83c63b43c477212c0260891ddf86809e1c06c8b08f1", size = 374252, upload-time = "2025-06-10T00:44:47.31Z" }, + { url = "https://files.pythonhosted.org/packages/83/75/11ee332f2f516b3d094e89448da73d557687f7d137d5a0f48c40ff211487/yarl-1.20.1-cp313-cp313-win32.whl", hash = "sha256:468f6e40285de5a5b3c44981ca3a319a4b208ccc07d526b20b12aeedcfa654b7", size = 81198, upload-time = "2025-06-10T00:44:49.164Z" }, + { url = "https://files.pythonhosted.org/packages/ba/ba/39b1ecbf51620b40ab402b0fc817f0ff750f6d92712b44689c2c215be89d/yarl-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:495b4ef2fea40596bfc0affe3837411d6aa3371abcf31aac0ccc4bdd64d4ef5c", size = 86346, upload-time = "2025-06-10T00:44:51.182Z" }, + { url = "https://files.pythonhosted.org/packages/43/c7/669c52519dca4c95153c8ad96dd123c79f354a376346b198f438e56ffeb4/yarl-1.20.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:f60233b98423aab21d249a30eb27c389c14929f47be8430efa7dbd91493a729d", size = 138826, upload-time = "2025-06-10T00:44:52.883Z" }, + { url = "https://files.pythonhosted.org/packages/6a/42/fc0053719b44f6ad04a75d7f05e0e9674d45ef62f2d9ad2c1163e5c05827/yarl-1.20.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:6f3eff4cc3f03d650d8755c6eefc844edde99d641d0dcf4da3ab27141a5f8ddf", size = 93217, upload-time = "2025-06-10T00:44:54.658Z" }, + { url = "https://files.pythonhosted.org/packages/4f/7f/fa59c4c27e2a076bba0d959386e26eba77eb52ea4a0aac48e3515c186b4c/yarl-1.20.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:69ff8439d8ba832d6bed88af2c2b3445977eba9a4588b787b32945871c2444e3", size = 92700, upload-time = "2025-06-10T00:44:56.784Z" }, + { url = "https://files.pythonhosted.org/packages/2f/d4/062b2f48e7c93481e88eff97a6312dca15ea200e959f23e96d8ab898c5b8/yarl-1.20.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cf34efa60eb81dd2645a2e13e00bb98b76c35ab5061a3989c7a70f78c85006d", size = 347644, upload-time = "2025-06-10T00:44:59.071Z" }, + { url = "https://files.pythonhosted.org/packages/89/47/78b7f40d13c8f62b499cc702fdf69e090455518ae544c00a3bf4afc9fc77/yarl-1.20.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8e0fe9364ad0fddab2688ce72cb7a8e61ea42eff3c7caeeb83874a5d479c896c", size = 323452, upload-time = "2025-06-10T00:45:01.605Z" }, + { url = "https://files.pythonhosted.org/packages/eb/2b/490d3b2dc66f52987d4ee0d3090a147ea67732ce6b4d61e362c1846d0d32/yarl-1.20.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f64fbf81878ba914562c672024089e3401974a39767747691c65080a67b18c1", size = 346378, upload-time = "2025-06-10T00:45:03.946Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/ad/775da9c8a94ce925d1537f939a4f17d782efef1f973039d821cbe4bcc211/yarl-1.20.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6342d643bf9a1de97e512e45e4b9560a043347e779a173250824f8b254bd5ce", size = 353261, upload-time = "2025-06-10T00:45:05.992Z" }, + { url = "https://files.pythonhosted.org/packages/4b/23/0ed0922b47a4f5c6eb9065d5ff1e459747226ddce5c6a4c111e728c9f701/yarl-1.20.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56dac5f452ed25eef0f6e3c6a066c6ab68971d96a9fb441791cad0efba6140d3", size = 335987, upload-time = "2025-06-10T00:45:08.227Z" }, + { url = "https://files.pythonhosted.org/packages/3e/49/bc728a7fe7d0e9336e2b78f0958a2d6b288ba89f25a1762407a222bf53c3/yarl-1.20.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7d7f497126d65e2cad8dc5f97d34c27b19199b6414a40cb36b52f41b79014be", size = 329361, upload-time = "2025-06-10T00:45:10.11Z" }, + { url = "https://files.pythonhosted.org/packages/93/8f/b811b9d1f617c83c907e7082a76e2b92b655400e61730cd61a1f67178393/yarl-1.20.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:67e708dfb8e78d8a19169818eeb5c7a80717562de9051bf2413aca8e3696bf16", size = 346460, upload-time = "2025-06-10T00:45:12.055Z" }, + { url = "https://files.pythonhosted.org/packages/70/fd/af94f04f275f95da2c3b8b5e1d49e3e79f1ed8b6ceb0f1664cbd902773ff/yarl-1.20.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:595c07bc79af2494365cc96ddeb772f76272364ef7c80fb892ef9d0649586513", size = 334486, upload-time = "2025-06-10T00:45:13.995Z" }, + { url = "https://files.pythonhosted.org/packages/84/65/04c62e82704e7dd0a9b3f61dbaa8447f8507655fd16c51da0637b39b2910/yarl-1.20.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7bdd2f80f4a7df852ab9ab49484a4dee8030023aa536df41f2d922fd57bf023f", size = 342219, upload-time = "2025-06-10T00:45:16.479Z" }, + { url = "https://files.pythonhosted.org/packages/91/95/459ca62eb958381b342d94ab9a4b6aec1ddec1f7057c487e926f03c06d30/yarl-1.20.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c03bfebc4ae8d862f853a9757199677ab74ec25424d0ebd68a0027e9c639a390", size = 350693, upload-time = "2025-06-10T00:45:18.399Z" }, + { url = "https://files.pythonhosted.org/packages/a6/00/d393e82dd955ad20617abc546a8f1aee40534d599ff555ea053d0ec9bf03/yarl-1.20.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:344d1103e9c1523f32a5ed704d576172d2cabed3122ea90b1d4e11fe17c66458", size = 355803, upload-time = "2025-06-10T00:45:20.677Z" }, + { url = "https://files.pythonhosted.org/packages/9e/ed/c5fb04869b99b717985e244fd93029c7a8e8febdfcffa06093e32d7d44e7/yarl-1.20.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:88cab98aa4e13e1ade8c141daeedd300a4603b7132819c484841bb7af3edce9e", size = 341709, upload-time = "2025-06-10T00:45:23.221Z" }, + { url = "https://files.pythonhosted.org/packages/24/fd/725b8e73ac2a50e78a4534ac43c6addf5c1c2d65380dd48a9169cc6739a9/yarl-1.20.1-cp313-cp313t-win32.whl", hash = "sha256:b121ff6a7cbd4abc28985b6028235491941b9fe8fe226e6fdc539c977ea1739d", size = 86591, upload-time = "2025-06-10T00:45:25.793Z" }, + { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, ] From 695d1d611e1a5f34f2ed1a96434690ce07dc6120 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 5 Aug 2025 08:11:18 +0000 Subject: [PATCH 176/224] feat: support execution engine in pipeline run --- .../01_introduction_to_orcapod.ipynb | 136 +----------------- src/orcapod/pipeline/graph.py | 16 ++- 2 files changed, 13 insertions(+), 139 deletions(-) diff --git a/notebooks/tutorials/01_introduction_to_orcapod.ipynb b/notebooks/tutorials/01_introduction_to_orcapod.ipynb index 9feecdb..434dd49 100644 --- a/notebooks/tutorials/01_introduction_to_orcapod.ipynb +++ b/notebooks/tutorials/01_introduction_to_orcapod.ipynb @@ -1834,39 +1834,7 @@ "execution_count": 59, "id": "c77154ec", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬─────┐\n", - "│ id ┆ sum │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═════╡\n", - "│ 0 ┆ 11 │\n", - "│ 1 ┆ 22 │\n", - "│ 2 ┆ 33 │\n", - "│ 3 ┆ 44 │\n", - "│ 4 ┆ 55 │\n", - "└─────┴─────┘" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.add_numbers.df" ] @@ -1930,39 +1898,7 @@ "execution_count": 62, "id": "8f146ae7", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬─────┐\n", - "│ id ┆ sum │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═════╡\n", - "│ 0 ┆ 11 │\n", - "│ 1 ┆ 22 │\n", - "│ 2 ┆ 33 │\n", - "│ 3 ┆ 44 │\n", - "│ 4 ┆ 55 │\n", - "└─────┴─────┘" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline2.my_summation.df" ] @@ -1972,39 +1908,7 @@ "execution_count": 63, "id": "8fd7bf4e", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬─────────┐\n", - "│ id ┆ product │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ i64 │\n", - "╞═════╪═════════╡\n", - "│ 0 ┆ 10 │\n", - "│ 1 ┆ 40 │\n", - "│ 2 ┆ 90 │\n", - "│ 3 ┆ 160 │\n", - "│ 4 ┆ 250 │\n", - "└─────┴─────────┘" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline2.my_product.df" ] @@ -2014,39 +1918,7 @@ "execution_count": 64, "id": "2a918db1", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "shape: (5, 2)
" - ], - "text/plain": [ - "shape: (5, 2)\n", - "┌─────┬───────────────────────┐\n", - "│ id ┆ result │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═════╪═══════════════════════╡\n", - "│ 0 ┆ Sum: 11, Product: 10 │\n", - "│ 1 ┆ Sum: 22, Product: 40 │\n", - "│ 2 ┆ Sum: 33, Product: 90 │\n", - "│ 3 ┆ Sum: 44, Product: 160 │\n", - "│ 4 ┆ Sum: 55, Product: 250 │\n", - "└─────┴───────────────────────┘" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline2.my_final_result.df" ] diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index f18e6f6..6ada8a0 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -45,7 +45,7 @@ def __init__( self.nodes = {} self.auto_compile = auto_compile self._dirty = False - self._ordered_nodes = [] # Track order of invocations + self._topological_order = [] # Track order of invocations def __exit__(self, exc_type=None, exc_value=None, traceback=None): """ @@ -82,20 +82,22 @@ def compile(self) -> None: invocation_to_stream_lut = {} G = self.generate_graph() + topological_order = [] for invocation in nx.topological_sort(G): input_streams = [ invocation_to_stream_lut[parent] for parent in invocation.parents() ] node = self.wrap_invocation(invocation, new_input_streams=input_streams) + topological_order.append(node) invocation_to_stream_lut[invocation] = node() self.nodes[node.label] = node + self._topolical_order = topological_order - def run(self) -> None: - # FIXME: perform more efficient traversal through the graph! - for node in self.nodes.values(): - node.flow() - - self.flush() + def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + # TODO: perform more efficient traversal through the graph! + for node in self._topological_order: + node().run(execution_engine=execution_engine) + self.flush() def wrap_invocation( self, From f8fd07cd36bdf6afa2586333774c0cdcc1ec99d6 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 5 Aug 2025 19:56:25 +0000 Subject: [PATCH 177/224] fix: proper use of execution engine in pipeline --- .../02_parallel_execution_on_ray.ipynb | 198 +++++++++++++----- pyproject.toml | 5 +- src/orcapod/data/pods.py | 25 ++- src/orcapod/data/sources.py | 51 +++-- src/orcapod/data/streams.py | 5 +- src/orcapod/pipeline/graph.py | 14 +- src/orcapod/pipeline/nodes.py | 48 ----- src/orcapod/protocols/data_protocols.py | 12 +- uv.lock | 6 +- 9 files changed, 215 insertions(+), 149 deletions(-) diff --git a/notebooks/tutorials/02_parallel_execution_on_ray.ipynb b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb index 692e121..73e04cc 100644 --- a/notebooks/tutorials/02_parallel_execution_on_ray.ipynb +++ b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb @@ -12,19 +12,6 @@ "import pyarrow as pa" ] }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3196df7e", - "metadata": {}, - "outputs": [], - "source": [ - "input_stream = op.streams.ImmutableTableStream(\n", - " pa.Table.from_pylist([{\"id\": i, \"x\": i * 2, \"y\": i * 3} for i in range(30)]),\n", - " tag_columns=[\"id\"],\n", - ")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -35,9 +22,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-08-05 07:53:27,293\tINFO client_builder.py:242 -- Passing the following kwargs to ray.init() on the server: log_to_driver\n", + "2025-08-05 19:55:02,021\tINFO client_builder.py:242 -- Passing the following kwargs to ray.init() on the server: log_to_driver\n", "SIGTERM handler is not set because current thread is not the main thread.\n", - "2025-08-05 07:53:30,607\tWARNING utils.py:1280 -- Python patch version mismatch: The cluster was started with:\n", + "2025-08-05 19:55:04,766\tWARNING utils.py:1280 -- Python patch version mismatch: The cluster was started with:\n", " Ray: 2.48.0\n", " Python: 3.13.5\n", "This process on Ray Client was started with:\n", @@ -50,13 +37,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[36m(autoscaler +33s)\u001b[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", - "\u001b[36m(autoscaler +33s)\u001b[0m Adding 5 node(s) of type workergroup.\n", - "\u001b[36m(autoscaler +33s)\u001b[0m Resized to 6 CPUs, 5 GPUs.\n", - "\u001b[36m(autoscaler +33s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*19. Add suitable node types to this cluster to resolve this issue.\n", - "\u001b[36m(autoscaler +39s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*9. Add suitable node types to this cluster to resolve this issue.\n", - "\u001b[36m(autoscaler +54s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*12. Add suitable node types to this cluster to resolve this issue.\n", - "\u001b[36m(autoscaler +1m20s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*8. Add suitable node types to this cluster to resolve this issue.\n" + "\u001b[36m(autoscaler +15s)\u001b[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", + "\u001b[36m(autoscaler +15s)\u001b[0m Adding 5 node(s) of type workergroup.\n", + "\u001b[36m(autoscaler +15s)\u001b[0m Resized to 6 CPUs, 5 GPUs.\n", + "\u001b[36m(autoscaler +15s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*13. 
Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +20s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*24. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +31s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*30. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +36s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*3. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +46s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*14. Add suitable node types to this cluster to resolve this issue.\n" ] } ], @@ -66,6 +54,19 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": 3, + "id": "64a4e7d9", + "metadata": {}, + "outputs": [], + "source": [ + "input_stream = op.streams.ImmutableTableStream(\n", + " pa.Table.from_pylist([{\"id\": i, \"x\": i * 2, \"y\": i * 3} for i in range(50)]),\n", + " tag_columns=[\"id\"],\n", + ")" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -73,7 +74,7 @@ "metadata": {}, "outputs": [], "source": [ - "from time import sleep\n", + "from pathlib import Path\n", "\n", "\n", "@op.function_pod(\"sum\")\n", @@ -81,22 +82,24 @@ " \"\"\"\n", " A simple function that adds two numbers.\n", " \"\"\"\n", - " sleep(0.5)\n", + " import time\n", + "\n", + " time.sleep(0.2)\n", " return x + y" ] }, { "cell_type": "markdown", - "id": "0de4762b", + "id": "52c36a2a", "metadata": {}, "source": [ - "Run first synchronously" + "First run synchronously" ] }, { "cell_type": "code", "execution_count": 5, - "id": "506a3a1e", + "id": "4e12bab5", "metadata": {}, "outputs": [ { @@ -109,10 +112,10 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (30, 2)
" + "shape: (50, 2)
" ], "text/plain": [ - "shape: (30, 2)\n", + "shape: (50, 2)\n", "┌─────┬─────┐\n", "│ id ┆ sum │\n", "│ --- ┆ --- │\n", @@ -124,11 +127,11 @@ "│ 3 ┆ 15 │\n", "│ 4 ┆ 20 │\n", "│ … ┆ … │\n", - "│ 25 ┆ 125 │\n", - "│ 26 ┆ 130 │\n", - "│ 27 ┆ 135 │\n", - "│ 28 ┆ 140 │\n", - "│ 29 ┆ 145 │\n", + "│ 45 ┆ 225 │\n", + "│ 46 ┆ 230 │\n", + "│ 47 ┆ 235 │\n", + "│ 48 ┆ 240 │\n", + "│ 49 ┆ 245 │\n", "└─────┴─────┘" ] }, @@ -145,16 +148,16 @@ }, { "cell_type": "markdown", - "id": "fcc8c2f8", + "id": "550b216d", "metadata": {}, "source": [ - "Now let's run it asynchronously using the Ray engine" + "Next we run using Ray engine" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "e83fddac", + "execution_count": 6, + "id": "75ade620", "metadata": {}, "outputs": [ { @@ -167,10 +170,10 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (30, 2)
" + "shape: (50, 2)
" ], "text/plain": [ - "shape: (30, 2)\n", + "shape: (50, 2)\n", "┌─────┬─────┐\n", "│ id ┆ sum │\n", "│ --- ┆ --- │\n", @@ -182,15 +185,15 @@ "│ 3 ┆ 15 │\n", "│ 4 ┆ 20 │\n", "│ … ┆ … │\n", - "│ 25 ┆ 125 │\n", - "│ 26 ┆ 130 │\n", - "│ 27 ┆ 135 │\n", - "│ 28 ┆ 140 │\n", - "│ 29 ┆ 145 │\n", + "│ 45 ┆ 225 │\n", + "│ 46 ┆ 230 │\n", + "│ 47 ┆ 235 │\n", + "│ 48 ┆ 240 │\n", + "│ 49 ┆ 245 │\n", "└─────┴─────┘" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -203,11 +206,112 @@ }, { "cell_type": "markdown", - "id": "23179bdc", + "id": "9440ecff", "metadata": {}, "source": [ "**NOTE**: Depending on the availability of nodes and how Ray was configured, you may *not* see any improvement in the running speed for the example above (it may even take longer due to overhead!). If you observe that you don't seem to be getting any speed up, please consult your Ray cluster administrator." ] + }, + { + "cell_type": "markdown", + "id": "39ef532a", + "metadata": {}, + "source": [ + "## Integration with pipeline system" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f459da03", + "metadata": {}, + "outputs": [], + "source": [ + "# make sure we are stating with a clean slate\n", + "import shutil\n", + "\n", + "shutil.rmtree(\"./test_store\", ignore_errors=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2befd400", + "metadata": {}, + "outputs": [], + "source": [ + "store = op.stores.BatchedDeltaTableArrowStore(\"./test_store\")\n", + "pipeline = op.Pipeline(\"pipeline_with_ray\", store)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e21ecaf2", + "metadata": {}, + "outputs": [], + "source": [ + "with pipeline:\n", + " result_stream = add_numbers(input_stream)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8449cb5d", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.run(ray_engine)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "40743bb7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (50, 2)
" + ], + "text/plain": [ + "shape: (50, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 0 │\n", + "│ 1 ┆ 5 │\n", + "│ 2 ┆ 10 │\n", + "│ 3 ┆ 15 │\n", + "│ 4 ┆ 20 │\n", + "│ … ┆ … │\n", + "│ 45 ┆ 225 │\n", + "│ 46 ┆ 230 │\n", + "│ 47 ┆ 235 │\n", + "│ 48 ┆ 240 │\n", + "│ 49 ┆ 245 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.add_numbers.as_df()" + ] } ], "metadata": { diff --git a/pyproject.toml b/pyproject.toml index 10d436b..0da210f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,6 @@ dependencies = [ "pyarrow>=20.0.0", "polars>=1.31.0", "beartype>=0.21.0", - ] readme = "README.md" requires-python = ">=3.12.0" @@ -32,7 +31,7 @@ Homepage = "https://github.com/walkerlab/orcapod-python" [project.optional-dependencies] redis = ["redis>=6.2.0"] -ray = ["ray[default]>=2.48.0", "ipywidgets>=8.1.7"] +ray = ["ray[default]==2.48.0", "ipywidgets>=8.1.7"] all = ["orcapod[redis]", "orcapod[ray]"] @@ -53,7 +52,7 @@ dev = [ "pyiceberg>=0.9.1", "pytest>=8.3.5", "pytest-cov>=6.1.1", - "ray>=2.48.0", + "ray[default]==2.48.0", "redis>=6.2.0", "ruff>=0.11.11", "tqdm>=4.67.1", diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 9604da6..4a2ed7a 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -429,19 +429,18 @@ async def async_call( output_values = [] # any kernel/pod invocation happening inside the function will NOT be tracked - with self._tracker_manager.no_tracking(): - # any kernel/pod invocation happening inside the function will NOT be tracked - if not isinstance(packet, dict): - input_dict = packet.as_dict(include_source=False) - else: - input_dict = packet - if execution_engine is not None: - # use the provided execution engine to run the function - values = await execution_engine.submit_async( - self.function, **input_dict - ) - else: - values = self.function(**input_dict) + # with self._tracker_manager.no_tracking(): + # FIXME: figure out how to properly make context manager work with async/await + # any kernel/pod invocation happening inside the function will NOT be tracked + if not isinstance(packet, dict): + input_dict = packet.as_dict(include_source=False) + else: + input_dict = packet + if execution_engine is not None: + # use the provided execution engine to run the function + values = await execution_engine.submit_async(self.function, **input_dict) + else: + values = self.function(**input_dict) output_data = self.process_function_output(values) diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index c31971d..bcf2596 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -148,9 +148,29 @@ def as_table( include_content_hash=include_content_hash, ) - def flow(self) -> Collection[tuple[dp.Tag, dp.Packet]]: + def flow( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> Collection[tuple[dp.Tag, dp.Packet]]: """Delegate to the cached KernelStream.""" - return self().flow() + return self().flow(execution_engine=execution_engine) + + def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + """ + Run the source node, executing the contained source. + + This is a no-op for sources since they are not executed like pods. 
+ """ + self().run(execution_engine=execution_engine) + + async def run_async( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> None: + """ + Run the source node asynchronously, executing the contained source. + + This is a no-op for sources since they are not executed like pods. + """ + await self().run_async(execution_engine=execution_engine) # ==================== LiveStream Protocol (Delegation) ==================== @@ -195,38 +215,37 @@ def get_all_records( """ ... - @property - def lazy(self) -> "pl.LazyFrame | None": + def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": records = self.get_all_records(include_system_columns=False) if records is not None: - return pl.LazyFrame(records) + result = pl.LazyFrame(records) + if sort_by_tags: + result = result.sort(self.tag_keys) + return result return None - @property - def df(self) -> "pl.DataFrame | None": + def as_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": """ Return the DataFrame representation of the pod's records. """ - lazy_df = self.lazy + lazy_df = self.as_lazy_frame(sort_by_tags=sort_by_tags) if lazy_df is not None: return lazy_df.collect() return None - @property - def polars_df(self) -> "pl.DataFrame | None": + def as_polars_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": """ Return the DataFrame representation of the pod's records. """ - return self.df + return self.as_df(sort_by_tags=sort_by_tags) - @property - def pandas_df(self) -> "pd.DataFrame | None": + def as_pandas_df(self, sort_by_tags: bool = True) -> "pd.DataFrame | None": """ Return the pandas DataFrame representation of the pod's records. """ - records = self.get_all_records(include_system_columns=False) - if records is not None: - return records.to_pandas() + df = self.as_polars_df(sort_by_tags=sort_by_tags) + if df is not None: + return df.to_pandas() return None def reset_cache(self) -> None: diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 6ea31e0..b6e7470 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1045,9 +1045,8 @@ async def run_async( import asyncio completed_calls = await asyncio.gather(*pending_calls) - for results in completed_calls: - for tag, packet in results: - cached_results.append((tag, packet)) + for result in completed_calls: + cached_results.append(result) self._cached_output_packets = cached_results self._set_modified_time() diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 6ada8a0..6fc7f05 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -45,7 +45,7 @@ def __init__( self.nodes = {} self.auto_compile = auto_compile self._dirty = False - self._topological_order = [] # Track order of invocations + self._ordered_nodes = [] # Track order of invocations def __exit__(self, exc_type=None, exc_value=None, traceback=None): """ @@ -82,22 +82,20 @@ def compile(self) -> None: invocation_to_stream_lut = {} G = self.generate_graph() - topological_order = [] for invocation in nx.topological_sort(G): input_streams = [ invocation_to_stream_lut[parent] for parent in invocation.parents() ] node = self.wrap_invocation(invocation, new_input_streams=input_streams) - topological_order.append(node) invocation_to_stream_lut[invocation] = node() self.nodes[node.label] = node - self._topolical_order = topological_order def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: - # TODO: perform more efficient traversal through the graph! 
- for node in self._topological_order: - node().run(execution_engine=execution_engine) - self.flush() + # FIXME: perform more efficient traversal through the graph! + for node in self.nodes.values(): + node.run(execution_engine=execution_engine) + + self.flush() def wrap_invocation( self, diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 781161b..415f8b4 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -187,54 +187,6 @@ def get_all_records( return results -def add_pipeline_record( - self, tag: dp.Tag, input_packet: dp.Packet, retrieved: bool | None = None -) -> None: - # combine dp.Tag with packet content hash to compute entry hash - tag_with_hash = tag.as_table().append_column( - self.PACKET_HASH_COLUMN, - pa.array([input_packet.content_hash()], type=pa.large_string()), - ) - entry_id = self.data_context.arrow_hasher.hash_table( - tag_with_hash, prefix_hasher_id=True - ) - - existing_record = self.pipeline_store.get_record_by_id( - self.pipeline_path, - entry_id, - ) - - if existing_record is not None: - # if the record already exists, return it - return - - # no record matching, so construct the full record - - input_packet_info = ( - input_packet.as_table( - include_source=True, - ) - .append_column( - f"{constants.META_PREFIX}input_packet{constants.CONTEXT_KEY}", - pa.array([input_packet.data_context_key], type=pa.large_string()), - ) - .append_column( - self.DATA_RETRIEVED_FLAG, - pa.array([retrieved], type=pa.bool_()), - ) - .drop(input_packet.keys()) - ) - - combined_record = arrow_utils.hstack_tables(tag_with_hash, input_packet_info) - - self.pipeline_store.add_record( - self.pipeline_path, - entry_id, - combined_record, - skip_duplicates=False, - ) - - class PodNode(Node, CachedPod): def __init__( self, diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 284a697..8026765 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -2061,17 +2061,13 @@ def get_all_records( """ ... - @property - def lazy(self) -> "pl.LazyFrame | None": ... + def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": ... - @property - def df(self) -> "pl.DataFrame | None": ... + def as_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": ... - @property - def polars_df(self) -> "pl.DataFrame | None": ... + def as_polars_df(self, sort_by_tags: bool = False) -> "pl.DataFrame | None": ... - @property - def pandas_df(self) -> "pd.DataFrame | None": ... + def as_pandas_df(self, sort_by_tags: bool = False) -> "pd.DataFrame | None": ... 
class Tracker(Protocol): diff --git a/uv.lock b/uv.lock index 632bbc5..a2214ca 100644 --- a/uv.lock +++ b/uv.lock @@ -1284,7 +1284,7 @@ dev = [ { name = "pyiceberg" }, { name = "pytest" }, { name = "pytest-cov" }, - { name = "ray" }, + { name = "ray", extra = ["default"] }, { name = "redis" }, { name = "ruff" }, { name = "tqdm" }, @@ -1302,7 +1302,7 @@ requires-dist = [ { name = "polars", specifier = ">=1.31.0" }, { name = "pyarrow", specifier = ">=20.0.0" }, { name = "pyyaml", specifier = ">=6.0.2" }, - { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = ">=2.48.0" }, + { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = "==2.48.0" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" }, { name = "typing-extensions" }, { name = "xxhash" }, @@ -1320,7 +1320,7 @@ dev = [ { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-cov", specifier = ">=6.1.1" }, - { name = "ray", specifier = ">=2.48.0" }, + { name = "ray", extras = ["default"], specifier = "==2.48.0" }, { name = "redis", specifier = ">=6.2.0" }, { name = "ruff", specifier = ">=0.11.11" }, { name = "tqdm", specifier = ">=4.67.1" }, From 63a217f9a9fc87b484e518e1b43f313d4e7a0771 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 6 Aug 2025 00:30:28 +0000 Subject: [PATCH 178/224] fix: improper handling of same named column in input and output packet and missing rename of source info --- pyproject.toml | 1 + .../data/datagrams/arrow_tag_packet.py | 32 +++++++++++++++++++ src/orcapod/data/datagrams/dict_datagram.py | 2 +- src/orcapod/data/datagrams/dict_tag_packet.py | 32 +++++++++++++++++++ src/orcapod/data/pods.py | 1 - src/orcapod/pipeline/nodes.py | 11 ++++--- src/orcapod/stores/delta_lake_stores.py | 12 ++++--- uv.lock | 31 ++++++++++++++++++ 8 files changed, 111 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0da210f..4bf4d1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,4 +56,5 @@ dev = [ "redis>=6.2.0", "ruff>=0.11.11", "tqdm>=4.67.1", + "unitycatalog-client>=0.3.0", ] diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index afca7a5..4ae06a1 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -445,10 +445,42 @@ def with_source_info(self, **source_info: str | None) -> Self: ) return new_packet + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """ + Create a new ArrowDatagram with data columns renamed. + Maintains immutability by returning a new instance. + + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New ArrowDatagram instance with renamed data columns + """ + # Create new schema with renamed fields, preserving original types + + if not column_mapping: + return self + + new_names = [column_mapping.get(k, k) for k in self._data_table.column_names] + + new_source_info_names = [ + f"{constants.SOURCE_PREFIX}{column_mapping.get(k.removeprefix(constants.SOURCE_PREFIX), k)}" + for k in self._source_info_table.column_names + ] + + new_datagram = self.copy(include_cache=False) + new_datagram._data_table = new_datagram._data_table.rename_columns(new_names) + new_datagram._source_info_table = ( + new_datagram._source_info_table.rename_columns(new_source_info_names) + ) + + return new_datagram + # 8. 
Utility Operations def copy(self, include_cache: bool = True) -> Self: """Return a copy of the datagram.""" new_packet = super().copy(include_cache=include_cache) + new_packet._source_info_table = self._source_info_table if include_cache: new_packet._cached_source_info = self._cached_source_info diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 30f0903..00ec527 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -397,7 +397,7 @@ def as_arrow_compatible_dict( include_context=include_context, ) - return self._data_context.type_converter.python_dict_to_struct_dict( + return self._data_context.type_converter.python_dicts_to_struct_dicts( [python_dict], python_schema=python_schema )[0] diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index eaef415..84f9c65 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -401,6 +401,38 @@ def types( schema[f"{constants.SOURCE_PREFIX}{key}"] = str return schema + def rename(self, column_mapping: Mapping[str, str]) -> Self: + """ + Create a new DictDatagram with data columns renamed. + Maintains immutability by returning a new instance. + + Args: + column_mapping: Mapping from old column names to new column names + + Returns: + New DictDatagram instance with renamed data columns + """ + # Rename data columns according to mapping, preserving original types + + new_data = {column_mapping.get(k, k): v for k, v in self._data.items()} + + new_source_info = { + column_mapping.get(k, k): v for k, v in self._source_info.items() + } + + # Handle python_schema updates for renamed columns + new_python_schema = { + column_mapping.get(k, k): v for k, v in self._data_python_schema.items() + } + + return self.__class__( + data=new_data, + meta_info=self._meta_data, + source_info=new_source_info, + python_schema=new_python_schema, + data_context=self._data_context, + ) + def arrow_schema( self, include_all_info: bool = False, diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 4a2ed7a..3827998 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -426,7 +426,6 @@ async def async_call( f"Pod is not active: skipping computation on input packet {packet}" ) return tag, None - output_values = [] # any kernel/pod invocation happening inside the function will NOT be tracked # with self._tracker_manager.no_tracking(): diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 415f8b4..83c946b 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -309,6 +309,7 @@ def add_pipeline_record( tag_with_hash, prefix_hasher_id=True ) + # FIXME: consider and implement more robust cache lookup logic existing_record = None if not skip_cache_lookup: existing_record = self.pipeline_store.get_record_by_id( @@ -320,10 +321,12 @@ def add_pipeline_record( # if the record already exists, then skip return + # rename all keys to avoid potential collision with result columns + renamed_input_packet = input_packet.rename( + {k: f"_input_{k}" for k in input_packet.keys()} + ) input_packet_info = ( - input_packet.as_table( - include_source=True, - ) + renamed_input_packet.as_table(include_source=True) .append_column( constants.PACKET_RECORD_ID, pa.array([packet_record_id], type=pa.large_string()), @@ -336,7 +339,7 @@ def add_pipeline_record( self.DATA_RETRIEVED_FLAG, pa.array([retrieved], 
type=pa.bool_()), ) - .drop(input_packet.keys()) + .drop_columns(list(renamed_input_packet.keys())) ) combined_record = arrow_utils.hstack_tables(tag.as_table(), input_packet_info) diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index 41c09b5..ab1b8d7 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -992,7 +992,7 @@ def flush_batch(self, record_path: tuple[str, ...]) -> None: record_ids = combined_table.column( self.RECORD_ID_COLUMN ).to_pylist() - unique_record_ids = list(set(record_ids)) + unique_record_ids = cast(list[str], list(set(record_ids))) # Delete existing records with these IDs if unique_record_ids: @@ -1458,7 +1458,7 @@ def add_records( records_renamed[self.RECORD_ID_COLUMN], existing_ids ) ) - records_renamed = pc.filter(records_renamed, mask) + records_renamed = pc.filter(records_renamed, mask) # type: ignore # Update the list of record IDs that will actually be added if len(records_renamed) > 0: @@ -1492,8 +1492,8 @@ def add_records( # Group records by record_id for individual batch entries for record_id in unique_record_ids: # Filter records for this specific record_id - mask = pc.equal(records_renamed[self.RECORD_ID_COLUMN], record_id) - single_record = pc.filter(records_renamed, mask) + mask = pc.equal(records_renamed[self.RECORD_ID_COLUMN], record_id) # type: ignore + single_record = pc.filter(records_renamed, mask) # type: ignore # Add to pending batch (will overwrite if duplicate_entry_behavior allows) if ( @@ -1671,7 +1671,9 @@ def get_records_by_ids( try: # Use schema-preserving read with filters - filter_expr = self._create_record_ids_filter(record_ids_list) + filter_expr = self._create_record_ids_filter( + cast(list[str], record_ids_list) + ) result = self._read_table_with_filter(delta_table, filters=filter_expr) if len(result) == 0: diff --git a/uv.lock b/uv.lock index a2214ca..f525a8b 100644 --- a/uv.lock +++ b/uv.lock @@ -78,6 +78,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl", hash = "sha256:3180cf304c5c712d626b9162b195b1db7ddf976a2a25172b35bb2448b890a80d", size = 25231, upload-time = "2025-03-31T14:16:18.478Z" }, ] +[[package]] +name = "aiohttp-retry" +version = "2.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9d/61/ebda4d8e3d8cfa1fd3db0fb428db2dd7461d5742cea35178277ad180b033/aiohttp_retry-2.9.1.tar.gz", hash = "sha256:8eb75e904ed4ee5c2ec242fefe85bf04240f685391c4879d8f541d6028ff01f1", size = 13608, upload-time = "2024-11-06T10:44:54.574Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1a/99/84ba7273339d0f3dfa57901b846489d2e5c2cd731470167757f1935fffbd/aiohttp_retry-2.9.1-py3-none-any.whl", hash = "sha256:66d2759d1921838256a05a3f80ad7e724936f083e35be5abb5e16eed6be6dc54", size = 9981, upload-time = "2024-11-06T10:44:52.917Z" }, +] + [[package]] name = "aiosignal" version = "1.4.0" @@ -1288,6 +1300,7 @@ dev = [ { name = "redis" }, { name = "ruff" }, { name = "tqdm" }, + { name = "unitycatalog-client" }, ] [package.metadata] @@ -1324,6 +1337,7 @@ dev = [ { name = "redis", specifier = ">=6.2.0" }, { name = "ruff", specifier = ">=0.11.11" }, { name = "tqdm", specifier = ">=4.67.1" }, + { name = "unitycatalog-client", specifier = ">=0.3.0" }, ] [[package]] @@ -2309,6 +2323,23 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] +[[package]] +name = "unitycatalog-client" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "aiohttp-retry" }, + { name = "pydantic" }, + { name = "python-dateutil" }, + { name = "typing-extensions" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/14/430e0fd06707b5ade9ba69a5847e1645a1adab7761b8149fd1916f814216/unitycatalog_client-0.3.0.tar.gz", hash = "sha256:6373b8c26723307beb9e14e92c9c5b75cc6dab343ba30b0a1d93c421ca944dfa", size = 63438, upload-time = "2025-06-06T15:23:06.584Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/dd/7e12caea6075a02feec609f60e2b7fe06a7b39f1c7026b32b21eaa6a68b0/unitycatalog_client-0.3.0-py3-none-any.whl", hash = "sha256:29d6061cafd076a098d515d3019a19d2449c14b82621b3910c1273cba16ee6e5", size = 159106, upload-time = "2025-06-06T15:23:03.839Z" }, +] + [[package]] name = "urllib3" version = "2.4.0" From dd2d1d56b5f44549c9914399346e56db4471856c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Fri, 8 Aug 2025 22:58:11 +0000 Subject: [PATCH 179/224] Fix join to work with complex data types --- src/orcapod/data/operators/join.py | 9 +++++++-- src/orcapod/data/streams.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py index a101f3d..d6d92cc 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/data/operators/join.py @@ -10,8 +10,10 @@ if TYPE_CHECKING: import pyarrow as pa + import polars as pl else: pa = LazyModule("pyarrow") + pl = LazyModule("polars") class Join(NonZeroInputOperator): @@ -78,9 +80,12 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream: common_tag_keys = tag_keys.intersection(next_tag_keys) common_tag_keys.add(COMMON_JOIN_KEY) - table = table.join( - next_table, keys=list(common_tag_keys), join_type="inner" + table = ( + pl.DataFrame(table) + .join(pl.DataFrame(next_table), on=list(common_tag_keys), how="inner") + .to_arrow() ) + tag_keys.update(next_tag_keys) # reorder columns to bring tag columns to the front diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index b6e7470..bf88836 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,6 +1,6 @@ import logging from abc import ABC, abstractmethod -from collections.abc import AsyncIterator, Collection, Iterator, Mapping +from collections.abc import Collection, Iterator, Mapping from datetime import datetime, timezone from itertools import repeat from pathlib import Path From 3e1069c5a3e250df516fa2b0eaf1c282b4f42d7d Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sun, 10 Aug 2025 23:22:36 +0000 Subject: [PATCH 180/224] feat: add hashing of complex type with semantic type and use of ref in objectspec --- pyproject.toml | 4 + .../contexts/data/schemas/context_schema.json | 5 + src/orcapod/contexts/data/v0.1.json | 41 +- src/orcapod/contexts/registry.py | 25 +- .../execution_engines/ray_execution_engine.py | 2 +- src/orcapod/hashing/arrow_hashers.py | 452 ++++++++++++----- src/orcapod/hashing/visitors.py | 377 ++++++++++++++ src/orcapod/pipeline/graph.py | 1 - src/orcapod/protocols/semantic_protocols.py | 23 + .../semantic_struct_converters.py | 97 +++- src/orcapod/utils/arrow_utils.py | 266 ++++++++++ src/orcapod/utils/object_spec.py | 29 +- uv.lock | 474 +++++++++++++++++- 13 files changed, 1626 insertions(+), 170 deletions(-) create mode 100644 src/orcapod/hashing/visitors.py diff --git a/pyproject.toml b/pyproject.toml index 4bf4d1a..7c48254 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,11 +43,14 @@ version_file = "src/orcapod/_version.py" [dependency-groups] dev = [ + "adlfs>=2024.12.0", "deltalake>=1.0.2", + "gcsfs>=2025.7.0", "httpie>=3.2.4", "ipykernel>=6.29.5", "ipywidgets>=8.1.7", "jsonschema>=4.25.0", + "minio>=7.2.16", "pyarrow-stubs>=20.0.0.20250716", "pyiceberg>=0.9.1", "pytest>=8.3.5", @@ -55,6 +58,7 @@ dev = [ "ray[default]==2.48.0", "redis>=6.2.0", "ruff>=0.11.11", + "s3fs>=2025.7.0", "tqdm>=4.67.1", "unitycatalog-client>=0.3.0", ] diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 0d5a305..0485d51 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -8,6 +8,7 @@ "required": [ "context_key", "version", + "semantic_registry", "type_converter", "arrow_hasher", "object_hasher" @@ -41,6 +42,10 @@ "Enhanced version with timestamp support and improved hashing" ] }, + "semantic_registry": { + "$ref": "#/$defs/objectspec", + "description": "ObjectSpec for the semantic registry" + }, "type_converter": { "$ref": "#/$defs/objectspec", "description": "ObjectSpec for the python-arrow type converter" diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 188bd9c..7cbf2a9 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -2,20 +2,15 @@ "context_key": "std:v0.1:default", "version": "v0.1", "description": "Initial stable release with basic Path semantic type support", - "type_converter": { - "_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter", + "semantic_registry": { + "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", "_config": { - "semantic_registry": { - "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", - "_config": { - "converters": [ - { - "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", - "_config": {} - } - ] + "converters": [ + { + "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", + "_config": {} } - } + ] } }, "arrow_hasher": { @@ -25,18 +20,16 @@ "hash_algorithm": "sha256", "chunk_size": 8192, "serialization_method": "logical", - "semantic_type_hashers": { - "path": { - "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", - "_config": { - "file_hasher": { - "_class": "orcapod.hashing.file_hashers.BasicFileHasher", - "_config": { - "algorithm": "sha256" - } - } - } - } + "semantic_registry": { + "_ref": "semantic_registry" + } + } + }, 
+ "type_converter": { + "_class": "orcapod.semantic_types.universal_converter.UniversalTypeConverter", + "_config": { + "semantic_registry": { + "_ref": "semantic_registry" } } }, diff --git a/src/orcapod/contexts/registry.py b/src/orcapod/contexts/registry.py index 7bf869a..e3f0989 100644 --- a/src/orcapod/contexts/registry.py +++ b/src/orcapod/contexts/registry.py @@ -264,23 +264,36 @@ def _create_context_from_spec(self, spec: dict[str, Any]) -> DataContext: context_key = spec["context_key"] version = spec["version"] description = spec.get("description", "") + ref_lut = {} logger.debug(f"Creating type converter for {version}") - type_converter = parse_objectspec(spec["type_converter"]) + ref_lut["semantic_registry"] = parse_objectspec( + spec["semantic_registry"], + ref_lut=ref_lut, + ) + + logger.debug(f"Creating type converter for {version}") + ref_lut["type_converter"] = parse_objectspec( + spec["type_converter"], ref_lut=ref_lut + ) logger.debug(f"Creating arrow hasher for {version}") - arrow_hasher = parse_objectspec(spec["arrow_hasher"]) + ref_lut["arrow_hasher"] = parse_objectspec( + spec["arrow_hasher"], ref_lut=ref_lut + ) logger.debug(f"Creating object hasher for {version}") - object_hasher = parse_objectspec(spec["object_hasher"]) + ref_lut["object_hasher"] = parse_objectspec( + spec["object_hasher"], ref_lut=ref_lut + ) return DataContext( context_key=context_key, version=version, description=description, - type_converter=type_converter, - arrow_hasher=arrow_hasher, - object_hasher=object_hasher, + type_converter=ref_lut["type_converter"], + arrow_hasher=ref_lut["arrow_hasher"], + object_hasher=ref_lut["object_hasher"], ) except Exception as e: diff --git a/src/orcapod/execution_engines/ray_execution_engine.py b/src/orcapod/execution_engines/ray_execution_engine.py index 47d2032..6e581f9 100644 --- a/src/orcapod/execution_engines/ray_execution_engine.py +++ b/src/orcapod/execution_engines/ray_execution_engine.py @@ -16,7 +16,7 @@ T = TypeVar("T") -class NativeRayAsyncEngine: +class RayEngine: """ Ray execution engine using native asyncio support. diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 264caad..c8c53fb 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -2,16 +2,19 @@ from typing import Any import pyarrow as pa import json -from orcapod.protocols.hashing_protocols import SemanticTypeHasher, StringCacher +from orcapod.semantic_types import SemanticTypeRegistry from orcapod.hashing import arrow_serialization from collections.abc import Callable +from orcapod.hashing.visitors import SemanticHashingVisitor +from orcapod.utils import arrow_utils + SERIALIZATION_METHOD_LUT: dict[str, Callable[[pa.Table], bytes]] = { "logical": arrow_serialization.serialize_table_logical, } -def serialize_pyarrow_table(table: pa.Table) -> str: +def json_pyarrow_table_serialization(table: pa.Table) -> str: """ Serialize a PyArrow table to a stable JSON string by converting to dictionary of lists. @@ -41,18 +44,19 @@ class SemanticArrowHasher: Stable hasher for Arrow tables with semantic type support. This hasher: - 1. Processes columns with special semantic types using dedicated hashers - 2. Sorts columns by name for deterministic ordering - 3. Uses Arrow IPC format for stable serialization - 4. Computes final hash of the processed packet + 1. Uses visitor pattern to recursively process nested data structures + 2. Replaces semantic types with their hash strings using registered converters + 3. 
Sorts columns by name for deterministic ordering + 4. Uses Arrow serialization for stable binary representation + 5. Computes final hash of the processed table """ def __init__( self, + hasher_id: str | None = None, hash_algorithm: str = "sha256", - semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, chunk_size: int = 8192, - hasher_id: str | None = None, + semantic_registry: SemanticTypeRegistry | None = None, handle_missing: str = "error", serialization_method: str = "logical", # TODO: consider passing options for serialization method @@ -61,167 +65,126 @@ def __init__( Initialize SemanticArrowHasher. Args: - chunk_size: Size of chunks to read files in bytes + semantic_registry: Registry containing semantic type converters with hashing + hash_algorithm: Hash algorithm to use for final table hash + chunk_size: Size of chunks to read files in bytes (legacy, may be removed) + hasher_id: Unique identifier for this hasher instance handle_missing: How to handle missing files ('error', 'skip', 'null_hash') + serialization_method: Method for serializing Arrow table """ if hasher_id is None: hasher_id = f"semantic_arrow_hasher:{hash_algorithm}:{serialization_method}" + self._hasher_id = hasher_id + self.semantic_registry = semantic_registry self.chunk_size = chunk_size self.handle_missing = handle_missing - self.semantic_type_hashers: dict[str, SemanticTypeHasher] = ( - semantic_type_hashers or {} - ) self.hash_algorithm = hash_algorithm + if serialization_method not in SERIALIZATION_METHOD_LUT: raise ValueError( f"Invalid serialization method '{serialization_method}'. " f"Supported methods: {list(SERIALIZATION_METHOD_LUT.keys())}" ) self.serialization_method = serialization_method - self._serialize_arrow_table = SERIALIZATION_METHOD_LUT[serialization_method] - - def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: - """ - Add a string cacher for caching hash values. - - This is a no-op for SemanticArrowHasher since it hashes column contents directly. - """ - if semantic_type in self.semantic_type_hashers: - self.semantic_type_hashers[semantic_type].set_cacher(cacher) - else: - raise KeyError(f"No hasher registered for semantic type '{semantic_type}'") @property def hasher_id(self) -> str: return self._hasher_id - def register_semantic_hasher(self, semantic_type: str, hasher: SemanticTypeHasher): - """Register a custom hasher for a semantic type.""" - self.semantic_type_hashers[semantic_type] = hasher - - def _get_semantic_type(self, field: pa.Field) -> str | None: - """Extract semantic_type from field metadata.""" - if field.metadata is None: - return None - - metadata = field.metadata - if b"semantic_type" in metadata: - return metadata[b"semantic_type"].decode("utf-8") - elif "semantic_type" in metadata: - return metadata["semantic_type"] - - return None + def _process_table_columns(self, table: pa.Table) -> pa.Table: + """ + Process table columns using visitor pattern to handle nested semantic types. 
- def _create_hash_column( - self, - original_column: pa.Array, - hash_bytes: bytes, - original_field: pa.Field, - hash_algorithm: str | None = None, - ) -> tuple[pa.Array, pa.Field]: - """Create a new column containing the hash bytes.""" - # Create array of hash bytes (one hash value repeated for each row) - hash_value = hash_bytes.hex() # Convert to hex string for readability - hash_array = pa.array([hash_value] * len(original_column)) - - # Create new field with modified metadata - new_metadata = dict(original_field.metadata) if original_field.metadata else {} - new_metadata["original_semantic_type"] = new_metadata.get( - "semantic_type", "unknown" - ) - new_metadata["semantic_type"] = "hash" - new_metadata["hash_algorithm"] = hash_algorithm or self.hasher_id - - new_field = pa.field( - original_field.name, - pa.large_string(), # Hash stored as large string - nullable=original_field.nullable, - metadata=new_metadata, - ) - - return hash_array, new_field + This replaces the old column-by-column processing with a visitor-based approach + that can handle semantic types nested inside complex data structures. + """ + # TODO: Process in batchwise/chunk-wise fashion for memory efficiency + # Currently using to_pylist() for simplicity but this loads entire table into memory - def _process_table_columns(self, table: pa.Table) -> pa.Table: - # TODO: add copy of table-level metadata to the new table - """Process table columns, replacing semantic type columns with their hashes.""" new_columns = [] new_fields = [] + # Import here to avoid circular dependencies for i, field in enumerate(table.schema): - column = table.column(i) - semantic_type = self._get_semantic_type(field) - - if semantic_type in self.semantic_type_hashers: - # Hash the column using the appropriate semantic hasher - hasher = self.semantic_type_hashers[semantic_type] - hash_bytes = hasher.hash_column(column) - - # Replace column with hash - hash_column, hash_field = self._create_hash_column( - column, hash_bytes, field - ) - new_columns.append(hash_column) - new_fields.append(hash_field) - else: - # Keep original column - new_columns.append(column) - new_fields.append(field) - - # Create new table with processed columns - new_schema = pa.schema(new_fields) - return pa.table(new_columns, schema=new_schema) + # Convert column to struct dicts for processing + column_data = table.column(i).to_pylist() + + # Create fresh visitor for each column (stateless approach) + visitor = SemanticHashingVisitor(self.semantic_registry) + + try: + # Use visitor to transform both type and data + new_type = None + processed_data = [] + for c in column_data: + processed_type, processed_value = visitor.visit(field.type, c) + if new_type is None: + new_type = processed_type + processed_data.append(processed_value) + + # Create new Arrow column from processed data + new_column = pa.array(processed_data, type=new_type) + new_field = pa.field(field.name, new_type) + + new_columns.append(new_column) + new_fields.append(new_field) + + except Exception as e: + # Add context about which column failed + raise RuntimeError( + f"Failed to process column '{field.name}': {str(e)}" + ) from e + + # Return new table with processed columns + return pa.table(new_columns, schema=pa.schema(new_fields)) def _sort_table_columns(self, table: pa.Table) -> pa.Table: """Sort table columns by field name for deterministic ordering.""" - # Get column indices sorted by field name - sorted_indices = sorted( - range(len(table.schema)), key=lambda i: table.schema.field(i).name - ) + # 
Get sorted column names + sorted_column_names = sorted(table.column_names) - # Reorder columns - sorted_columns = [table.column(i) for i in sorted_indices] - sorted_fields = [table.schema.field(i) for i in sorted_indices] + # Use select to reorder columns - much cleaner! + return table.select(sorted_column_names) - sorted_schema = pa.schema(sorted_fields) - return pa.table(sorted_columns, schema=sorted_schema) - - # def _serialize_table_ipc(self, table: pa.Table) -> bytes: - # # TODO: fix and use logical table hashing instead - # """Serialize table using Arrow IPC format for stable binary representation.""" - # buffer = BytesIO() + def serialize_arrow_table(self, table: pa.Table) -> bytes: + """ + Serialize Arrow table using the configured serialization method. - # # Use IPC stream format for deterministic serialization - # with ipc.new_stream(buffer, table.schema) as writer: - # writer.write_table(table) + Args: + table: Arrow table to serialize - # return buffer.getvalue() + Returns: + Serialized bytes of the table + """ + serialization_method_function = SERIALIZATION_METHOD_LUT[ + self.serialization_method + ] + return serialization_method_function(table) def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: """ - Compute stable hash of Arrow table. + Compute stable hash of Arrow table with semantic type processing. Args: table: Arrow table to hash + prefix_hasher_id: Whether to prefix hash with hasher ID Returns: Hex string of the computed hash """ - # Step 1: Process columns with semantic types + # Step 1: Process columns with semantic types using visitor pattern processed_table = self._process_table_columns(table) # Step 2: Sort columns by name for deterministic ordering sorted_table = self._sort_table_columns(processed_table) - # normalize all string to large strings by passing through polars - # TODO: consider cleaner approach in the future - import polars as pl + # normalize all string to large strings (for compatibility with Polars) + normalized_table = arrow_utils.normalize_table_to_large_types(sorted_table) - sorted_table = pl.DataFrame(sorted_table).to_arrow() - - # Step 3: Serialize using Arrow IPC format - serialized_bytes = self._serialize_arrow_table(sorted_table) + # Step 3: Serialize using configured serialization method + serialized_bytes = self.serialize_arrow_table(normalized_table) # Step 4: Compute final hash hasher = hashlib.new(self.hash_algorithm) @@ -240,16 +203,19 @@ def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: Returns: Dictionary containing hash, metadata, and processing info """ - processed_columns = [] + # Process table to see what transformations were made + processed_table = self._process_table_columns(table) # Track processing steps - for i, field in enumerate(table.schema): - semantic_type = self._get_semantic_type(field) + processed_columns = [] + for i, (original_field, processed_field) in enumerate( + zip(table.schema, processed_table.schema) + ): column_info = { - "name": field.name, - "original_type": str(field.type), - "semantic_type": semantic_type, - "processed": semantic_type in self.semantic_type_hashers, + "name": original_field.name, + "original_type": str(original_field.type), + "processed_type": str(processed_field.type), + "was_processed": str(original_field.type) != str(processed_field.type), } processed_columns.append(column_info) @@ -258,8 +224,240 @@ def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: return { "hash": table_hash, + "hasher_id": self.hasher_id, + 
"serialization_method": self.serialization_method, + "hash_algorithm": self.hash_algorithm, "num_rows": len(table), "num_columns": len(table.schema), "processed_columns": processed_columns, "column_order": [field.name for field in table.schema], } + + +# class SemanticArrowHasher2: +# """ +# Stable hasher for Arrow tables with semantic type support. + +# This hasher: +# 1. Processes columns with special semantic types using dedicated hashers +# 2. Sorts columns by name for deterministic ordering +# 3. Uses Arrow IPC format for stable serialization +# 4. Computes final hash of the processed packet +# """ + +# def __init__( +# self, +# hash_algorithm: str = "sha256", +# semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, +# chunk_size: int = 8192, +# hasher_id: str | None = None, +# handle_missing: str = "error", +# serialization_method: str = "logical", +# # TODO: consider passing options for serialization method +# ): +# """ +# Initialize SemanticArrowHasher. + +# Args: +# chunk_size: Size of chunks to read files in bytes +# handle_missing: How to handle missing files ('error', 'skip', 'null_hash') +# """ +# if hasher_id is None: +# hasher_id = f"semantic_arrow_hasher:{hash_algorithm}:{serialization_method}" +# self._hasher_id = hasher_id +# self.chunk_size = chunk_size +# self.handle_missing = handle_missing +# self.semantic_type_hashers: dict[str, SemanticTypeHasher] = ( +# semantic_type_hashers or {} +# ) +# self.hash_algorithm = hash_algorithm +# if serialization_method not in SERIALIZATION_METHOD_LUT: +# raise ValueError( +# f"Invalid serialization method '{serialization_method}'. " +# f"Supported methods: {list(SERIALIZATION_METHOD_LUT.keys())}" +# ) +# self.serialization_method = serialization_method +# self._serialize_arrow_table = SERIALIZATION_METHOD_LUT[serialization_method] + +# def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: +# """ +# Add a string cacher for caching hash values. + +# This is a no-op for SemanticArrowHasher since it hashes column contents directly. 
+# """ +# if semantic_type in self.semantic_type_hashers: +# self.semantic_type_hashers[semantic_type].set_cacher(cacher) +# else: +# raise KeyError(f"No hasher registered for semantic type '{semantic_type}'") + +# @property +# def hasher_id(self) -> str: +# return self._hasher_id + +# def register_semantic_hasher(self, semantic_type: str, hasher: SemanticTypeHasher): +# """Register a custom hasher for a semantic type.""" +# self.semantic_type_hashers[semantic_type] = hasher + +# def _get_semantic_type(self, field: pa.Field) -> str | None: +# """Extract semantic_type from field metadata.""" +# if field.metadata is None: +# return None + +# metadata = field.metadata +# if b"semantic_type" in metadata: +# return metadata[b"semantic_type"].decode("utf-8") +# elif "semantic_type" in metadata: +# return metadata["semantic_type"] + +# return None + +# def _create_hash_column( +# self, +# original_column: pa.Array, +# hash_bytes: bytes, +# original_field: pa.Field, +# hash_algorithm: str | None = None, +# ) -> tuple[pa.Array, pa.Field]: +# """Create a new column containing the hash bytes.""" +# # Create array of hash bytes (one hash value repeated for each row) +# hash_value = hash_bytes.hex() # Convert to hex string for readability +# hash_array = pa.array([hash_value] * len(original_column)) + +# # Create new field with modified metadata +# new_metadata = dict(original_field.metadata) if original_field.metadata else {} +# new_metadata["original_semantic_type"] = new_metadata.get( +# "semantic_type", "unknown" +# ) +# new_metadata["semantic_type"] = "hash" +# new_metadata["hash_algorithm"] = hash_algorithm or self.hasher_id + +# new_field = pa.field( +# original_field.name, +# pa.large_string(), # Hash stored as large string +# nullable=original_field.nullable, +# metadata=new_metadata, +# ) + +# return hash_array, new_field + +# def _process_table_columns(self, table: pa.Table) -> pa.Table: +# # TODO: add copy of table-level metadata to the new table +# """Process table columns, replacing semantic type columns with their hashes.""" +# new_columns = [] +# new_fields = [] + +# for i, field in enumerate(table.schema): +# column = table.column(i) +# semantic_type = self._get_semantic_type(field) + +# if semantic_type in self.semantic_type_hashers: +# # Hash the column using the appropriate semantic hasher +# hasher = self.semantic_type_hashers[semantic_type] +# hash_bytes = hasher.hash_column(column) + +# # Replace column with hash +# hash_column, hash_field = self._create_hash_column( +# column, hash_bytes, field +# ) +# new_columns.append(hash_column) +# new_fields.append(hash_field) +# else: +# # Keep original column +# new_columns.append(column) +# new_fields.append(field) + +# # Create new table with processed columns +# new_schema = pa.schema(new_fields) +# return pa.table(new_columns, schema=new_schema) + +# def _sort_table_columns(self, table: pa.Table) -> pa.Table: +# """Sort table columns by field name for deterministic ordering.""" +# # Get column indices sorted by field name +# sorted_indices = sorted( +# range(len(table.schema)), key=lambda i: table.schema.field(i).name +# ) + +# # Reorder columns +# sorted_columns = [table.column(i) for i in sorted_indices] +# sorted_fields = [table.schema.field(i) for i in sorted_indices] + +# sorted_schema = pa.schema(sorted_fields) +# return pa.table(sorted_columns, schema=sorted_schema) + +# # def _serialize_table_ipc(self, table: pa.Table) -> bytes: +# # # TODO: fix and use logical table hashing instead +# # """Serialize table using Arrow IPC 
format for stable binary representation.""" +# # buffer = BytesIO() + +# # # Use IPC stream format for deterministic serialization +# # with ipc.new_stream(buffer, table.schema) as writer: +# # writer.write_table(table) + +# # return buffer.getvalue() + +# def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: +# """ +# Compute stable hash of Arrow table. + +# Args: +# table: Arrow table to hash + +# Returns: +# Hex string of the computed hash +# """ + +# # Step 1: Process columns with semantic types +# processed_table = self._process_table_columns(table) + +# # Step 2: Sort columns by name for deterministic ordering +# sorted_table = self._sort_table_columns(processed_table) + +# # normalize all string to large strings by passing through polars +# # TODO: consider cleaner approach in the future +# import polars as pl + +# sorted_table = pl.DataFrame(sorted_table).to_arrow() + +# # Step 3: Serialize using Arrow IPC format +# serialized_bytes = self._serialize_arrow_table(sorted_table) + +# # Step 4: Compute final hash +# hasher = hashlib.new(self.hash_algorithm) +# hasher.update(serialized_bytes) + +# hash_str = hasher.hexdigest() +# if prefix_hasher_id: +# hash_str = f"{self.hasher_id}@{hash_str}" + +# return hash_str + +# def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: +# """ +# Compute hash with additional metadata about the process. + +# Returns: +# Dictionary containing hash, metadata, and processing info +# """ +# processed_columns = [] + +# # Track processing steps +# for i, field in enumerate(table.schema): +# semantic_type = self._get_semantic_type(field) +# column_info = { +# "name": field.name, +# "original_type": str(field.type), +# "semantic_type": semantic_type, +# "processed": semantic_type in self.semantic_type_hashers, +# } +# processed_columns.append(column_info) + +# # Compute hash +# table_hash = self.hash_table(table) + +# return { +# "hash": table_hash, +# "num_rows": len(table), +# "num_columns": len(table.schema), +# "processed_columns": processed_columns, +# "column_order": [field.name for field in table.schema], +# } diff --git a/src/orcapod/hashing/visitors.py b/src/orcapod/hashing/visitors.py new file mode 100644 index 0000000..e205a12 --- /dev/null +++ b/src/orcapod/hashing/visitors.py @@ -0,0 +1,377 @@ +""" +SUGGESTED FILE: src/orcapod/hashing/visitors.py + +Generic visitor pattern for traversing Arrow types and data simultaneously. + +This provides a base visitor class that can be extended for various processing needs +like semantic hashing, validation, data cleaning, etc. +""" + +from abc import ABC, abstractmethod +from typing import Any, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry + + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class ArrowTypeDataVisitor(ABC): + """ + Base visitor for traversing Arrow types and data simultaneously. + + This enables processing that needs to transform both the Arrow schema + and the corresponding data in a single pass. 
+ """ + + @abstractmethod + def visit_struct( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """Visit a struct type with its data""" + pass + + @abstractmethod + def visit_list( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", Any]: + """Visit a list type with its data""" + pass + + @abstractmethod + def visit_map( + self, map_type: "pa.MapType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """Visit a map type with its data""" + pass + + @abstractmethod + def visit_primitive( + self, primitive_type: "pa.DataType", data: Any + ) -> tuple["pa.DataType", Any]: + """Visit a primitive type with its data""" + pass + + def visit(self, arrow_type: "pa.DataType", data: Any) -> tuple["pa.DataType", Any]: + """ + Main dispatch method that routes to appropriate visit method. + + Args: + arrow_type: Arrow data type to process + data: Corresponding data value + + Returns: + Tuple of (new_arrow_type, new_data) + """ + if pa.types.is_struct(arrow_type): + return self.visit_struct(arrow_type, data) + elif pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type): + return self.visit_list(arrow_type, data) + elif pa.types.is_fixed_size_list(arrow_type): + # Treat fixed-size lists like regular lists for processing + return self.visit_list(arrow_type, data) + elif pa.types.is_map(arrow_type): + return self.visit_map(arrow_type, data) + else: + return self.visit_primitive(arrow_type, data) + + def _visit_struct_fields( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.StructType", dict]: + """ + Helper method to recursively process struct fields. + + This is the default behavior for regular (non-semantic) structs. + """ + if data is None: + return struct_type, None + + new_fields = [] + new_data = {} + + for field in struct_type: + field_data = data.get(field.name) + new_field_type, new_field_data = self.visit(field.type, field_data) + + new_fields.append(pa.field(field.name, new_field_type)) + new_data[field.name] = new_field_data + + return pa.struct(new_fields), new_data + + def _visit_list_elements( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", list]: + """ + Helper method to recursively process list elements. + + This is the default behavior for lists. + """ + if data is None: + return list_type, None + + element_type = list_type.value_type + processed_elements = [] + new_element_type = None + + for item in data: + current_element_type, processed_item = self.visit(element_type, item) + processed_elements.append(processed_item) + + # Use the first non-None element to determine new element type + if new_element_type is None: + new_element_type = current_element_type + + # If list was empty or all None, keep original element type + if new_element_type is None: + new_element_type = element_type + + # Create appropriate list type based on original type + if pa.types.is_large_list(list_type): + return pa.large_list(new_element_type), processed_elements + elif pa.types.is_fixed_size_list(list_type): + return pa.list_(new_element_type, list_type.list_size), processed_elements + else: + return pa.list_(new_element_type), processed_elements + + +class PassThroughVisitor(ArrowTypeDataVisitor): + """ + A visitor that passes through data unchanged. + + Useful as a base class or for testing the visitor pattern. 
+ """ + + def visit_struct( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.DataType", Any]: + return self._visit_struct_fields(struct_type, data) + + def visit_list( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", Any]: + return self._visit_list_elements(list_type, data) + + def visit_map( + self, map_type: "pa.MapType", data: dict | None + ) -> tuple["pa.DataType", Any]: + # For simplicity, treat maps like structs for now + # TODO: Implement proper map handling if needed + return map_type, data + + def visit_primitive( + self, primitive_type: "pa.DataType", data: Any + ) -> tuple["pa.DataType", Any]: + return primitive_type, data + + +class SemanticHashingError(Exception): + """Exception raised when semantic hashing fails""" + + pass + + +class SemanticHashingVisitor(ArrowTypeDataVisitor): + """ + Visitor that replaces semantic types with their hash strings. + + This visitor traverses Arrow type structures and data simultaneously, + identifying semantic types by their struct signatures and replacing + them with hash strings computed by their respective converters. + """ + + def __init__(self, semantic_registry: SemanticTypeRegistry): + """ + Initialize the semantic hashing visitor. + + Args: + semantic_registry: Registry containing semantic type converters + """ + self.registry = semantic_registry + self._current_field_path: list[str] = [] + + def visit_struct( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """ + Visit a struct type, checking if it's a semantic type. + + If the struct is a semantic type (recognized by signature), replace it + with a hash string. Otherwise, recursively process its fields. + """ + if data is None: + return struct_type, None + + # Check if this struct IS a semantic type by signature recognition + converter = self.registry.get_converter_for_struct_signature(struct_type) + if converter: + # This is a semantic type - hash it + try: + hash_string = converter.hash_struct_dict(data, add_prefix=True) + return pa.large_string(), hash_string + except Exception as e: + field_path = ( + ".".join(self._current_field_path) + if self._current_field_path + else "" + ) + converter_name = getattr( + converter, "semantic_type_name", str(type(converter).__name__) + ) + raise SemanticHashingError( + f"Failed to hash semantic type '{converter_name}' at field path '{field_path}': {str(e)}" + ) from e + else: + # Regular struct - recursively process fields + return self._visit_struct_fields(struct_type, data) + + def visit_list( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", Any]: + """ + Visit a list type, recursively processing elements. + + Elements that are semantic types will be replaced with hash strings. + """ + if data is None: + return list_type, None + + # Add list indicator to field path for error context + self._current_field_path.append("[*]") + try: + return self._visit_list_elements(list_type, data) + finally: + self._current_field_path.pop() + + def visit_map( + self, map_type: "pa.MapType", data: dict | None + ) -> tuple["pa.DataType", Any]: + """ + Visit a map type. + + For now, we treat maps as pass-through since they're less common. + TODO: Implement proper map traversal if needed for semantic types in keys/values. + """ + return map_type, data + + def visit_primitive( + self, primitive_type: "pa.DataType", data: Any + ) -> tuple["pa.DataType", Any]: + """ + Visit a primitive type - pass through unchanged. 
+ + Primitive types cannot be semantic types (which are always structs). + """ + return primitive_type, data + + def _visit_struct_fields( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.StructType", dict]: + """Override to add field path tracking for better error messages""" + if data is None: + return struct_type, None + + new_fields = [] + new_data = {} + + for field in struct_type: + # Add field name to path for error context + self._current_field_path.append(field.name) + try: + field_data = data.get(field.name) + new_field_type, new_field_data = self.visit(field.type, field_data) + + new_fields.append(pa.field(field.name, new_field_type)) + new_data[field.name] = new_field_data + finally: + self._current_field_path.pop() + + return pa.struct(new_fields), new_data + + +class ValidationVisitor(ArrowTypeDataVisitor): + """ + Example visitor for data validation. + + This demonstrates how the visitor pattern can be extended for other use cases. + """ + + def __init__(self): + self.errors: list[str] = [] + self._current_field_path: list[str] = [] + + def visit_struct( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.DataType", Any]: + if data is None: + return struct_type, None + + # Check for missing required fields + field_names = {field.name for field in struct_type} + data_keys = set(data.keys()) + missing_fields = field_names - data_keys + + if missing_fields: + field_path = ( + ".".join(self._current_field_path) + if self._current_field_path + else "" + ) + self.errors.append( + f"Missing required fields {missing_fields} at '{field_path}'" + ) + + return self._visit_struct_fields(struct_type, data) + + def visit_list( + self, list_type: "pa.ListType", data: list | None + ) -> tuple["pa.DataType", Any]: + if data is None: + return list_type, None + + self._current_field_path.append("[*]") + try: + return self._visit_list_elements(list_type, data) + finally: + self._current_field_path.pop() + + def visit_map( + self, map_type: "pa.MapType", data: dict | None + ) -> tuple["pa.DataType", Any]: + return map_type, data + + def visit_primitive( + self, primitive_type: "pa.DataType", data: Any + ) -> tuple["pa.DataType", Any]: + return primitive_type, data + + def _visit_struct_fields( + self, struct_type: "pa.StructType", data: dict | None + ) -> tuple["pa.StructType", dict]: + """Override to add field path tracking""" + if data is None: + return struct_type, None + + new_fields = [] + new_data = {} + + for field in struct_type: + self._current_field_path.append(field.name) + try: + field_data = data.get(field.name) + new_field_type, new_field_data = self.visit(field.type, field_data) + + new_fields.append(pa.field(field.name, new_field_type)) + new_data[field.name] = new_field_data + finally: + self._current_field_path.pop() + + return pa.struct(new_fields), new_data diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 6fc7f05..752d368 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -5,7 +5,6 @@ from orcapod.protocols import store_protocols as sp from typing import Any from collections.abc import Collection -from orcapod.data.streams import WrappedStream import logging diff --git a/src/orcapod/protocols/semantic_protocols.py b/src/orcapod/protocols/semantic_protocols.py index 9a19b8d..1ce53bb 100644 --- a/src/orcapod/protocols/semantic_protocols.py +++ b/src/orcapod/protocols/semantic_protocols.py @@ -33,3 +33,26 @@ def can_handle_python_type(self, python_type: type) -> bool: def 
can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: """Check if this converter can handle the given struct type.""" ... + + def hash_struct_dict( + self, struct_dict: dict[str, Any], add_prefix: bool = False + ) -> str: + """ + Compute hash of the semantic type from its struct dictionary representation. + + Args: + struct_dict: Arrow struct dictionary representation + add_prefix: If True, prefix with semantic type and algorithm info + + Returns: + Hash string, optionally prefixed like "path:sha256:abc123..." + + Raises: + Exception: If hashing fails (e.g., file not found for path types) + """ + ... + + @property + def hasher_id(self) -> str: + """Identifier for this hasher (for debugging/versioning)""" + ... diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index e675b61..3ba45f5 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -15,11 +15,66 @@ pa = LazyModule("pyarrow") +class SemanticStructConverterBase: + """ + Base class providing common functionality for semantic struct converters. + + Subclasses only need to implement the abstract methods and can use + the common hashing infrastructure. + """ + + def __init__(self, semantic_type_name: str): + self._semantic_type_name = semantic_type_name + self._hasher_id = f"{self.semantic_type_name}_content_sha256" + + @property + def semantic_type_name(self) -> str: + """The name of the semantic type this converter handles.""" + return self._semantic_type_name + + @property + def hasher_id(self) -> str: + """Default hasher ID based on semantic type name""" + return self._hasher_id + + def _format_hash_string(self, hash_bytes: bytes, add_prefix: bool = False) -> str: + """ + Format hash bytes into the standard hash string format. + + Args: + hash_bytes: Raw hash bytes + add_prefix: Whether to add semantic type and algorithm prefix + + Returns: + Formatted hash string + """ + hash_hex = hash_bytes.hex() + if add_prefix: + return f"{self.semantic_type_name}:sha256:{hash_hex}" + else: + return hash_hex + + def _compute_content_hash(self, content: bytes) -> bytes: + """ + Compute SHA-256 hash of content bytes. + + Args: + content: Content to hash + + Returns: + SHA-256 hash bytes + """ + import hashlib + + return hashlib.sha256(content).digest() + + # Path-specific implementation -class PathStructConverter: +class PathStructConverter(SemanticStructConverterBase): """Converter for pathlib.Path objects to/from semantic structs.""" def __init__(self): + super().__init__("path") self._python_type = Path # Define the Arrow struct type for paths @@ -77,3 +132,43 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: return set(struct_dict.keys()) == {"path"} and isinstance( struct_dict["path"], str ) + + def hash_struct_dict( + self, struct_dict: dict[str, Any], add_prefix: bool = False + ) -> str: + """ + Compute hash of the file content pointed to by the path. 
+ + Args: + struct_dict: Arrow struct dictionary with 'path' field + add_prefix: If True, prefix with semantic type and algorithm info + + Returns: + Hash string of the file content, optionally prefixed + + Raises: + FileNotFoundError: If the file doesn't exist + PermissionError: If the file can't be read + OSError: For other file system errors + """ + path_str = struct_dict.get("path") + if path_str is None: + raise ValueError("Missing 'path' field in struct") + + path = Path(path_str) + + try: + # TODO: replace with FileHasher implementation + # Read file content and compute hash + content = path.read_bytes() + hash_bytes = self._compute_content_hash(content) + return self._format_hash_string(hash_bytes, add_prefix) + + except FileNotFoundError: + raise FileNotFoundError(f"File not found: {path}") + except PermissionError: + raise PermissionError(f"Permission denied reading file: {path}") + except IsADirectoryError: + raise ValueError(f"Path is a directory, not a file: {path}") + except OSError as e: + raise OSError(f"Error reading file {path}: {e}") diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index c728338..b7be792 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -6,6 +6,188 @@ from typing import Any +from typing import TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +def normalize_to_large_types(arrow_type: "pa.DataType") -> "pa.DataType": + """ + Recursively convert Arrow types to their large variants where available. + + This ensures consistent schema representation regardless of the original + type choices (e.g., string vs large_string, binary vs large_binary). + + Args: + arrow_type: Arrow data type to normalize + + Returns: + Arrow data type with large variants substituted + + Examples: + >>> normalize_to_large_types(pa.string()) + large_string + + >>> normalize_to_large_types(pa.list_(pa.string())) + large_list + + >>> normalize_to_large_types(pa.struct([pa.field("name", pa.string())])) + struct + """ + # Handle primitive types that have large variants + if pa.types.is_string(arrow_type): + return pa.large_string() + elif pa.types.is_binary(arrow_type): + return pa.large_binary() + elif pa.types.is_list(arrow_type): + # Regular list -> large_list with normalized element type + element_type = normalize_to_large_types(arrow_type.value_type) + return pa.large_list(element_type) + + # Large variants and fixed-size lists stay as-is (already normalized or no large variant) + elif pa.types.is_large_string(arrow_type) or pa.types.is_large_binary(arrow_type): + return arrow_type + elif pa.types.is_large_list(arrow_type): + # Still need to normalize the element type + element_type = normalize_to_large_types(arrow_type.value_type) + return pa.large_list(element_type) + elif pa.types.is_fixed_size_list(arrow_type): + # Fixed-size lists don't have large variants, but normalize element type + element_type = normalize_to_large_types(arrow_type.value_type) + return pa.list_(element_type, arrow_type.list_size) + + # Handle struct types recursively + elif pa.types.is_struct(arrow_type): + normalized_fields = [] + for field in arrow_type: + normalized_field_type = normalize_to_large_types(field.type) + normalized_fields.append( + pa.field( + field.name, + normalized_field_type, + nullable=field.nullable, + metadata=field.metadata, + ) + ) + return pa.struct(normalized_fields) + + # Handle map types (key and value types) + 
elif pa.types.is_map(arrow_type): + normalized_key_type = normalize_to_large_types(arrow_type.key_type) + normalized_value_type = normalize_to_large_types(arrow_type.item_type) + return pa.map_(normalized_key_type, normalized_value_type) + + # Handle union types + elif pa.types.is_union(arrow_type): + # Union types contain multiple child types + normalized_child_types = [] + for i in range(arrow_type.num_fields): + child_field = arrow_type[i] + normalized_child_type = normalize_to_large_types(child_field.type) + normalized_child_types.append( + pa.field(child_field.name, normalized_child_type) + ) + + # Reconstruct union with normalized child types + if isinstance(arrow_type, pa.SparseUnionType): + return pa.sparse_union(normalized_child_types) + else: # dense union + return pa.dense_union(normalized_child_types) + + # Handle dictionary types + elif pa.types.is_dictionary(arrow_type): + # Normalize the value type (dictionary values), keep index type as-is + normalized_value_type = normalize_to_large_types(arrow_type.value_type) + return pa.dictionary(arrow_type.index_type, normalized_value_type) # type: ignore + + # All other types (int, float, bool, date, timestamp, etc.) don't have large variants + else: + return arrow_type + + +def normalize_schema_to_large_types(schema: "pa.Schema") -> "pa.Schema": + """ + Convert a schema to use large variants of data types. + + This normalizes schemas so that string -> large_string, binary -> large_binary, + list -> large_list, etc., handling nested structures recursively. + + Args: + schema: Arrow schema to normalize + + Returns: + New schema with large type variants, or same schema if no changes needed + + Examples: + >>> schema = pa.schema([ + ... pa.field("name", pa.string()), + ... pa.field("files", pa.list_(pa.string())), + ... ]) + >>> normalize_schema_to_large_types(schema) + name: large_string + files: large_list + """ + normalized_fields = [] + schema_changed = False + + for field in schema: + normalized_type = normalize_to_large_types(field.type) + + # Check if the type actually changed + if normalized_type != field.type: + schema_changed = True + + normalized_field = pa.field( + field.name, + normalized_type, + nullable=field.nullable, + metadata=field.metadata, + ) + normalized_fields.append(normalized_field) + + # Only create new schema if something actually changed + if schema_changed: + return pa.schema(normalized_fields, metadata=schema.metadata) # type: ignore + else: + return schema + + +def normalize_table_to_large_types(table: "pa.Table") -> "pa.Table": + """ + Normalize table schema to use large type variants. + + Uses cast() which should be zero-copy for large variant conversions + since they have identical binary representations, but ensures proper + type validation and handles any edge cases safely. 
+ + Args: + table: Arrow table to normalize + + Returns: + Table with normalized schema, or same table if no changes needed + + Examples: + >>> table = pa.table({"name": ["Alice", "Bob"], "age": [25, 30]}) + >>> normalized = normalize_table_to_large_types(table) + >>> normalized.schema + name: large_string + age: int64 + """ + normalized_schema = normalize_schema_to_large_types(table.schema) + + # If schema didn't change, return original table + if normalized_schema is table.schema: + return table + + # Use cast() for safety - should be zero-copy for large variant conversions + # but handles Arrow's internal type validation and any edge cases properly + return table.cast(normalized_schema) + + def pylist_to_pydict(pylist: list[dict]) -> dict: """ Convert a list of dictionaries to a dictionary of lists (columnar format). @@ -404,3 +586,87 @@ def drop_schema_columns(schema: pa.Schema, columns: Collection[str]) -> pa.Schem pa.Schema: New schema with specified columns removed. """ return pa.schema([field for field in schema if field.name not in columns]) + + +# Test function to demonstrate usage +def test_schema_normalization(): + """Test the schema normalization functions.""" + print("=== Testing Arrow Schema Normalization ===\n") + + # Test basic types + print("1. Basic type normalization:") + basic_types = [ + pa.string(), + pa.binary(), + pa.list_(pa.string()), + pa.large_string(), # Should stay the same + pa.int64(), # Should stay the same + ] + + for arrow_type in basic_types: + normalized = normalize_to_large_types(arrow_type) + print(f" {arrow_type} -> {normalized}") + + print("\n2. Complex nested type normalization:") + complex_types = [ + pa.struct( + [pa.field("name", pa.string()), pa.field("files", pa.list_(pa.binary()))] + ), + pa.map_(pa.string(), pa.list_(pa.string())), + pa.large_list( + pa.struct([pa.field("id", pa.int64()), pa.field("path", pa.string())]) + ), + ] + + for arrow_type in complex_types: + normalized = normalize_to_large_types(arrow_type) + print(f" {arrow_type}") + print(f" -> {normalized}") + print() + + print("3. Schema normalization:") + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("name", pa.string()), + pa.field("files", pa.list_(pa.string())), + pa.field( + "metadata", + pa.struct( + [pa.field("created", pa.string()), pa.field("size", pa.int64())] + ), + ), + ] + ) + + print("Original schema:") + print(schema) + + normalized_schema = normalize_schema_to_large_types(schema) + print("\nNormalized schema:") + print(normalized_schema) + + print("\n4. 
Table normalization:") + table_data = { + "name": ["Alice", "Bob", "Charlie"], + "files": [ + ["file1.txt", "file2.txt"], + ["data.csv"], + ["config.json", "output.log"], + ], + } + + table = pa.table(table_data) + print("Original table schema:") + print(table.schema) + + normalized_table = normalize_table_to_large_types(table) + print("\nNormalized table schema:") + print(normalized_table.schema) + + # Verify data is preserved + print(f"\nData preserved: {table.to_pydict() == normalized_table.to_pydict()}") + + +if __name__ == "__main__": + test_schema_normalization() diff --git a/src/orcapod/utils/object_spec.py b/src/orcapod/utils/object_spec.py index 453204f..8ecfd0a 100644 --- a/src/orcapod/utils/object_spec.py +++ b/src/orcapod/utils/object_spec.py @@ -1,26 +1,43 @@ import importlib from typing import Any +from weakref import ref -def parse_objectspec(obj_spec: Any, validate: bool = True) -> Any: +def parse_objectspec( + obj_spec: Any, + ref_lut: dict[str, Any] | None = None, + validate: bool = True, +) -> Any: """Enhanced ObjectSpec with better error handling and validation.""" + if ref_lut is None: + ref_lut = {} if isinstance(obj_spec, dict): if "_class" in obj_spec: - return _create_instance_from_spec(obj_spec, validate) + return _create_instance_from_spec(obj_spec, ref_lut, validate) + elif "_ref" in obj_spec: + ref_key = obj_spec["_ref"] + if ref_key in ref_lut: + return ref_lut[ref_key] + else: + raise ValueError(f"Unknown reference: {ref_key}") else: # Recursively process dict - return {k: parse_objectspec(v, validate) for k, v in obj_spec.items()} + return { + k: parse_objectspec(v, ref_lut, validate) for k, v in obj_spec.items() + } elif isinstance(obj_spec, (list, tuple)): - processed = [parse_objectspec(item, validate) for item in obj_spec] + processed = [parse_objectspec(item, ref_lut, validate) for item in obj_spec] return tuple(processed) if isinstance(obj_spec, tuple) else processed else: return obj_spec -def _create_instance_from_spec(spec: dict[str, Any], validate: bool) -> Any: +def _create_instance_from_spec( + spec: dict[str, Any], ref_lut: dict[str, Any], validate: bool +) -> Any: """Create instance with better error handling.""" try: class_path = spec["_class"] @@ -32,7 +49,7 @@ def _create_instance_from_spec(spec: dict[str, Any], validate: bool) -> Any: cls = getattr(module, class_name) # Process config recursively - processed_config = parse_objectspec(config, validate) + processed_config = parse_objectspec(config, ref_lut, validate) # Optional: validate config matches class signature if validate: diff --git a/uv.lock b/uv.lock index f525a8b..43629fc 100644 --- a/uv.lock +++ b/uv.lock @@ -2,10 +2,46 @@ version = 1 revision = 2 requires-python = ">=3.12.0" resolution-markers = [ - "python_full_version >= '3.13'", + "python_full_version >= '3.14'", + "python_full_version == '3.13.*'", "python_full_version < '3.13'", ] +[[package]] +name = "adlfs" +version = "2024.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "azure-core" }, + { name = "azure-datalake-store" }, + { name = "azure-identity" }, + { name = "azure-storage-blob" }, + { name = "fsspec" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/76/82/e30891af574fb358449fb9436aac53569814452cb88b0cba4f488171b8dc/adlfs-2024.12.0.tar.gz", hash = "sha256:04582bf7461a57365766d01a295a0a88b2b8c42c4fea06e2d673f62675cac5c6", size = 49189, upload-time = "2024-12-15T19:06:30.939Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/cb/ed/d1bf75c089857d38332cf45416e419b47382b345ba5dfc4fae69397830d9/adlfs-2024.12.0-py3-none-any.whl", hash = "sha256:00aab061ddec0413b2039487e656b62e01ece8ef1ca0493f76034a596cf069e3", size = 41792, upload-time = "2024-12-15T19:06:27.718Z" }, +] + +[[package]] +name = "aiobotocore" +version = "2.24.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "aioitertools" }, + { name = "botocore" }, + { name = "jmespath" }, + { name = "multidict" }, + { name = "python-dateutil" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b2/ca/ac82c0c699815b6d5b4017f3d8fb2c2d49537f4937f4a0bdf58b4c75d321/aiobotocore-2.24.0.tar.gz", hash = "sha256:b32c0c45d38c22a18ce395a0b5448606c5260603296a152895b5bdb40ab3139d", size = 119597, upload-time = "2025-08-08T18:26:50.373Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/68/b29577197aa2e54b50d6f214524790cc1cb27d289585ad7c7bdfe5125285/aiobotocore-2.24.0-py3-none-any.whl", hash = "sha256:72bb1f8eb1b962779a95e1bcc9cf35bc33196ad763b622a40ae7fa9d2e95c87c", size = 84971, upload-time = "2025-08-08T18:26:48.777Z" }, +] + [[package]] name = "aiohappyeyeballs" version = "2.6.1" @@ -90,6 +126,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/99/84ba7273339d0f3dfa57901b846489d2e5c2cd731470167757f1935fffbd/aiohttp_retry-2.9.1-py3-none-any.whl", hash = "sha256:66d2759d1921838256a05a3f80ad7e724936f083e35be5abb5e16eed6be6dc54", size = 9981, upload-time = "2024-11-06T10:44:52.917Z" }, ] +[[package]] +name = "aioitertools" +version = "0.12.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/06/de/38491a84ab323b47c7f86e94d2830e748780525f7a10c8600b67ead7e9ea/aioitertools-0.12.0.tar.gz", hash = "sha256:c2a9055b4fbb7705f561b9d86053e8af5d10cc845d22c32008c43490b2d8dd6b", size = 19369, upload-time = "2024-09-02T03:33:40.349Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/13/58b70a580de00893223d61de8fea167877a3aed97d4a5e1405c9159ef925/aioitertools-0.12.0-py3-none-any.whl", hash = "sha256:fc1f5fac3d737354de8831cbba3eb04f79dd649d8f3afb4c5b114925e662a796", size = 24345, upload-time = "2024-09-02T03:34:59.454Z" }, +] + [[package]] name = "aiosignal" version = "1.4.0" @@ -121,6 +166,78 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, ] +[[package]] +name = "argon2-cffi" +version = "25.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "argon2-cffi-bindings", version = "21.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" }, + { name = "argon2-cffi-bindings", version = "25.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/89/ce5af8a7d472a67cc819d5d998aa8c82c5d860608c4db9f46f1162d7dab9/argon2_cffi-25.1.0.tar.gz", hash = "sha256:694ae5cc8a42f4c4e2bf2ca0e64e51e23a040c6a517a85074683d3959e1346c1", size = 45706, upload-time = "2025-06-03T06:55:32.073Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/4f/d3/a8b22fa575b297cd6e3e3b0155c7e25db170edf1c74783d6a31a2490b8d9/argon2_cffi-25.1.0-py3-none-any.whl", hash = "sha256:fdc8b074db390fccb6eb4a3604ae7231f219aa669a2652e0f20e16ba513d5741", size = 14657, upload-time = "2025-06-03T06:55:30.804Z" }, +] + +[[package]] +name = "argon2-cffi-bindings" +version = "21.2.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14'", +] +dependencies = [ + { name = "cffi", marker = "python_full_version >= '3.14'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/e9/184b8ccce6683b0aa2fbb7ba5683ea4b9c5763f1356347f1312c32e3c66e/argon2-cffi-bindings-21.2.0.tar.gz", hash = "sha256:bb89ceffa6c791807d1305ceb77dbfacc5aa499891d2c55661c6459651fc39e3", size = 1779911, upload-time = "2021-12-01T08:52:55.68Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/13/838ce2620025e9666aa8f686431f67a29052241692a3dd1ae9d3692a89d3/argon2_cffi_bindings-21.2.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ccb949252cb2ab3a08c02024acb77cfb179492d5701c7cbdbfd776124d4d2367", size = 29658, upload-time = "2021-12-01T09:09:17.016Z" }, + { url = "https://files.pythonhosted.org/packages/b3/02/f7f7bb6b6af6031edb11037639c697b912e1dea2db94d436e681aea2f495/argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9524464572e12979364b7d600abf96181d3541da11e23ddf565a32e70bd4dc0d", size = 80583, upload-time = "2021-12-01T09:09:19.546Z" }, + { url = "https://files.pythonhosted.org/packages/ec/f7/378254e6dd7ae6f31fe40c8649eea7d4832a42243acaf0f1fff9083b2bed/argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b746dba803a79238e925d9046a63aa26bf86ab2a2fe74ce6b009a1c3f5c8f2ae", size = 86168, upload-time = "2021-12-01T09:09:21.445Z" }, + { url = "https://files.pythonhosted.org/packages/74/f6/4a34a37a98311ed73bb80efe422fed95f2ac25a4cacc5ae1d7ae6a144505/argon2_cffi_bindings-21.2.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58ed19212051f49a523abb1dbe954337dc82d947fb6e5a0da60f7c8471a8476c", size = 82709, upload-time = "2021-12-01T09:09:18.182Z" }, + { url = "https://files.pythonhosted.org/packages/74/2b/73d767bfdaab25484f7e7901379d5f8793cccbb86c6e0cbc4c1b96f63896/argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:bd46088725ef7f58b5a1ef7ca06647ebaf0eb4baff7d1d0d177c6cc8744abd86", size = 83613, upload-time = "2021-12-01T09:09:22.741Z" }, + { url = "https://files.pythonhosted.org/packages/4f/fd/37f86deef67ff57c76f137a67181949c2d408077e2e3dd70c6c42912c9bf/argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_i686.whl", hash = "sha256:8cd69c07dd875537a824deec19f978e0f2078fdda07fd5c42ac29668dda5f40f", size = 84583, upload-time = "2021-12-01T09:09:24.177Z" }, + { url = "https://files.pythonhosted.org/packages/6f/52/5a60085a3dae8fded8327a4f564223029f5f54b0cb0455a31131b5363a01/argon2_cffi_bindings-21.2.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e", size = 88475, upload-time = "2021-12-01T09:09:26.673Z" }, + { url = "https://files.pythonhosted.org/packages/8b/95/143cd64feb24a15fa4b189a3e1e7efbaeeb00f39a51e99b26fc62fbacabd/argon2_cffi_bindings-21.2.0-cp36-abi3-win32.whl", hash = "sha256:603ca0aba86b1349b147cab91ae970c63118a0f30444d4bc80355937c950c082", size = 27698, upload-time = "2021-12-01T09:09:27.87Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/2c/e34e47c7dee97ba6f01a6203e0383e15b60fb85d78ac9a15cd066f6fe28b/argon2_cffi_bindings-21.2.0-cp36-abi3-win_amd64.whl", hash = "sha256:b2ef1c30440dbbcba7a5dc3e319408b59676e2e039e2ae11a8775ecf482b192f", size = 30817, upload-time = "2021-12-01T09:09:30.267Z" }, + { url = "https://files.pythonhosted.org/packages/5a/e4/bf8034d25edaa495da3c8a3405627d2e35758e44ff6eaa7948092646fdcc/argon2_cffi_bindings-21.2.0-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e415e3f62c8d124ee16018e491a009937f8cf7ebf5eb430ffc5de21b900dad93", size = 53104, upload-time = "2021-12-01T09:09:31.335Z" }, +] + +[[package]] +name = "argon2-cffi-bindings" +version = "25.1.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version == '3.13.*'", + "python_full_version < '3.13'", +] +dependencies = [ + { name = "cffi", marker = "python_full_version < '3.14'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5c/2d/db8af0df73c1cf454f71b2bbe5e356b8c1f8041c979f505b3d3186e520a9/argon2_cffi_bindings-25.1.0.tar.gz", hash = "sha256:b957f3e6ea4d55d820e40ff76f450952807013d361a65d7f28acc0acbf29229d", size = 1783441, upload-time = "2025-07-30T10:02:05.147Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/97/3c0a35f46e52108d4707c44b95cfe2afcafc50800b5450c197454569b776/argon2_cffi_bindings-25.1.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:3d3f05610594151994ca9ccb3c771115bdb4daef161976a266f0dd8aa9996b8f", size = 54393, upload-time = "2025-07-30T10:01:40.97Z" }, + { url = "https://files.pythonhosted.org/packages/9d/f4/98bbd6ee89febd4f212696f13c03ca302b8552e7dbf9c8efa11ea4a388c3/argon2_cffi_bindings-25.1.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8b8efee945193e667a396cbc7b4fb7d357297d6234d30a489905d96caabde56b", size = 29328, upload-time = "2025-07-30T10:01:41.916Z" }, + { url = "https://files.pythonhosted.org/packages/43/24/90a01c0ef12ac91a6be05969f29944643bc1e5e461155ae6559befa8f00b/argon2_cffi_bindings-25.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3c6702abc36bf3ccba3f802b799505def420a1b7039862014a65db3205967f5a", size = 31269, upload-time = "2025-07-30T10:01:42.716Z" }, + { url = "https://files.pythonhosted.org/packages/d4/d3/942aa10782b2697eee7af5e12eeff5ebb325ccfb86dd8abda54174e377e4/argon2_cffi_bindings-25.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1c70058c6ab1e352304ac7e3b52554daadacd8d453c1752e547c76e9c99ac44", size = 86558, upload-time = "2025-07-30T10:01:43.943Z" }, + { url = "https://files.pythonhosted.org/packages/0d/82/b484f702fec5536e71836fc2dbc8c5267b3f6e78d2d539b4eaa6f0db8bf8/argon2_cffi_bindings-25.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e2fd3bfbff3c5d74fef31a722f729bf93500910db650c925c2d6ef879a7e51cb", size = 92364, upload-time = "2025-07-30T10:01:44.887Z" }, + { url = "https://files.pythonhosted.org/packages/c9/c1/a606ff83b3f1735f3759ad0f2cd9e038a0ad11a3de3b6c673aa41c24bb7b/argon2_cffi_bindings-25.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c4f9665de60b1b0e99bcd6be4f17d90339698ce954cfd8d9cf4f91c995165a92", size = 85637, upload-time = "2025-07-30T10:01:46.225Z" }, + { url = "https://files.pythonhosted.org/packages/44/b4/678503f12aceb0262f84fa201f6027ed77d71c5019ae03b399b97caa2f19/argon2_cffi_bindings-25.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ba92837e4a9aa6a508c8d2d7883ed5a8f6c308c89a4790e1e447a220deb79a85", size = 91934, upload-time = 
"2025-07-30T10:01:47.203Z" }, + { url = "https://files.pythonhosted.org/packages/f0/c7/f36bd08ef9bd9f0a9cff9428406651f5937ce27b6c5b07b92d41f91ae541/argon2_cffi_bindings-25.1.0-cp314-cp314t-win32.whl", hash = "sha256:84a461d4d84ae1295871329b346a97f68eade8c53b6ed9a7ca2d7467f3c8ff6f", size = 28158, upload-time = "2025-07-30T10:01:48.341Z" }, + { url = "https://files.pythonhosted.org/packages/b3/80/0106a7448abb24a2c467bf7d527fe5413b7fdfa4ad6d6a96a43a62ef3988/argon2_cffi_bindings-25.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b55aec3565b65f56455eebc9b9f34130440404f27fe21c3b375bf1ea4d8fbae6", size = 32597, upload-time = "2025-07-30T10:01:49.112Z" }, + { url = "https://files.pythonhosted.org/packages/05/b8/d663c9caea07e9180b2cb662772865230715cbd573ba3b5e81793d580316/argon2_cffi_bindings-25.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:87c33a52407e4c41f3b70a9c2d3f6056d88b10dad7695be708c5021673f55623", size = 28231, upload-time = "2025-07-30T10:01:49.92Z" }, + { url = "https://files.pythonhosted.org/packages/1d/57/96b8b9f93166147826da5f90376e784a10582dd39a393c99bb62cfcf52f0/argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:aecba1723ae35330a008418a91ea6cfcedf6d31e5fbaa056a166462ff066d500", size = 54121, upload-time = "2025-07-30T10:01:50.815Z" }, + { url = "https://files.pythonhosted.org/packages/0a/08/a9bebdb2e0e602dde230bdde8021b29f71f7841bd54801bcfd514acb5dcf/argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2630b6240b495dfab90aebe159ff784d08ea999aa4b0d17efa734055a07d2f44", size = 29177, upload-time = "2025-07-30T10:01:51.681Z" }, + { url = "https://files.pythonhosted.org/packages/b6/02/d297943bcacf05e4f2a94ab6f462831dc20158614e5d067c35d4e63b9acb/argon2_cffi_bindings-25.1.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:7aef0c91e2c0fbca6fc68e7555aa60ef7008a739cbe045541e438373bc54d2b0", size = 31090, upload-time = "2025-07-30T10:01:53.184Z" }, + { url = "https://files.pythonhosted.org/packages/c1/93/44365f3d75053e53893ec6d733e4a5e3147502663554b4d864587c7828a7/argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e021e87faa76ae0d413b619fe2b65ab9a037f24c60a1e6cc43457ae20de6dc6", size = 81246, upload-time = "2025-07-30T10:01:54.145Z" }, + { url = "https://files.pythonhosted.org/packages/09/52/94108adfdd6e2ddf58be64f959a0b9c7d4ef2fa71086c38356d22dc501ea/argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d3e924cfc503018a714f94a49a149fdc0b644eaead5d1f089330399134fa028a", size = 87126, upload-time = "2025-07-30T10:01:55.074Z" }, + { url = "https://files.pythonhosted.org/packages/72/70/7a2993a12b0ffa2a9271259b79cc616e2389ed1a4d93842fac5a1f923ffd/argon2_cffi_bindings-25.1.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:c87b72589133f0346a1cb8d5ecca4b933e3c9b64656c9d175270a000e73b288d", size = 80343, upload-time = "2025-07-30T10:01:56.007Z" }, + { url = "https://files.pythonhosted.org/packages/78/9a/4e5157d893ffc712b74dbd868c7f62365618266982b64accab26bab01edc/argon2_cffi_bindings-25.1.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1db89609c06afa1a214a69a462ea741cf735b29a57530478c06eb81dd403de99", size = 86777, upload-time = "2025-07-30T10:01:56.943Z" }, + { url = "https://files.pythonhosted.org/packages/74/cd/15777dfde1c29d96de7f18edf4cc94c385646852e7c7b0320aa91ccca583/argon2_cffi_bindings-25.1.0-cp39-abi3-win32.whl", hash = "sha256:473bcb5f82924b1becbb637b63303ec8d10e84c8d241119419897a26116515d2", size = 27180, upload-time = 
"2025-07-30T10:01:57.759Z" }, + { url = "https://files.pythonhosted.org/packages/e2/c6/a759ece8f1829d1f162261226fbfd2c6832b3ff7657384045286d2afa384/argon2_cffi_bindings-25.1.0-cp39-abi3-win_amd64.whl", hash = "sha256:a98cd7d17e9f7ce244c0803cad3c23a7d379c301ba618a5fa76a67d116618b98", size = 31715, upload-time = "2025-07-30T10:01:58.56Z" }, + { url = "https://files.pythonhosted.org/packages/42/b9/f8d6fa329ab25128b7e98fd83a3cb34d9db5b059a9847eddb840a0af45dd/argon2_cffi_bindings-25.1.0-cp39-abi3-win_arm64.whl", hash = "sha256:b0fdbcf513833809c882823f98dc2f931cf659d9a1429616ac3adebb49f5db94", size = 27149, upload-time = "2025-07-30T10:01:59.329Z" }, +] + [[package]] name = "arro3-core" version = "0.5.1" @@ -172,6 +289,65 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3", size = 63815, upload-time = "2025-03-13T11:10:21.14Z" }, ] +[[package]] +name = "azure-core" +version = "1.35.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests" }, + { name = "six" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ce/89/f53968635b1b2e53e4aad2dd641488929fef4ca9dfb0b97927fa7697ddf3/azure_core-1.35.0.tar.gz", hash = "sha256:c0be528489485e9ede59b6971eb63c1eaacf83ef53001bfe3904e475e972be5c", size = 339689, upload-time = "2025-07-03T00:55:23.496Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/78/bf94897361fdd650850f0f2e405b2293e2f12808239046232bdedf554301/azure_core-1.35.0-py3-none-any.whl", hash = "sha256:8db78c72868a58f3de8991eb4d22c4d368fae226dac1002998d6c50437e7dad1", size = 210708, upload-time = "2025-07-03T00:55:25.238Z" }, +] + +[[package]] +name = "azure-datalake-store" +version = "0.0.53" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, + { name = "msal" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/ff/61369d06422b5ac48067215ff404841342651b14a89b46c8d8e1507c8f17/azure-datalake-store-0.0.53.tar.gz", hash = "sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393", size = 71430, upload-time = "2023-05-10T21:17:05.665Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/2a/75f56b14f115189155cf12e46b366ad1fe3357af5a1a7c09f7446662d617/azure_datalake_store-0.0.53-py2.py3-none-any.whl", hash = "sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b", size = 55308, upload-time = "2023-05-10T21:17:02.629Z" }, +] + +[[package]] +name = "azure-identity" +version = "1.24.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "azure-core" }, + { name = "cryptography" }, + { name = "msal" }, + { name = "msal-extensions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/44/f3ee20bacb220b6b4a2b0a6cf7e742eecb383a5ccf604dd79ec27c286b7e/azure_identity-1.24.0.tar.gz", hash = "sha256:6c3a40b2a70af831e920b89e6421e8dcd4af78a0cb38b9642d86c67643d4930c", size = 271630, upload-time = "2025-08-07T22:27:36.258Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/74/17428cb429e8d52f6d0d69ed685f4760a545cb0156594963a9337b53b6c9/azure_identity-1.24.0-py3-none-any.whl", hash = "sha256:9e04997cde0ab02ed66422c74748548e620b7b29361c72ce622acab0267ff7c4", size = 187890, upload-time = "2025-08-07T22:27:38.033Z" }, +] + 
+[[package]] +name = "azure-storage-blob" +version = "12.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "azure-core" }, + { name = "cryptography" }, + { name = "isodate" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/96/95/3e3414491ce45025a1cde107b6ae72bf72049e6021597c201cd6a3029b9a/azure_storage_blob-12.26.0.tar.gz", hash = "sha256:5dd7d7824224f7de00bfeb032753601c982655173061e242f13be6e26d78d71f", size = 583332, upload-time = "2025-07-16T21:34:07.644Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/64/63dbfdd83b31200ac58820a7951ddfdeed1fbee9285b0f3eae12d1357155/azure_storage_blob-12.26.0-py3-none-any.whl", hash = "sha256:8c5631b8b22b4f53ec5fff2f3bededf34cfef111e2af613ad42c9e6de00a77fe", size = 412907, upload-time = "2025-07-16T21:34:09.367Z" }, +] + [[package]] name = "beartype" version = "0.21.0" @@ -181,6 +357,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl", hash = "sha256:b6a1bd56c72f31b0a496a36cc55df6e2f475db166ad07fa4acc7e74f4c7f34c0", size = 1191340, upload-time = "2025-05-22T05:09:24.606Z" }, ] +[[package]] +name = "botocore" +version = "1.39.11" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/d0/9d64261186cff650fe63168441edb4f4cd33f085a74c0c54455630a71f91/botocore-1.39.11.tar.gz", hash = "sha256:953b12909d6799350e346ab038e55b6efe622c616f80aef74d7a6683ffdd972c", size = 14217749, upload-time = "2025-07-22T19:26:40.723Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1c/2c/8a0b02d60a1dbbae7faa5af30484b016aa3023f9833dfc0d19b0b770dd6a/botocore-1.39.11-py3-none-any.whl", hash = "sha256:1545352931a8a186f3e977b1e1a4542d7d434796e274c3c62efd0210b5ea76dc", size = 13876276, upload-time = "2025-07-22T19:26:35.164Z" }, +] + [[package]] name = "cachetools" version = "5.5.2" @@ -395,6 +585,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/1a/0b9c32220ad694d66062f571cc5cedfa9997b64a591e8a500bb63de1bd40/coverage-7.8.2-py3-none-any.whl", hash = "sha256:726f32ee3713f7359696331a18daf0c3b3a70bb0ae71141b9d3c52be7c595e32", size = 203623, upload-time = "2025-05-23T11:39:53.846Z" }, ] +[[package]] +name = "cryptography" +version = "45.0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/0d/d13399c94234ee8f3df384819dc67e0c5ce215fb751d567a55a1f4b028c7/cryptography-45.0.6.tar.gz", hash = "sha256:5c966c732cf6e4a276ce83b6e4c729edda2df6929083a952cc7da973c539c719", size = 744949, upload-time = "2025-08-05T23:59:27.93Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8c/29/2793d178d0eda1ca4a09a7c4e09a5185e75738cc6d526433e8663b460ea6/cryptography-45.0.6-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:048e7ad9e08cf4c0ab07ff7f36cc3115924e22e2266e034450a890d9e312dd74", size = 7042702, upload-time = "2025-08-05T23:58:23.464Z" }, + { url = "https://files.pythonhosted.org/packages/b3/b6/cabd07410f222f32c8d55486c464f432808abaa1f12af9afcbe8f2f19030/cryptography-45.0.6-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:44647c5d796f5fc042bbc6d61307d04bf29bccb74d188f18051b635f20a9c75f", size = 
4206483, upload-time = "2025-08-05T23:58:27.132Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9e/f9c7d36a38b1cfeb1cc74849aabe9bf817990f7603ff6eb485e0d70e0b27/cryptography-45.0.6-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e40b80ecf35ec265c452eea0ba94c9587ca763e739b8e559c128d23bff7ebbbf", size = 4429679, upload-time = "2025-08-05T23:58:29.152Z" }, + { url = "https://files.pythonhosted.org/packages/9c/2a/4434c17eb32ef30b254b9e8b9830cee4e516f08b47fdd291c5b1255b8101/cryptography-45.0.6-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:00e8724bdad672d75e6f069b27970883179bd472cd24a63f6e620ca7e41cc0c5", size = 4210553, upload-time = "2025-08-05T23:58:30.596Z" }, + { url = "https://files.pythonhosted.org/packages/ef/1d/09a5df8e0c4b7970f5d1f3aff1b640df6d4be28a64cae970d56c6cf1c772/cryptography-45.0.6-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7a3085d1b319d35296176af31c90338eeb2ddac8104661df79f80e1d9787b8b2", size = 3894499, upload-time = "2025-08-05T23:58:32.03Z" }, + { url = "https://files.pythonhosted.org/packages/79/62/120842ab20d9150a9d3a6bdc07fe2870384e82f5266d41c53b08a3a96b34/cryptography-45.0.6-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1b7fa6a1c1188c7ee32e47590d16a5a0646270921f8020efc9a511648e1b2e08", size = 4458484, upload-time = "2025-08-05T23:58:33.526Z" }, + { url = "https://files.pythonhosted.org/packages/fd/80/1bc3634d45ddfed0871bfba52cf8f1ad724761662a0c792b97a951fb1b30/cryptography-45.0.6-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:275ba5cc0d9e320cd70f8e7b96d9e59903c815ca579ab96c1e37278d231fc402", size = 4210281, upload-time = "2025-08-05T23:58:35.445Z" }, + { url = "https://files.pythonhosted.org/packages/7d/fe/ffb12c2d83d0ee625f124880a1f023b5878f79da92e64c37962bbbe35f3f/cryptography-45.0.6-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:f4028f29a9f38a2025abedb2e409973709c660d44319c61762202206ed577c42", size = 4456890, upload-time = "2025-08-05T23:58:36.923Z" }, + { url = "https://files.pythonhosted.org/packages/8c/8e/b3f3fe0dc82c77a0deb5f493b23311e09193f2268b77196ec0f7a36e3f3e/cryptography-45.0.6-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ee411a1b977f40bd075392c80c10b58025ee5c6b47a822a33c1198598a7a5f05", size = 4333247, upload-time = "2025-08-05T23:58:38.781Z" }, + { url = "https://files.pythonhosted.org/packages/b3/a6/c3ef2ab9e334da27a1d7b56af4a2417d77e7806b2e0f90d6267ce120d2e4/cryptography-45.0.6-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e2a21a8eda2d86bb604934b6b37691585bd095c1f788530c1fcefc53a82b3453", size = 4565045, upload-time = "2025-08-05T23:58:40.415Z" }, + { url = "https://files.pythonhosted.org/packages/31/c3/77722446b13fa71dddd820a5faab4ce6db49e7e0bf8312ef4192a3f78e2f/cryptography-45.0.6-cp311-abi3-win32.whl", hash = "sha256:d063341378d7ee9c91f9d23b431a3502fc8bfacd54ef0a27baa72a0843b29159", size = 2928923, upload-time = "2025-08-05T23:58:41.919Z" }, + { url = "https://files.pythonhosted.org/packages/38/63/a025c3225188a811b82932a4dcc8457a26c3729d81578ccecbcce2cb784e/cryptography-45.0.6-cp311-abi3-win_amd64.whl", hash = "sha256:833dc32dfc1e39b7376a87b9a6a4288a10aae234631268486558920029b086ec", size = 3403805, upload-time = "2025-08-05T23:58:43.792Z" }, + { url = "https://files.pythonhosted.org/packages/5b/af/bcfbea93a30809f126d51c074ee0fac5bd9d57d068edf56c2a73abedbea4/cryptography-45.0.6-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:3436128a60a5e5490603ab2adbabc8763613f638513ffa7d311c900a8349a2a0", size = 7020111, upload-time = 
"2025-08-05T23:58:45.316Z" }, + { url = "https://files.pythonhosted.org/packages/98/c6/ea5173689e014f1a8470899cd5beeb358e22bb3cf5a876060f9d1ca78af4/cryptography-45.0.6-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0d9ef57b6768d9fa58e92f4947cea96ade1233c0e236db22ba44748ffedca394", size = 4198169, upload-time = "2025-08-05T23:58:47.121Z" }, + { url = "https://files.pythonhosted.org/packages/ba/73/b12995edc0c7e2311ffb57ebd3b351f6b268fed37d93bfc6f9856e01c473/cryptography-45.0.6-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea3c42f2016a5bbf71825537c2ad753f2870191134933196bee408aac397b3d9", size = 4421273, upload-time = "2025-08-05T23:58:48.557Z" }, + { url = "https://files.pythonhosted.org/packages/f7/6e/286894f6f71926bc0da67408c853dd9ba953f662dcb70993a59fd499f111/cryptography-45.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:20ae4906a13716139d6d762ceb3e0e7e110f7955f3bc3876e3a07f5daadec5f3", size = 4199211, upload-time = "2025-08-05T23:58:50.139Z" }, + { url = "https://files.pythonhosted.org/packages/de/34/a7f55e39b9623c5cb571d77a6a90387fe557908ffc44f6872f26ca8ae270/cryptography-45.0.6-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2dac5ec199038b8e131365e2324c03d20e97fe214af051d20c49db129844e8b3", size = 3883732, upload-time = "2025-08-05T23:58:52.253Z" }, + { url = "https://files.pythonhosted.org/packages/f9/b9/c6d32edbcba0cd9f5df90f29ed46a65c4631c4fbe11187feb9169c6ff506/cryptography-45.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:18f878a34b90d688982e43f4b700408b478102dd58b3e39de21b5ebf6509c301", size = 4450655, upload-time = "2025-08-05T23:58:53.848Z" }, + { url = "https://files.pythonhosted.org/packages/77/2d/09b097adfdee0227cfd4c699b3375a842080f065bab9014248933497c3f9/cryptography-45.0.6-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:5bd6020c80c5b2b2242d6c48487d7b85700f5e0038e67b29d706f98440d66eb5", size = 4198956, upload-time = "2025-08-05T23:58:55.209Z" }, + { url = "https://files.pythonhosted.org/packages/55/66/061ec6689207d54effdff535bbdf85cc380d32dd5377173085812565cf38/cryptography-45.0.6-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:eccddbd986e43014263eda489abbddfbc287af5cddfd690477993dbb31e31016", size = 4449859, upload-time = "2025-08-05T23:58:56.639Z" }, + { url = "https://files.pythonhosted.org/packages/41/ff/e7d5a2ad2d035e5a2af116e1a3adb4d8fcd0be92a18032917a089c6e5028/cryptography-45.0.6-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:550ae02148206beb722cfe4ef0933f9352bab26b087af00e48fdfb9ade35c5b3", size = 4320254, upload-time = "2025-08-05T23:58:58.833Z" }, + { url = "https://files.pythonhosted.org/packages/82/27/092d311af22095d288f4db89fcaebadfb2f28944f3d790a4cf51fe5ddaeb/cryptography-45.0.6-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5b64e668fc3528e77efa51ca70fadcd6610e8ab231e3e06ae2bab3b31c2b8ed9", size = 4554815, upload-time = "2025-08-05T23:59:00.283Z" }, + { url = "https://files.pythonhosted.org/packages/7e/01/aa2f4940262d588a8fdf4edabe4cda45854d00ebc6eaac12568b3a491a16/cryptography-45.0.6-cp37-abi3-win32.whl", hash = "sha256:780c40fb751c7d2b0c6786ceee6b6f871e86e8718a8ff4bc35073ac353c7cd02", size = 2912147, upload-time = "2025-08-05T23:59:01.716Z" }, + { url = "https://files.pythonhosted.org/packages/0a/bc/16e0276078c2de3ceef6b5a34b965f4436215efac45313df90d55f0ba2d2/cryptography-45.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:20d15aed3ee522faac1a39fbfdfee25d17b1284bafd808e1640a74846d7c4d1b", size = 3390459, upload-time = 
"2025-08-05T23:59:03.358Z" }, +] + [[package]] name = "cycler" version = "0.12.1" @@ -583,11 +808,29 @@ wheels = [ [[package]] name = "fsspec" -version = "2025.5.1" +version = "2025.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/00/f7/27f15d41f0ed38e8fcc488584b57e902b331da7f7c6dcda53721b15838fc/fsspec-2025.5.1.tar.gz", hash = "sha256:2e55e47a540b91843b755e83ded97c6e897fa0942b11490113f09e9c443c2475", size = 303033, upload-time = "2025-05-24T12:03:23.792Z" } +sdist = { url = "https://files.pythonhosted.org/packages/8b/02/0835e6ab9cfc03916fe3f78c0956cfcdb6ff2669ffa6651065d5ebf7fc98/fsspec-2025.7.0.tar.gz", hash = "sha256:786120687ffa54b8283d942929540d8bc5ccfa820deb555a2b5d0ed2b737bf58", size = 304432, upload-time = "2025-07-15T16:05:21.19Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/61/78c7b3851add1481b048b5fdc29067397a1784e2910592bc81bb3f608635/fsspec-2025.5.1-py3-none-any.whl", hash = "sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462", size = 199052, upload-time = "2025-05-24T12:03:21.66Z" }, + { url = "https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl", hash = "sha256:8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21", size = 199597, upload-time = "2025-07-15T16:05:19.529Z" }, +] + +[[package]] +name = "gcsfs" +version = "2025.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "decorator" }, + { name = "fsspec" }, + { name = "google-auth" }, + { name = "google-auth-oauthlib" }, + { name = "google-cloud-storage" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/d7/5eafe9f09f1bb09433a473cef7984cd52c398592c8fd09974e0ad87cfea4/gcsfs-2025.7.0.tar.gz", hash = "sha256:ad3ff66cf189ae8fc375ac8a2af409003dbca02357621cb94a66e457e02ba420", size = 82659, upload-time = "2025-07-15T16:49:21.647Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/21/f5/54bccbee01efbc25581db6aafefb6f6c277d880930f7a083b10052382463/gcsfs-2025.7.0-py2.py3-none-any.whl", hash = "sha256:653503331d58cb02bb34e725d4595d166e93f7f2f3ff88e4c66ef535ae66eae5", size = 36815, upload-time = "2025-07-15T16:49:20.333Z" }, ] [[package]] @@ -620,6 +863,81 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" }, ] +[[package]] +name = "google-auth-oauthlib" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "requests-oauthlib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fb/87/e10bf24f7bcffc1421b84d6f9c3377c30ec305d082cd737ddaa6d8f77f7c/google_auth_oauthlib-1.2.2.tar.gz", hash = "sha256:11046fb8d3348b296302dd939ace8af0a724042e8029c1b872d87fabc9f41684", size = 20955, upload-time = "2025-04-22T16:40:29.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl", hash = "sha256:fd619506f4b3908b5df17b65f39ca8d66ea56986e5472eb5978fd8f3786f00a2", size = 19072, upload-time = "2025-04-22T16:40:28.174Z" }, +] + +[[package]] +name = "google-cloud-core" +version = 
"2.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "google-auth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/b8/2b53838d2acd6ec6168fd284a990c76695e84c65deee79c9f3a4276f6b4f/google_cloud_core-2.4.3.tar.gz", hash = "sha256:1fab62d7102844b278fe6dead3af32408b1df3eb06f5c7e8634cbd40edc4da53", size = 35861, upload-time = "2025-03-10T21:05:38.948Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl", hash = "sha256:5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e", size = 29348, upload-time = "2025-03-10T21:05:37.785Z" }, +] + +[[package]] +name = "google-cloud-storage" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "google-auth" }, + { name = "google-cloud-core" }, + { name = "google-crc32c" }, + { name = "google-resumable-media" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/5b/6d4627484248e018a926dde114c4034656570da9c1c438e3db061fa42de5/google_cloud_storage-3.2.0.tar.gz", hash = "sha256:decca843076036f45633198c125d1861ffbf47ebf5c0e3b98dcb9b2db155896c", size = 7669611, upload-time = "2025-07-07T05:14:06.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/48/823ce62cf29d04db6508971a0db13a72c1c9faf67cea2c206b1c9c9f1f02/google_cloud_storage-3.2.0-py3-none-any.whl", hash = "sha256:ff7a9a49666954a7c3d1598291220c72d3b9e49d9dfcf9dfaecb301fc4fb0b24", size = 176133, upload-time = "2025-07-07T05:14:05.059Z" }, +] + +[[package]] +name = "google-crc32c" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/ae/87802e6d9f9d69adfaedfcfd599266bf386a54d0be058b532d04c794f76d/google_crc32c-1.7.1.tar.gz", hash = "sha256:2bff2305f98846f3e825dbeec9ee406f89da7962accdb29356e4eadc251bd472", size = 14495, upload-time = "2025-03-26T14:29:13.32Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/b7/787e2453cf8639c94b3d06c9d61f512234a82e1d12d13d18584bd3049904/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2d73a68a653c57281401871dd4aeebbb6af3191dcac751a76ce430df4d403194", size = 30470, upload-time = "2025-03-26T14:34:31.655Z" }, + { url = "https://files.pythonhosted.org/packages/ed/b4/6042c2b0cbac3ec3a69bb4c49b28d2f517b7a0f4a0232603c42c58e22b44/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:22beacf83baaf59f9d3ab2bbb4db0fb018da8e5aebdce07ef9f09fce8220285e", size = 30315, upload-time = "2025-03-26T15:01:54.634Z" }, + { url = "https://files.pythonhosted.org/packages/29/ad/01e7a61a5d059bc57b702d9ff6a18b2585ad97f720bd0a0dbe215df1ab0e/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19eafa0e4af11b0a4eb3974483d55d2d77ad1911e6cf6f832e1574f6781fd337", size = 33180, upload-time = "2025-03-26T14:41:32.168Z" }, + { url = "https://files.pythonhosted.org/packages/3b/a5/7279055cf004561894ed3a7bfdf5bf90a53f28fadd01af7cd166e88ddf16/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d86616faaea68101195c6bdc40c494e4d76f41e07a37ffdef270879c15fb65", size = 32794, upload-time = "2025-03-26T14:41:33.264Z" }, + { url = 
"https://files.pythonhosted.org/packages/0f/d6/77060dbd140c624e42ae3ece3df53b9d811000729a5c821b9fd671ceaac6/google_crc32c-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:b7491bdc0c7564fcf48c0179d2048ab2f7c7ba36b84ccd3a3e1c3f7a72d3bba6", size = 33477, upload-time = "2025-03-26T14:29:10.94Z" }, + { url = "https://files.pythonhosted.org/packages/8b/72/b8d785e9184ba6297a8620c8a37cf6e39b81a8ca01bb0796d7cbb28b3386/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:df8b38bdaf1629d62d51be8bdd04888f37c451564c2042d36e5812da9eff3c35", size = 30467, upload-time = "2025-03-26T14:36:06.909Z" }, + { url = "https://files.pythonhosted.org/packages/34/25/5f18076968212067c4e8ea95bf3b69669f9fc698476e5f5eb97d5b37999f/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:e42e20a83a29aa2709a0cf271c7f8aefaa23b7ab52e53b322585297bb94d4638", size = 30309, upload-time = "2025-03-26T15:06:15.318Z" }, + { url = "https://files.pythonhosted.org/packages/92/83/9228fe65bf70e93e419f38bdf6c5ca5083fc6d32886ee79b450ceefd1dbd/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:905a385140bf492ac300026717af339790921f411c0dfd9aa5a9e69a08ed32eb", size = 33133, upload-time = "2025-03-26T14:41:34.388Z" }, + { url = "https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b211ddaf20f7ebeec5c333448582c224a7c90a9d98826fbab82c0ddc11348e6", size = 32773, upload-time = "2025-03-26T14:41:35.19Z" }, + { url = "https://files.pythonhosted.org/packages/89/32/a22a281806e3ef21b72db16f948cad22ec68e4bdd384139291e00ff82fe2/google_crc32c-1.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:0f99eaa09a9a7e642a61e06742856eec8b19fc0037832e03f941fe7cf0c8e4db", size = 33475, upload-time = "2025-03-26T14:29:11.771Z" }, + { url = "https://files.pythonhosted.org/packages/b8/c5/002975aff514e57fc084ba155697a049b3f9b52225ec3bc0f542871dd524/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32d1da0d74ec5634a05f53ef7df18fc646666a25efaaca9fc7dcfd4caf1d98c3", size = 33243, upload-time = "2025-03-26T14:41:35.975Z" }, + { url = "https://files.pythonhosted.org/packages/61/cb/c585282a03a0cea70fcaa1bf55d5d702d0f2351094d663ec3be1c6c67c52/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e10554d4abc5238823112c2ad7e4560f96c7bf3820b202660373d769d9e6e4c9", size = 32870, upload-time = "2025-03-26T14:41:37.08Z" }, +] + +[[package]] +name = "google-resumable-media" +version = "2.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-crc32c" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/58/5a/0efdc02665dca14e0837b62c8a1a93132c264bd02054a15abb2218afe0ae/google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0", size = 2163099, upload-time = "2024-08-07T22:20:38.555Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa", size = 81251, upload-time = "2024-08-07T22:20:36.409Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -784,6 +1102,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl", hash = "sha256:764f2602d25471c213919b8a1997df04bef869251db4ca8efba1b76b1bd9f7bb", size = 139806, upload-time = "2025-05-05T12:41:56.833Z" }, ] +[[package]] +name = "isodate" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705, upload-time = "2024-10-08T23:04:11.5Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, +] + [[package]] name = "jedi" version = "0.19.2" @@ -796,6 +1123,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, ] +[[package]] +name = "jmespath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843, upload-time = "2022-06-17T18:00:12.224Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, +] + [[package]] name = "jsonschema" version = "4.25.0" @@ -983,6 +1319,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "minio" +version = "7.2.16" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "argon2-cffi" }, + { name = "certifi" }, + { name = "pycryptodome" }, + { name = "typing-extensions" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/a0/33ea2e18d5169817950edc13eba58cd781cedefe9f6696cae26aa2d75882/minio-7.2.16.tar.gz", hash = "sha256:81e365c8494d591d8204a63ee7596bfdf8a7d06ad1b1507d6b9c1664a95f299a", size = 139149, upload-time = "2025-07-21T20:11:15.911Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/a3/00260f8df72b51afa1f182dd609533c77fa2407918c4c2813d87b4a56725/minio-7.2.16-py3-none-any.whl", hash = "sha256:9288ab988ca57c181eb59a4c96187b293131418e28c164392186c2b89026b223", size = 95750, upload-time = "2025-07-21T20:11:14.139Z" }, +] + [[package]] name = "mmh3" version = "5.1.0" @@ -1023,6 +1375,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/71/4ad9a42f2772793a03cb698f0fc42499f04e6e8d2560ba2f7da0fb059a8e/mmh3-5.1.0-cp313-cp313-win_arm64.whl", hash = 
"sha256:b22fe2e54be81f6c07dcb36b96fa250fb72effe08aa52fbb83eade6e1e2d5fd7", size = 38890, upload-time = "2025-01-25T08:39:25.28Z" }, ] +[[package]] +name = "msal" +version = "1.33.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, + { name = "pyjwt", extra = ["crypto"] }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/da/81acbe0c1fd7e9e4ec35f55dadeba9833a847b9a6ba2e2d1e4432da901dd/msal-1.33.0.tar.gz", hash = "sha256:836ad80faa3e25a7d71015c990ce61f704a87328b1e73bcbb0623a18cbf17510", size = 153801, upload-time = "2025-07-22T19:36:33.693Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/5b/fbc73e91f7727ae1e79b21ed833308e99dc11cc1cd3d4717f579775de5e9/msal-1.33.0-py3-none-any.whl", hash = "sha256:c0cd41cecf8eaed733ee7e3be9e040291eba53b0f262d3ae9c58f38b04244273", size = 116853, upload-time = "2025-07-22T19:36:32.403Z" }, +] + +[[package]] +name = "msal-extensions" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "msal" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/01/99/5d239b6156eddf761a636bded1118414d161bd6b7b37a9335549ed159396/msal_extensions-1.3.1.tar.gz", hash = "sha256:c5b0fd10f65ef62b5f1d62f4251d51cbcaf003fcedae8c91b040a488614be1a4", size = 23315, upload-time = "2025-03-14T23:51:03.902Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/75/bd9b7bb966668920f06b200e84454c8f3566b102183bc55c5473d96cb2b9/msal_extensions-1.3.1-py3-none-any.whl", hash = "sha256:96d3de4d034504e969ac5e85bae8106c8373b5c6568e4c8fa7af2eca9dbe6bca", size = 20583, upload-time = "2025-03-14T23:51:03.016Z" }, +] + [[package]] name = "msgpack" version = "1.1.1" @@ -1167,6 +1545,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, ] +[[package]] +name = "oauthlib" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, +] + [[package]] name = "opencensus" version = "0.11.4" @@ -1287,11 +1674,14 @@ redis = [ [package.dev-dependencies] dev = [ + { name = "adlfs" }, { name = "deltalake" }, + { name = "gcsfs" }, { name = "httpie" }, { name = "ipykernel" }, { name = "ipywidgets" }, { name = "jsonschema" }, + { name = "minio" }, { name = "pyarrow-stubs" }, { name = "pyiceberg" }, { name = "pytest" }, @@ -1299,6 +1689,7 @@ dev = [ { name = "ray", extra = ["default"] }, { name = "redis" }, { name = "ruff" }, + { name = "s3fs" }, { name = "tqdm" }, { name = "unitycatalog-client" }, ] @@ -1324,11 +1715,14 @@ provides-extras = ["redis", "ray", "all"] [package.metadata.requires-dev] dev = [ + { name = "adlfs", specifier = ">=2024.12.0" }, { name = "deltalake", specifier = ">=1.0.2" }, + 
{ name = "gcsfs", specifier = ">=2025.7.0" }, { name = "httpie", specifier = ">=3.2.4" }, { name = "ipykernel", specifier = ">=6.29.5" }, { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "jsonschema", specifier = ">=4.25.0" }, + { name = "minio", specifier = ">=7.2.16" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pytest", specifier = ">=8.3.5" }, @@ -1336,6 +1730,7 @@ dev = [ { name = "ray", extras = ["default"], specifier = "==2.48.0" }, { name = "redis", specifier = ">=6.2.0" }, { name = "ruff", specifier = ">=0.11.11" }, + { name = "s3fs", specifier = ">=2025.7.0" }, { name = "tqdm", specifier = ">=4.67.1" }, { name = "unitycatalog-client", specifier = ">=0.3.0" }, ] @@ -1715,6 +2110,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552, upload-time = "2024-03-30T13:22:20.476Z" }, ] +[[package]] +name = "pycryptodome" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/a6/8452177684d5e906854776276ddd34eca30d1b1e15aa1ee9cefc289a33f5/pycryptodome-3.23.0.tar.gz", hash = "sha256:447700a657182d60338bab09fdb27518f8856aecd80ae4c6bdddb67ff5da44ef", size = 4921276, upload-time = "2025-05-17T17:21:45.242Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/5d/bdb09489b63cd34a976cc9e2a8d938114f7a53a74d3dd4f125ffa49dce82/pycryptodome-3.23.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:0011f7f00cdb74879142011f95133274741778abba114ceca229adbf8e62c3e4", size = 2495152, upload-time = "2025-05-17T17:20:20.833Z" }, + { url = "https://files.pythonhosted.org/packages/a7/ce/7840250ed4cc0039c433cd41715536f926d6e86ce84e904068eb3244b6a6/pycryptodome-3.23.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:90460fc9e088ce095f9ee8356722d4f10f86e5be06e2354230a9880b9c549aae", size = 1639348, upload-time = "2025-05-17T17:20:23.171Z" }, + { url = "https://files.pythonhosted.org/packages/ee/f0/991da24c55c1f688d6a3b5a11940567353f74590734ee4a64294834ae472/pycryptodome-3.23.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4764e64b269fc83b00f682c47443c2e6e85b18273712b98aa43bcb77f8570477", size = 2184033, upload-time = "2025-05-17T17:20:25.424Z" }, + { url = "https://files.pythonhosted.org/packages/54/16/0e11882deddf00f68b68dd4e8e442ddc30641f31afeb2bc25588124ac8de/pycryptodome-3.23.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb8f24adb74984aa0e5d07a2368ad95276cf38051fe2dc6605cbcf482e04f2a7", size = 2270142, upload-time = "2025-05-17T17:20:27.808Z" }, + { url = "https://files.pythonhosted.org/packages/d5/fc/4347fea23a3f95ffb931f383ff28b3f7b1fe868739182cb76718c0da86a1/pycryptodome-3.23.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d97618c9c6684a97ef7637ba43bdf6663a2e2e77efe0f863cce97a76af396446", size = 2309384, upload-time = "2025-05-17T17:20:30.765Z" }, + { url = "https://files.pythonhosted.org/packages/6e/d9/c5261780b69ce66d8cfab25d2797bd6e82ba0241804694cd48be41add5eb/pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9a53a4fe5cb075075d515797d6ce2f56772ea7e6a1e5e4b96cf78a14bac3d265", size = 2183237, upload-time = "2025-05-17T17:20:33.736Z" }, + { url = 
"https://files.pythonhosted.org/packages/5a/6f/3af2ffedd5cfa08c631f89452c6648c4d779e7772dfc388c77c920ca6bbf/pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:763d1d74f56f031788e5d307029caef067febf890cd1f8bf61183ae142f1a77b", size = 2343898, upload-time = "2025-05-17T17:20:36.086Z" }, + { url = "https://files.pythonhosted.org/packages/9a/dc/9060d807039ee5de6e2f260f72f3d70ac213993a804f5e67e0a73a56dd2f/pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:954af0e2bd7cea83ce72243b14e4fb518b18f0c1649b576d114973e2073b273d", size = 2269197, upload-time = "2025-05-17T17:20:38.414Z" }, + { url = "https://files.pythonhosted.org/packages/f9/34/e6c8ca177cb29dcc4967fef73f5de445912f93bd0343c9c33c8e5bf8cde8/pycryptodome-3.23.0-cp313-cp313t-win32.whl", hash = "sha256:257bb3572c63ad8ba40b89f6fc9d63a2a628e9f9708d31ee26560925ebe0210a", size = 1768600, upload-time = "2025-05-17T17:20:40.688Z" }, + { url = "https://files.pythonhosted.org/packages/e4/1d/89756b8d7ff623ad0160f4539da571d1f594d21ee6d68be130a6eccb39a4/pycryptodome-3.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6501790c5b62a29fcb227bd6b62012181d886a767ce9ed03b303d1f22eb5c625", size = 1799740, upload-time = "2025-05-17T17:20:42.413Z" }, + { url = "https://files.pythonhosted.org/packages/5d/61/35a64f0feaea9fd07f0d91209e7be91726eb48c0f1bfc6720647194071e4/pycryptodome-3.23.0-cp313-cp313t-win_arm64.whl", hash = "sha256:9a77627a330ab23ca43b48b130e202582e91cc69619947840ea4d2d1be21eb39", size = 1703685, upload-time = "2025-05-17T17:20:44.388Z" }, + { url = "https://files.pythonhosted.org/packages/db/6c/a1f71542c969912bb0e106f64f60a56cc1f0fabecf9396f45accbe63fa68/pycryptodome-3.23.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:187058ab80b3281b1de11c2e6842a357a1f71b42cb1e15bce373f3d238135c27", size = 2495627, upload-time = "2025-05-17T17:20:47.139Z" }, + { url = "https://files.pythonhosted.org/packages/6e/4e/a066527e079fc5002390c8acdd3aca431e6ea0a50ffd7201551175b47323/pycryptodome-3.23.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cfb5cd445280c5b0a4e6187a7ce8de5a07b5f3f897f235caa11f1f435f182843", size = 1640362, upload-time = "2025-05-17T17:20:50.392Z" }, + { url = "https://files.pythonhosted.org/packages/50/52/adaf4c8c100a8c49d2bd058e5b551f73dfd8cb89eb4911e25a0c469b6b4e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67bd81fcbe34f43ad9422ee8fd4843c8e7198dd88dd3d40e6de42ee65fbe1490", size = 2182625, upload-time = "2025-05-17T17:20:52.866Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e9/a09476d436d0ff1402ac3867d933c61805ec2326c6ea557aeeac3825604e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8987bd3307a39bc03df5c8e0e3d8be0c4c3518b7f044b0f4c15d1aa78f52575", size = 2268954, upload-time = "2025-05-17T17:20:55.027Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c5/ffe6474e0c551d54cab931918127c46d70cab8f114e0c2b5a3c071c2f484/pycryptodome-3.23.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa0698f65e5b570426fc31b8162ed4603b0c2841cbb9088e2b01641e3065915b", size = 2308534, upload-time = "2025-05-17T17:20:57.279Z" }, + { url = "https://files.pythonhosted.org/packages/18/28/e199677fc15ecf43010f2463fde4c1a53015d1fe95fb03bca2890836603a/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:53ecbafc2b55353edcebd64bf5da94a2a2cdf5090a6915bcca6eca6cc452585a", size = 2181853, upload-time = "2025-05-17T17:20:59.322Z" }, + 
{ url = "https://files.pythonhosted.org/packages/ce/ea/4fdb09f2165ce1365c9eaefef36625583371ee514db58dc9b65d3a255c4c/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:156df9667ad9f2ad26255926524e1c136d6664b741547deb0a86a9acf5ea631f", size = 2342465, upload-time = "2025-05-17T17:21:03.83Z" }, + { url = "https://files.pythonhosted.org/packages/22/82/6edc3fc42fe9284aead511394bac167693fb2b0e0395b28b8bedaa07ef04/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:dea827b4d55ee390dc89b2afe5927d4308a8b538ae91d9c6f7a5090f397af1aa", size = 2267414, upload-time = "2025-05-17T17:21:06.72Z" }, + { url = "https://files.pythonhosted.org/packages/59/fe/aae679b64363eb78326c7fdc9d06ec3de18bac68be4b612fc1fe8902693c/pycryptodome-3.23.0-cp37-abi3-win32.whl", hash = "sha256:507dbead45474b62b2bbe318eb1c4c8ee641077532067fec9c1aa82c31f84886", size = 1768484, upload-time = "2025-05-17T17:21:08.535Z" }, + { url = "https://files.pythonhosted.org/packages/54/2f/e97a1b8294db0daaa87012c24a7bb714147c7ade7656973fd6c736b484ff/pycryptodome-3.23.0-cp37-abi3-win_amd64.whl", hash = "sha256:c75b52aacc6c0c260f204cbdd834f76edc9fb0d8e0da9fbf8352ef58202564e2", size = 1799636, upload-time = "2025-05-17T17:21:10.393Z" }, + { url = "https://files.pythonhosted.org/packages/18/3d/f9441a0d798bf2b1e645adc3265e55706aead1255ccdad3856dbdcffec14/pycryptodome-3.23.0-cp37-abi3-win_arm64.whl", hash = "sha256:11eeeb6917903876f134b56ba11abe95c0b0fd5e3330def218083c7d98bbcb3c", size = 1703675, upload-time = "2025-05-17T17:21:13.146Z" }, +] + [[package]] name = "pydantic" version = "2.11.5" @@ -1807,6 +2232,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/67/69/c0087d19c8d8e8530acee3ba485d54aedeebf2963784a16692ca4b439566/pyiceberg-0.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:124793c54a0c2fb5ac4ab19c38da116c068e277c85cbaa7e4064e635a70b595e", size = 595512, upload-time = "2025-04-30T14:59:14.464Z" }, ] +[[package]] +name = "pyjwt" +version = "2.10.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/46/bd74733ff231675599650d3e47f361794b22ef3e3770998dda30d3b63726/pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953", size = 87785, upload-time = "2024-11-28T03:43:29.933Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb", size = 22997, upload-time = "2024-11-28T03:43:27.893Z" }, +] + +[package.optional-dependencies] +crypto = [ + { name = "cryptography" }, +] + [[package]] name = "pyparsing" version = "3.2.3" @@ -2041,6 +2480,19 @@ socks = [ { name = "pysocks" }, ] +[[package]] +name = "requests-oauthlib" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "oauthlib" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload-time = "2024-03-22T20:32:29.939Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size 
= 24179, upload-time = "2024-03-22T20:32:28.055Z" }, +] + [[package]] name = "requests-toolbelt" version = "1.0.0" @@ -2179,6 +2631,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/44/42/d58086ec20f52d2b0140752ae54b355ea2be2ed46f914231136dd1effcc7/ruff-0.11.12-py3-none-win_arm64.whl", hash = "sha256:65194e37853158d368e333ba282217941029a28ea90913c67e558c611d04daa5", size = 10697770, upload-time = "2025-05-29T13:31:38.009Z" }, ] +[[package]] +name = "s3fs" +version = "2025.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiobotocore" }, + { name = "aiohttp" }, + { name = "fsspec" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bf/13/37438c4672ba1d23ec46df0e4b57e98469e5c5f4f98313cf6842b631652b/s3fs-2025.7.0.tar.gz", hash = "sha256:5e7f9ec0cad7745155e3eb86fae15b1481fa29946bf5b3a4ce3a60701ce6022d", size = 77795, upload-time = "2025-07-15T16:35:22.177Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/c7/30d13b7fd4f866ca3f30e9a6e7ae038f0c45226f6e26b3cc98d6d197f93b/s3fs-2025.7.0-py3-none-any.whl", hash = "sha256:b6b2d3f84b6aa1c2ba5e62e39dd9410cf54f10a2cce1ea6db1ba0d1a6bcce685", size = 30315, upload-time = "2025-07-15T16:35:20.734Z" }, +] + [[package]] name = "setuptools" version = "80.9.0" From 3c2db88bda79e3c8b7a52947c8facfc9159c097f Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 10 Aug 2025 23:28:49 +0000 Subject: [PATCH 181/224] refactor: use simpler names for classes --- .../01_introduction_to_orcapod.ipynb | 217 +++++++++++++++--- .../02_parallel_execution_on_ray.ipynb | 45 ++-- src/orcapod/data/operators/join.py | 4 +- src/orcapod/data/operators/mappers.py | 6 +- src/orcapod/data/operators/semijoin.py | 4 +- src/orcapod/data/sources.py | 10 +- src/orcapod/data/streams.py | 10 +- src/orcapod/execution_engines/__init__.py | 2 +- 8 files changed, 226 insertions(+), 72 deletions(-) diff --git a/notebooks/tutorials/01_introduction_to_orcapod.ipynb b/notebooks/tutorials/01_introduction_to_orcapod.ipynb index 434dd49..b0ffb65 100644 --- a/notebooks/tutorials/01_introduction_to_orcapod.ipynb +++ b/notebooks/tutorials/01_introduction_to_orcapod.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "27cdd37d", + "id": "b6454c12", "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ "id": "c2ac8f32", "metadata": {}, "source": [ - "Use `op.streams.ImmutableTableStream` to turn table into a stream. You will also have to specify which columns are the tags." + "Use `op.streams.TableStream` to turn table into a stream. You will also have to specify which columns are the tags." ] }, { @@ -103,7 +103,7 @@ "metadata": {}, "outputs": [], "source": [ - "stream = op.streams.ImmutableTableStream(table, tag_columns=[\"a\", \"b\"])" + "stream = op.streams.TableStream(table, tag_columns=[\"a\", \"b\"])" ] }, { @@ -302,7 +302,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 5)
" + "shape: (3, 5)
" ], "text/plain": [ "shape: (3, 5)\n", @@ -311,9 +311,9 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", "╞═════╪═════╪═══════╪═════╪═════════════════════════════════╡\n", - "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@3de5f8a7b9a2fe5e6cc… │\n", - "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@cc022b33fc80a6639d2… │\n", - "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@b0bb7434c813b4d5d7c… │\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@dbd5a1efe0a1a306cc2… │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@083f8c4d8a4c7608af3… │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@d4a11ad88c1d27eba1c… │\n", "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" ] }, @@ -350,7 +350,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 5)
" + "shape: (3, 5)
" ], "text/plain": [ "shape: (3, 5)\n", @@ -359,9 +359,9 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", "╞═════╪═════╪═══════╪═════╪═════════════════════════════════╡\n", - "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@3de5f8a7b9a2fe5e6cc… │\n", - "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@cc022b33fc80a6639d2… │\n", - "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@b0bb7434c813b4d5d7c… │\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@dbd5a1efe0a1a306cc2… │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@083f8c4d8a4c7608af3… │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@d4a11ad88c1d27eba1c… │\n", "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" ] }, @@ -989,8 +989,8 @@ " }\n", ")\n", "\n", - "stream1 = op.streams.ImmutableTableStream(table1, tag_columns=[\"id\"])\n", - "stream2 = op.streams.ImmutableTableStream(table2, tag_columns=[\"id\"])" + "stream1 = op.streams.TableStream(table1, tag_columns=[\"id\"])\n", + "stream2 = op.streams.TableStream(table2, tag_columns=[\"id\"])" ] }, { @@ -1390,7 +1390,7 @@ " }\n", ")\n", "\n", - "input_stream = op.streams.ImmutableTableStream(input_table, tag_columns=[\"id\"])" + "input_stream = op.streams.TableStream(input_table, tag_columns=[\"id\"])" ] }, { @@ -1463,7 +1463,7 @@ { "data": { "text/plain": [ - "KernelStream(kernel=FunctionPod:add_numbers(a: int, b: int)-> , upstreams=(ImmutableTableStream(table=['id', 'a', 'b'], tag_columns=('id',)),))" + "KernelStream(kernel=FunctionPod:add_numbers(a: int, b: int)-> , upstreams=(TableStream(table=['id', 'a', 'b'], tag_columns=('id',)),))" ] }, "execution_count": 46, @@ -1769,6 +1769,31 @@ "pipeline.add_numbers" ] }, + { + "cell_type": "code", + "execution_count": 57, + "id": "08add7d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'TableStream': KernelNode(kernel=),\n", + " 'add_numbers': PodNode(pod=FunctionPod:add_numbers),\n", + " 'multiply_numbers': PodNode(pod=FunctionPod:multiply_numbers),\n", + " 'Join': KernelNode(kernel=Join()),\n", + " 'combine_results': PodNode(pod=FunctionPod:combine_results)}" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.nodes" + ] + }, { "cell_type": "markdown", "id": "5f33f5a9", @@ -1787,12 +1812,12 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 58, "id": "21086f72", "metadata": {}, "outputs": [], "source": [ - "pipeline.add_numbers.df" + "pipeline.add_numbers.as_df()" ] }, { @@ -1805,7 +1830,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "id": "1e741659", "metadata": {}, "outputs": [], @@ -1831,12 +1856,44 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "id": "c77154ec", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "pipeline.add_numbers.df" + "pipeline.add_numbers.as_df()" ] }, { @@ -1867,7 +1924,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "id": "37e65e33", "metadata": {}, "outputs": [], @@ -1879,7 +1936,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "id": "3bad8332", "metadata": {}, "outputs": [], @@ -1895,32 +1952,128 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "id": "8f146ae7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "pipeline2.my_summation.df" + "pipeline2.my_summation.as_df()" ] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "id": "8fd7bf4e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────────┐\n", + "│ id ┆ product │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════════╡\n", + "│ 0 ┆ 10 │\n", + "│ 1 ┆ 40 │\n", + "│ 2 ┆ 90 │\n", + "│ 3 ┆ 160 │\n", + "│ 4 ┆ 250 │\n", + "└─────┴─────────┘" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "pipeline2.my_product.df" + "pipeline2.my_product.as_df()" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "id": "2a918db1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬───────────────────────┐\n", + "│ id ┆ result │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═════╪═══════════════════════╡\n", + "│ 0 ┆ Sum: 11, Product: 10 │\n", + "│ 1 ┆ Sum: 22, Product: 40 │\n", + "│ 2 ┆ Sum: 33, Product: 90 │\n", + "│ 3 ┆ Sum: 44, Product: 160 │\n", + "│ 4 ┆ Sum: 55, Product: 250 │\n", + "└─────┴───────────────────────┘" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "pipeline2.my_final_result.df" + "pipeline2.my_final_result.as_df()" ] }, { diff --git a/notebooks/tutorials/02_parallel_execution_on_ray.ipynb b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb index 73e04cc..d6ff7d5 100644 --- a/notebooks/tutorials/02_parallel_execution_on_ray.ipynb +++ b/notebooks/tutorials/02_parallel_execution_on_ray.ipynb @@ -7,14 +7,14 @@ "metadata": {}, "outputs": [], "source": [ - "from orcapod.execution_engines import NativeRayAsyncEngine\n", + "from orcapod.execution_engines import RayEngine\n", "import orcapod as op\n", "import pyarrow as pa" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9e1f338b", "metadata": {}, "outputs": [ @@ -22,9 +22,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-08-05 19:55:02,021\tINFO client_builder.py:242 -- Passing the following kwargs to ray.init() on the server: log_to_driver\n", + "2025-08-10 23:27:14,560\tINFO client_builder.py:242 -- Passing the following kwargs to ray.init() on the server: log_to_driver\n", "SIGTERM handler is not set because current thread is not the main thread.\n", - "2025-08-05 19:55:04,766\tWARNING utils.py:1280 -- Python patch version mismatch: The cluster was started with:\n", + "2025-08-10 23:27:17,455\tWARNING utils.py:1280 -- Python patch version mismatch: The cluster was started with:\n", " Ray: 2.48.0\n", " Python: 3.13.5\n", "This process on Ray Client was started with:\n", @@ -37,19 +37,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[36m(autoscaler +15s)\u001b[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", - "\u001b[36m(autoscaler +15s)\u001b[0m Adding 5 node(s) of type workergroup.\n", - "\u001b[36m(autoscaler +15s)\u001b[0m Resized to 6 CPUs, 5 GPUs.\n", - "\u001b[36m(autoscaler +15s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*13. Add suitable node types to this cluster to resolve this issue.\n", - "\u001b[36m(autoscaler +20s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*24. Add suitable node types to this cluster to resolve this issue.\n", - "\u001b[36m(autoscaler +31s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*30. Add suitable node types to this cluster to resolve this issue.\n", - "\u001b[36m(autoscaler +36s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*3. Add suitable node types to this cluster to resolve this issue.\n", - "\u001b[36m(autoscaler +46s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*14. Add suitable node types to this cluster to resolve this issue.\n" + "\u001b[36m(autoscaler +28s)\u001b[0m Tip: use `ray status` to view detailed cluster status. 
To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", + "\u001b[36m(autoscaler +28s)\u001b[0m Adding 5 node(s) of type workergroup.\n", + "\u001b[36m(autoscaler +28s)\u001b[0m Resized to 6 CPUs, 5 GPUs.\n", + "\u001b[36m(autoscaler +28s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*44. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +34s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*22. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +49s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*11. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +55s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*22. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +1m0s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*27. Add suitable node types to this cluster to resolve this issue.\n", + "\u001b[36m(autoscaler +1m10s)\u001b[0m No available node types can fulfill resource requests {'CPU': 1.0}*6. Add suitable node types to this cluster to resolve this issue.\n" ] } ], "source": [ - "ray_engine = NativeRayAsyncEngine(\n", + "ray_engine = RayEngine(\n", " \"ray://raycluster-op-test-kuberay-head-svc.ray.svc.cluster.local:10001\"\n", ")" ] @@ -61,7 +62,7 @@ "metadata": {}, "outputs": [], "source": [ - "input_stream = op.streams.ImmutableTableStream(\n", + "input_stream = op.streams.TableStream(\n", " pa.Table.from_pylist([{\"id\": i, \"x\": i * 2, \"y\": i * 3} for i in range(50)]),\n", " tag_columns=[\"id\"],\n", ")" @@ -156,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "75ade620", "metadata": {}, "outputs": [ @@ -193,7 +194,7 @@ "└─────┴─────┘" ] }, - "execution_count": 6, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -222,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "f459da03", "metadata": {}, "outputs": [], @@ -235,7 +236,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "2befd400", "metadata": {}, "outputs": [], @@ -246,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "id": "e21ecaf2", "metadata": {}, "outputs": [], @@ -257,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "id": "8449cb5d", "metadata": {}, "outputs": [], @@ -267,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "id": "40743bb7", "metadata": {}, "outputs": [ @@ -304,7 +305,7 @@ "└─────┴─────┘" ] }, - "execution_count": 17, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py index d6d92cc..eee955e 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/data/operators/join.py @@ -1,5 +1,5 @@ from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import ImmutableTableStream +from orcapod.data.streams import TableStream from orcapod.types import TypeSpec from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs from typing import Any, TYPE_CHECKING @@ -94,7 +94,7 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream: reordered_columns = [col for col in table.column_names if col in tag_keys] 
reordered_columns += [col for col in table.column_names if col not in tag_keys] - return ImmutableTableStream( + return TableStream( table.select(reordered_columns), tag_columns=tuple(tag_keys), source=self, diff --git a/src/orcapod/data/operators/mappers.py b/src/orcapod/data/operators/mappers.py index f27cca8..c048c5a 100644 --- a/src/orcapod/data/operators/mappers.py +++ b/src/orcapod/data/operators/mappers.py @@ -1,5 +1,5 @@ from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import ImmutableTableStream +from orcapod.data.streams import TableStream from orcapod.types import TypeSpec from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule @@ -48,7 +48,7 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: name_map[c] = c renamed_table = table.rename_columns(name_map) - return ImmutableTableStream( + return TableStream( renamed_table, tag_columns=tag_columns, source=self, upstreams=(stream,) ) @@ -128,7 +128,7 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: name_map[c] = c # no renaming on packet columns renamed_table = table.rename_columns(name_map) - return ImmutableTableStream( + return TableStream( renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) ) diff --git a/src/orcapod/data/operators/semijoin.py b/src/orcapod/data/operators/semijoin.py index eedfff0..9a10eec 100644 --- a/src/orcapod/data/operators/semijoin.py +++ b/src/orcapod/data/operators/semijoin.py @@ -1,5 +1,5 @@ from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import ImmutableTableStream +from orcapod.data.streams import TableStream from orcapod.types import TypeSpec from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs from typing import Any, TYPE_CHECKING @@ -83,7 +83,7 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre join_type="left semi", ) - return ImmutableTableStream( + return TableStream( semi_joined_table, tag_columns=tuple(left_tag_typespec.keys()), source=self, diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index bcf2596..424af46 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -9,7 +9,7 @@ from orcapod.data.kernels import TrackedKernelBase from orcapod.data.streams import ( - ImmutableTableStream, + TableStream, KernelStream, OperatorStreamBaseMixin, ) @@ -284,12 +284,12 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: """ import pyarrow.csv as csv - from orcapod.data.streams import ImmutableTableStream + from orcapod.data.streams import TableStream # Load current state of the file table = csv.read_csv(self.file_path) - return ImmutableTableStream( + return TableStream( table=table, tag_columns=self.tag_columns, source=self, @@ -399,7 +399,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: as_large_types=True ).to_table() - return ImmutableTableStream( + return TableStream( arrow_data, tag_columns=self.tag_columns, source=self, upstreams=() ) @@ -708,7 +708,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: ), ) - return ImmutableTableStream( + return TableStream( table=table, tag_columns=self.tag_keys, source=self, diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index bf88836..94a21ac 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -358,7 +358,7 @@ def identity_structure(self) -> Any: return self._data_content_identity -class ImmutableTableStream(ImmutableStream): +class TableStream(ImmutableStream): 
""" An immutable stream based on a PyArrow Table. This stream is designed to be used with data that is already in a tabular format, @@ -1027,13 +1027,13 @@ async def run_async( if existing is not None and existing.num_rows > 0: # If there are existing entries, we can cache them - existing_stream = ImmutableTableStream(existing, tag_columns=tag_keys) + existing_stream = TableStream(existing, tag_columns=tag_keys) for tag, packet in existing_stream.iter_packets(): cached_results.append((tag, packet)) pending_calls = [] if missing is not None and missing.num_rows > 0: - for tag, packet in ImmutableTableStream(missing, tag_columns=tag_keys): + for tag, packet in TableStream(missing, tag_columns=tag_keys): # Since these packets are known to be missing, skip the cache lookup pending = self.pod.async_call( tag, @@ -1130,13 +1130,13 @@ def iter_packets( if existing is not None and existing.num_rows > 0: # If there are existing entries, we can cache them - existing_stream = ImmutableTableStream(existing, tag_columns=tag_keys) + existing_stream = TableStream(existing, tag_columns=tag_keys) for tag, packet in existing_stream.iter_packets(): cached_results.append((tag, packet)) yield tag, packet if missing is not None and missing.num_rows > 0: - for tag, packet in ImmutableTableStream(missing, tag_columns=tag_keys): + for tag, packet in TableStream(missing, tag_columns=tag_keys): # Since these packets are known to be missing, skip the cache lookup tag, packet = self.pod.call( tag, diff --git a/src/orcapod/execution_engines/__init__.py b/src/orcapod/execution_engines/__init__.py index 9a76995..24050d5 100644 --- a/src/orcapod/execution_engines/__init__.py +++ b/src/orcapod/execution_engines/__init__.py @@ -1 +1 @@ -from .ray_execution_engine import NativeRayAsyncEngine +from .ray_execution_engine import RayEngine From e32488e413bfcb6dec4c5002a658731130016852 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 10 Aug 2025 23:31:54 +0000 Subject: [PATCH 182/224] fix: ignore common data files --- .gitignore | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.gitignore b/.gitignore index a5279a0..81e31ee 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,18 @@ notebooks/**/*.parquet notebooks/**/*.pkl notebooks/**/*.db +# Ignore npy and npz data files by default +*.np[yz] + +# Ignore common data types by default +*.csv +*.parquet +*.xls +*.xlsx +*.txt + +# Ignore profiler output +*.prof # Ignore any notebook that starts with an underscore notebooks/**/_*.ipynb From f2b70fd74d6dd77a8fba5e17b2ffd25267e0b858 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Mon, 11 Aug 2025 03:31:11 +0000 Subject: [PATCH 183/224] fix: include system tags in propagation and hash computation --- .../data/datagrams/arrow_tag_packet.py | 4 +- src/orcapod/data/kernels.py | 5 -- src/orcapod/data/pods.py | 81 ++++++++++++++----- src/orcapod/data/streams.py | 4 +- src/orcapod/data/system_constants.py | 10 +++ src/orcapod/data/trackers.py | 12 ++- src/orcapod/pipeline/nodes.py | 18 +++-- src/orcapod/protocols/data_protocols.py | 8 +- 8 files changed, 101 insertions(+), 41 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index 4ae06a1..c6ee52e 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -169,7 +169,9 @@ def as_table( include_meta_columns=include_meta_columns, include_context=include_context, ) - if include_all_info or include_system_tags: + if ( + include_all_info or include_system_tags + ) and self._system_tags_table.num_rows > 0: # add system_tags only for existing data columns table = arrow_utils.hstack_tables(table, self._system_tags_table) return table diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index a70c91c..b628919 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -119,11 +119,6 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An # equivalence of the two by returning the same identity structure for both invocations. # This can be achieved, for example, by returning a set over the streams instead of a tuple. if streams is not None: - if len(streams) == 0: - # If no streams are provided, then this is a source kernel - # and we simply return None as the identity structure. - logger.debug(f"Kernel {self} is acting as a source!") - return None streams = self.pre_kernel_processing(*streams) self.validate_inputs(*streams) return self.kernel_identity_structure(streams) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 3827998..bc7bbd4 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -73,8 +73,12 @@ def output_packet_types(self) -> TypeSpec: """ ... + @property + def version(self) -> str: + return self._version + @abstractmethod - def get_record_id(self, packet: dp.Packet) -> str: + def get_record_id(self, packet: dp.Packet, execution_engine_hash: str) -> str: """ Return the record ID for the input packet. This is used to identify the pod in the system. 
""" @@ -92,11 +96,29 @@ def __init__( self, error_handling: error_handling_options = "raise", label: str | None = None, + version: str = "v0.0", **kwargs, ) -> None: super().__init__(label=label, **kwargs) self._active = True self.error_handling = error_handling + self._version = version + import re + + match = re.match(r"\D.*(\d+)", version) + major_version = 0 + if match: + major_version = int(match.group(1)) + else: + raise ValueError( + f"Version string {version} does not contain a valid version number" + ) + + self._major_version = major_version + + @property + def major_version(self) -> str: + return self._major_version def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: """ @@ -274,21 +296,8 @@ def __init__( # extract the first full index (potentially with leading 0) in the version string if not isinstance(version, str): raise TypeError(f"Version must be a string, got {type(version)}") - import re - match = re.match(r"\D.*(\d+)", version) - major_version = 0 - if match: - major_version = int(match.group(1)) - else: - raise ValueError( - f"Version string {version} does not contain a valid version number" - ) - - self.version = version - self.major_version = major_version - - super().__init__(label=label or self.function_name, **kwargs) + super().__init__(label=label or self.function_name, version=version, **kwargs) # extract input and output types from the function signature input_packet_types, output_packet_types = tsutils.extract_function_typespecs( @@ -334,10 +343,15 @@ def kernel_id(self) -> tuple[str, ...]: "v" + str(self.major_version), ) - def get_record_id(self, packet: dp.Packet) -> str: + def get_record_id( + self, + packet: dp.Packet, + execution_engine_hash: str, + ) -> str: return combine_hashes( packet.content_hash(), self._total_pod_id_hash, + execution_engine_hash, prefix_hasher_id=True, ) @@ -380,6 +394,8 @@ def call( ) return tag, None + execution_engine_hash = execution_engine.name if execution_engine else "default" + # any kernel/pod invocation happening inside the function will NOT be tracked if not isinstance(packet, dict): input_dict = packet.as_dict(include_source=False) @@ -397,7 +413,7 @@ def call( if record_id is None: # if record_id is not provided, generate it from the packet - record_id = self.get_record_id(packet) + record_id = self.get_record_id(packet, execution_engine_hash) source_info = { k: ":".join(self.kernel_id + (record_id, k)) for k in output_data } @@ -427,6 +443,8 @@ async def async_call( ) return tag, None + execution_engine_hash = execution_engine.name if execution_engine else "default" + # any kernel/pod invocation happening inside the function will NOT be tracked # with self._tracker_manager.no_tracking(): # FIXME: figure out how to properly make context manager work with async/await @@ -445,7 +463,7 @@ async def async_call( if record_id is None: # if record_id is not provided, generate it from the packet - record_id = self.get_record_id(packet) + record_id = self.get_record_id(packet, execution_engine_hash) source_info = { k: ":".join(self.kernel_id + (record_id, k)) for k in output_data } @@ -521,8 +539,8 @@ def kernel_id(self) -> tuple[str, ...]: """ return self.pod.kernel_id - def get_record_id(self, packet: dp.Packet) -> str: - return self.pod.get_record_id(packet) + def get_record_id(self, packet: dp.Packet, execution_engine_hash: str) -> str: + return self.pod.get_record_id(packet, execution_engine_hash) @property def tiered_pod_id(self) -> dict[str, str]: @@ -610,6 +628,10 @@ def __init__( 
self.match_tier = match_tier self.retrieval_mode = retrieval_mode + @property + def version(self) -> str: + return self.pod.version + @property def record_path(self) -> tuple[str, ...]: """ @@ -662,7 +684,12 @@ async def async_call( tag, packet, record_id=record_id, execution_engine=execution_engine ) if output_packet is not None and not skip_cache_insert: - self.record_packet(packet, output_packet, record_id=record_id) + self.record_packet( + packet, + output_packet, + record_id=record_id, + execution_engine=execution_engine, + ) return tag, output_packet @@ -675,6 +702,7 @@ def record_packet( input_packet: dp.Packet, output_packet: dp.Packet, record_id: str | None = None, + execution_engine: dp.ExecutionEngine | None = None, skip_duplicates: bool = False, ) -> dp.Packet: """ @@ -696,8 +724,17 @@ def record_packet( constants.INPUT_PACKET_HASH, pa.array([input_packet.content_hash()], type=pa.large_string()), ) + # add execution engine information + execution_engine_hash = execution_engine.name if execution_engine else "default" + data_table = data_table.append_column( + constants.EXECUTION_ENGINE, + pa.array([execution_engine_hash], type=pa.large_string()), + ) + if record_id is None: - record_id = self.get_record_id(input_packet) + record_id = self.get_record_id( + input_packet, execution_engine_hash=execution_engine_hash + ) self.result_store.add_record( self.record_path, diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 94a21ac..ac10d6a 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -980,7 +980,9 @@ async def run_async( # identify all entries in the input stream for which we still have not computed packets target_entries = self.input_stream.as_table( - include_content_hash=constants.INPUT_PACKET_HASH + include_content_hash=constants.INPUT_PACKET_HASH, + include_source=True, + include_system_tags=True, ) existing_entries = self.pod.get_all_records(include_system_columns=True) if existing_entries is None or existing_entries.num_rows == 0: diff --git a/src/orcapod/data/system_constants.py b/src/orcapod/data/system_constants.py index 86a9461..527022b 100644 --- a/src/orcapod/data/system_constants.py +++ b/src/orcapod/data/system_constants.py @@ -7,6 +7,8 @@ INPUT_PACKET_HASH = "input_packet_hash" PACKET_RECORD_ID = "packet_id" SYSTEM_TAG_PREFIX = "system_tag_" +POD_VERSION = "pod_version" +EXECUTION_ENGINE = "execution_engine" class SystemConstant: @@ -45,5 +47,13 @@ def PACKET_RECORD_ID(self) -> str: def SYSTEM_TAG_PREFIX(self) -> str: return f"{self._global_prefix}{DATAGRAM_PREFIX}{SYSTEM_TAG_PREFIX}" + @property + def POD_VERSION(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{POD_VERSION}" + + @property + def EXECUTION_ENGINE(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{EXECUTION_ENGINE}" + constants = SystemConstant() diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 5c8d64c..d15ccb9 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -137,7 +137,7 @@ def __exit__(self, exc_type, exc_val, ext_tb): # TODO: rename this to stub source or simply use StreamSource -class StubKernel: +class StubSource: def __init__(self, stream: dp.Stream, label: str | None = None) -> None: """ A placeholder kernel that does nothing. 
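For context, the metadata-column pattern used by record_packet above can be sketched with plain pyarrow as follows; the column names and literal values are placeholders for illustration, not orcapod's actual constants or hashes.

import pyarrow as pa

# Stand-in for the single-row output packet table produced by the pod.
data_table = pa.table({"output_path": ["results/run_001.parquet"]})

input_packet_hash = "sha256:abc123"    # placeholder for input_packet.content_hash()
execution_engine_hash = "ray"          # placeholder for execution_engine.name, else "default"

# Append one metadata column per record; large_string matches the type used in the patch.
data_table = data_table.append_column(
    "input_packet_hash",               # placeholder for constants.INPUT_PACKET_HASH
    pa.array([input_packet_hash] * len(data_table), type=pa.large_string()),
)
data_table = data_table.append_column(
    "execution_engine",                # placeholder for constants.EXECUTION_ENGINE
    pa.array([execution_engine_hash] * len(data_table), type=pa.large_string()),
)

print(data_table.column_names)  # ['output_path', 'input_packet_hash', 'execution_engine']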
@@ -172,8 +172,9 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An if streams is not None: # when checked for invocation id, act as a source # and just return the output packet types - _, packet_types = self.stream.types() - return packet_types + # _, packet_types = self.stream.types() + # return packet_types + return None # otherwise, return the identity structure of the stream return self.stream.identity_structure() @@ -210,7 +211,7 @@ def parents(self) -> tuple["Invocation", ...]: if stream.source is not None: parent_invoctions.append(Invocation(stream.source, stream.upstreams)) else: - source = StubKernel(stream) + source = StubSource(stream) parent_invoctions.append(Invocation(source)) return tuple(parent_invoctions) @@ -228,6 +229,9 @@ def identity_structure(self) -> Any: Return a structure that represents the identity of this invocation. This is used to uniquely identify the invocation in the tracker. """ + # if no upstreams, then we want to identify the source directly + if not self.upstreams: + return self.kernel.identity_structure() return self.kernel.identity_structure(self.upstreams) def __repr__(self) -> str: diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 83c946b..5a2779e 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -156,6 +156,7 @@ def record_pipeline_output(self, output_stream: dp.Stream) -> None: key_column_name = self.HASH_COLUMN_NAME output_table = output_stream.as_table( include_data_context=True, + include_system_tags=True, include_source=True, include_content_hash=key_column_name, ) @@ -179,8 +180,7 @@ def get_all_records( c for c in results.column_names if c.startswith(constants.META_PREFIX) - or c.startswith(constants.CONTEXT_KEY) - or c.startswith(constants.SOURCE_PREFIX) + or c.startswith(constants.DATAGRAM_PREFIX) ] results = results.drop(system_columns) @@ -228,8 +228,9 @@ def call( skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: + execution_engine_hash = execution_engine.name if execution_engine else "default" if record_id is None: - record_id = self.get_record_id(packet) + record_id = self.get_record_id(packet, execution_engine_hash) tag, output_packet = super().call( tag, @@ -264,8 +265,9 @@ async def async_call( skip_cache_lookup: bool = False, skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: + execution_engine_hash = execution_engine.name if execution_engine else "default" if record_id is None: - record_id = self.get_record_id(packet) + record_id = self.get_record_id(packet, execution_engine_hash) tag, output_packet = await super().async_call( tag, @@ -300,7 +302,8 @@ def add_pipeline_record( skip_cache_lookup: bool = False, ) -> None: # combine dp.Tag with packet content hash to compute entry hash - tag_with_hash = tag.as_table().append_column( + # TODO: add system tag columns + tag_with_hash = tag.as_table(include_system_tags=True).append_column( constants.INPUT_PACKET_HASH, pa.array([input_packet.content_hash()], type=pa.large_string()), ) @@ -308,7 +311,6 @@ def add_pipeline_record( entry_id = self.data_context.arrow_hasher.hash_table( tag_with_hash, prefix_hasher_id=True ) - # FIXME: consider and implement more robust cache lookup logic existing_record = None if not skip_cache_lookup: @@ -342,7 +344,9 @@ def add_pipeline_record( .drop_columns(list(renamed_input_packet.keys())) ) - combined_record = arrow_utils.hstack_tables(tag.as_table(), input_packet_info) + 
combined_record = arrow_utils.hstack_tables( + tag.as_table(include_system_tags=True), input_packet_info + ) self.pipeline_store.add_record( self.pipeline_path, diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 8026765..c70fec4 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -12,6 +12,9 @@ class ExecutionEngine(Protocol): + @property + def name(self) -> str: ... + def submit_sync(self, function: Callable, *args, **kwargs) -> Any: """ Run the given function with the provided arguments. @@ -1832,7 +1835,10 @@ class Pod(Kernel, Protocol): and fine-grained caching. """ - def get_record_id(self, packet: Packet) -> str: ... + @property + def version(self) -> str: ... + + def get_record_id(self, packet: Packet, execution_engine_hash: str) -> str: ... @property def tiered_pod_id(self) -> dict[str, str]: From 351aa294d218412ddff8f473e6ecbb5c050bd022 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Mon, 11 Aug 2025 06:17:29 +0000 Subject: [PATCH 184/224] feat: implement tag schema-based pipeline record separation --- src/orcapod/contexts/__init__.py | 4 ++ src/orcapod/data/kernels.py | 20 ++++++--- src/orcapod/data/operators/base.py | 33 +++++++++++---- src/orcapod/data/operators/join.py | 15 +++++-- src/orcapod/data/pods.py | 19 ++++++--- src/orcapod/data/sources.py | 33 +++++++++------ src/orcapod/data/streams.py | 41 +++++++++++++------ src/orcapod/data/trackers.py | 10 +++-- .../execution_engines/ray_execution_engine.py | 4 ++ src/orcapod/pipeline/nodes.py | 29 +++++++++---- src/orcapod/protocols/data_protocols.py | 6 ++- 11 files changed, 153 insertions(+), 61 deletions(-) diff --git a/src/orcapod/contexts/__init__.py b/src/orcapod/contexts/__init__.py index 649c480..9adda27 100644 --- a/src/orcapod/contexts/__init__.py +++ b/src/orcapod/contexts/__init__.py @@ -41,6 +41,10 @@ def _get_registry() -> JSONDataContextRegistry: return _registry +def get_default_context_key() -> str: + return get_default_context().context_key + + def resolve_context(context_info: str | DataContext | None = None) -> DataContext: """ Resolve context information to a DataContext instance. diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index b628919..d267c13 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -87,16 +87,22 @@ def _set_modified_time( self._last_modified = datetime.now(timezone.utc) @abstractmethod - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """ Return the output types of the kernel given the input streams. """ ... 
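The ExecutionEngine protocol above now exposes a name property, and pods fold that name (or "default" when no engine is supplied) into the record id. A minimal stand-in that mirrors only the members visible in this patch might look like this; it is a hedged sketch, not orcapod's implementation.

from collections.abc import Callable
from typing import Any


class LocalEngine:
    """Runs submitted functions in-process; name feeds the execution-engine hash."""

    @property
    def name(self) -> str:
        return "local"

    def submit_sync(self, function: Callable, *args, **kwargs) -> Any:
        return function(*args, **kwargs)


def engine_label(engine: LocalEngine | None) -> str:
    # Same fallback used throughout the patch: a missing engine hashes as "default".
    return engine.name if engine is not None else "default"


assert engine_label(None) == "default"
assert engine_label(LocalEngine()) == "local"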
- def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) - return self.kernel_output_types(*processed_streams) + return self.kernel_output_types( + *processed_streams, include_system_tags=include_system_tags + ) @abstractmethod def kernel_identity_structure( @@ -212,8 +218,12 @@ def computed_label(self) -> str | None: def kernel_id(self) -> tuple[str, ...]: return self.kernel.kernel_id - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - return self.kernel.output_types(*streams) + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: + return self.kernel.output_types( + *streams, include_system_tags=include_system_tags + ) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None diff --git a/src/orcapod/data/operators/base.py b/src/orcapod/data/operators/base.py index a7fac6f..4f4ae60 100644 --- a/src/orcapod/data/operators/base.py +++ b/src/orcapod/data/operators/base.py @@ -56,9 +56,11 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: ) return output_substreams[0] - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: stream = streams[0] - return self.op_output_types(stream) + return self.op_output_types(stream, include_system_tags=include_system_tags) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -89,7 +91,9 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: ... @abstractmethod - def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def op_output_types( + self, stream: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes two streams as input and returns a tuple of typespecs. @@ -134,9 +138,13 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: left_stream, right_stream = streams return self.op_forward(left_stream, right_stream) - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: left_stream, right_stream = streams - return self.op_output_types(left_stream, right_stream) + return self.op_output_types( + left_stream, right_stream, include_system_tags=include_system_tags + ) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -170,7 +178,10 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre @abstractmethod def op_output_types( - self, left_stream: dp.Stream, right_stream: dp.Stream + self, + left_stream: dp.Stream, + right_stream: dp.Stream, + include_system_tags: bool = False, ) -> tuple[TypeSpec, TypeSpec]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. 
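The operator base classes above all thread the new include_system_tags flag straight through to the underlying stream. A reduced sketch of that pattern for the unary case, using stand-in classes rather than the real orcapod types:

TypeSpec = dict[str, type]


class FakeStream:
    """Stand-in stream whose types() honours include_system_tags."""

    def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]:
        tag_types: TypeSpec = {"subject_id": str}
        if include_system_tags:
            # Hypothetical system tag column; real names come from orcapod's constants.
            tag_types["system_tag_entry_id"] = str
        return tag_types, {"recording_path": str}


class PassThroughOp:
    """Stand-in unary operator that forwards the flag, as the base class above does."""

    def op_output_types(
        self, stream: FakeStream, include_system_tags: bool = False
    ) -> tuple[TypeSpec, TypeSpec]:
        return stream.types(include_system_tags=include_system_tags)

    def kernel_output_types(
        self, *streams: FakeStream, include_system_tags: bool = False
    ) -> tuple[TypeSpec, TypeSpec]:
        (stream,) = streams
        return self.op_output_types(stream, include_system_tags=include_system_tags)


tags, packets = PassThroughOp().kernel_output_types(FakeStream(), include_system_tags=True)
assert "system_tag_entry_id" in tags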
@@ -222,8 +233,10 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: """ return self.op_forward(*streams) - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - return self.op_output_types(*streams) + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: + return self.op_output_types(*streams, include_system_tags=include_system_tags) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -251,7 +264,9 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream: ... @abstractmethod - def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def op_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes at least one stream as input and returns a tuple of typespecs. diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py index eee955e..1ff9a36 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/data/operators/join.py @@ -32,15 +32,22 @@ def op_validate_inputs(self, *streams: dp.Stream) -> None: # raise InputValidationError(f"Input streams are not compatible: {e}") from e raise e - def op_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def op_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: if len(streams) == 1: # If only one stream is provided, return its typespecs - return streams[0].types() + return streams[0].types(include_system_tags=include_system_tags) + # TODO: consider performing the check always with system tags on stream = streams[0] - tag_typespec, packet_typespec = stream.types() + tag_typespec, packet_typespec = stream.types( + include_system_tags=include_system_tags + ) for other_stream in streams[1:]: - other_tag_typespec, other_packet_typespec = other_stream.types() + other_tag_typespec, other_packet_typespec = other_stream.types( + include_system_tags=include_system_tags + ) tag_typespec = union_typespecs(tag_typespec, other_tag_typespec) packet_typespec = intersection_typespecs( packet_typespec, other_packet_typespec diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index bc7bbd4..8b4dbea 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -117,15 +117,17 @@ def __init__( self._major_version = major_version @property - def major_version(self) -> str: + def major_version(self) -> int: return self._major_version - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """ Return the input and output typespecs for the pod. This is used to validate the input and output streams. 
""" - tag_typespec, _ = streams[0].types() + tag_typespec, _ = streams[0].types(include_system_tags=include_system_tags) return tag_typespec, self.output_packet_types() def is_active(self) -> bool: @@ -650,8 +652,11 @@ def call( skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: # TODO: consider logic for overwriting existing records + execution_engine_hash = execution_engine.name if execution_engine else "default" if record_id is None: - record_id = self.get_record_id(packet) + record_id = self.get_record_id( + packet, execution_engine_hash=execution_engine_hash + ) output_packet = None if not skip_cache_lookup: output_packet = self.get_recorded_output_packet(packet) @@ -674,8 +679,12 @@ async def async_call( skip_cache_insert: bool = False, ) -> tuple[dp.Tag, dp.Packet | None]: # TODO: consider logic for overwriting existing records + execution_engine_hash = execution_engine.name if execution_engine else "default" + if record_id is None: - record_id = self.get_record_id(packet) + record_id = self.get_record_id( + packet, execution_engine_hash=execution_engine_hash + ) output_packet = None if not skip_cache_lookup: output_packet = self.get_recorded_output_packet(packet) diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index 424af46..f41c941 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -19,6 +19,7 @@ from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants +from orcapod.semantic_types import infer_schema_from_pylist_data if TYPE_CHECKING: import pandas as pd @@ -109,9 +110,9 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """Delegate to the cached KernelStream.""" return self().keys() - def types(self) -> tuple[TypeSpec, TypeSpec]: + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """Delegate to the cached KernelStream.""" - return self().types() + return self().types(include_system_tags=include_system_tags) @property def last_modified(self): @@ -296,11 +297,13 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: upstreams=(), ) - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """Infer types from the file (could be cached).""" # For demonstration - in practice you might cache this sample_stream = self.forward() - return sample_stream.types() + return sample_stream.types(include_system_tags=include_system_tags) class ManualDeltaTableSource(SourceBase): @@ -412,8 +415,11 @@ def kernel_identity_structure( """ return (self.__class__.__name__, str(self.table_path)) - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """Return tag and packet types based on schema and tag columns.""" + # TODO: auto add system entry tag tag_types: TypeSpec = {} packet_types: TypeSpec = {} for field, field_type in self.python_schema.items(): @@ -645,8 +651,8 @@ def __init__( self, tags: Collection[dict[str, DataValue]], packets: Collection[dict[str, DataValue]], - tag_typespec: TypeSpec | None = None, - packet_typespec: TypeSpec | None = None, + tag_typespec: dict[str, type] | None = None, + packet_typespec: dict[str, type] | None = None, **kwargs, ): super().__init__(**kwargs) @@ -656,11 +662,9 @@ def 
__init__( raise ValueError( "Tags and packets must be non-empty collections of equal length" ) - self.tag_typespec = tag_typespec or typespec_utils.get_typespec_from_dict( - self.tags[0] - ) - self.packet_typespec = packet_typespec or typespec_utils.get_typespec_from_dict( - self.packets[0] + self.tag_typespec = tag_typespec or infer_schema_from_pylist_data(self.tags) + self.packet_typespec = packet_typespec or infer_schema_from_pylist_data( + self.packets ) source_info = ":".join(self.kernel_id) self.source_info = { @@ -715,6 +719,9 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: upstreams=(), ) - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """Return tag and packet types based on provided typespecs.""" + # TODO: add system tag return self.tag_typespec, self.packet_typespec diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index ac10d6a..654c763 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -202,7 +202,7 @@ def computed_label(self) -> str | None: def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... @abstractmethod - def types(self) -> tuple[TypeSpec, TypeSpec]: ... + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: ... @property def last_modified(self) -> datetime | None: @@ -392,7 +392,12 @@ def __init__( if data_context_table is None: data_context_table = pa.table( - {constants.CONTEXT_KEY: pa.nulls(len(data_table), pa.large_string())} + { + constants.CONTEXT_KEY: pa.array( + [contexts.get_default_context_key()] * len(data_table), + pa.large_string(), + ) + } ) prefix_info = {constants.SOURCE_PREFIX: source_info} @@ -477,15 +482,21 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ return self._tag_columns, self._packet_columns - def types(self) -> tuple[dict[str, type], dict[str, type]]: + def types( + self, include_system_tags: bool = False + ) -> tuple[dict[str, type], dict[str, type]]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. """ # TODO: consider using MappingProxyType to avoid copying the dicts converter = self._data_context.type_converter + if include_system_tags: + tag_schema = self._all_tag_schema + else: + tag_schema = self._tag_schema return ( - converter.arrow_schema_to_python_schema(self._tag_schema), + converter.arrow_schema_to_python_schema(tag_schema), converter.arrow_schema_to_python_schema(self._packet_schema), ) @@ -652,12 +663,14 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: tag_types, packet_types = self.kernel.output_types(*self.upstreams) return tuple(tag_types.keys()), tuple(packet_types.keys()) - def types(self) -> tuple[TypeSpec, TypeSpec]: + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. 
""" - return self.kernel.output_types(*self.upstreams) + return self.kernel.output_types( + *self.upstreams, include_system_tags=include_system_tags + ) @property def is_current(self) -> bool: @@ -853,8 +866,10 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys - def types(self) -> tuple[TypeSpec, TypeSpec]: - tag_typespec, _ = self.prepared_stream.types() + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, _ = self.prepared_stream.types( + include_system_tags=include_system_tags + ) # TODO: check if copying can be avoided packet_typespec = dict(self.pod.output_packet_types()) return tag_typespec, packet_typespec @@ -1194,8 +1209,10 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys - def types(self) -> tuple[TypeSpec, TypeSpec]: - tag_typespec, _ = self.input_stream.types() + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + tag_typespec, _ = self.input_stream.types( + include_system_tags=include_system_tags + ) # TODO: check if copying can be avoided packet_typespec = dict(self.pod.output_packet_types()) return tag_typespec, packet_typespec @@ -1305,12 +1322,12 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ return self._stream.keys() - def types(self) -> tuple[TypeSpec, TypeSpec]: + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. """ - return self._stream.types() + return self._stream.types(include_system_tags=include_system_tags) def as_table( self, diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index d15ccb9..6e14589 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,6 +1,6 @@ from orcapod import contexts from orcapod.data.base import LabeledContentIdentifiableBase -from orcapod.protocols import data_protocols as dp, hashing_protocols as hp +from orcapod.protocols import data_protocols as dp from collections import defaultdict from collections.abc import Generator, Collection from abc import ABC, abstractmethod @@ -136,7 +136,7 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) -# TODO: rename this to stub source or simply use StreamSource +# TODO: Move this to sources.py class StubSource: def __init__(self, stream: dp.Stream, label: str | None = None) -> None: """ @@ -146,13 +146,15 @@ def __init__(self, stream: dp.Stream, label: str | None = None) -> None: self.label = label or stream.label self.stream = stream - def output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. """ assert len(streams) == 0, "StubKernel should not have any input streams." 
- return self.stream.types() + return self.stream.types(include_system_tags=include_system_tags) @property def kernel_id(self) -> tuple[str, ...]: diff --git a/src/orcapod/execution_engines/ray_execution_engine.py b/src/orcapod/execution_engines/ray_execution_engine.py index 6e581f9..d3443df 100644 --- a/src/orcapod/execution_engines/ray_execution_engine.py +++ b/src/orcapod/execution_engines/ray_execution_engine.py @@ -38,6 +38,10 @@ def __init__(self, ray_address: str | None = None, **ray_init_kwargs): logger.info("Native Ray async engine initialized") logger.info(f"Cluster resources: {ray.cluster_resources()}") + @property + def name(self) -> str: + return "ray" + def submit_sync(self, func: Callable[..., T], *args, **kwargs) -> T: """ Submit a function synchronously using Ray. diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 5a2779e..fe185f4 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -43,6 +43,10 @@ def __init__( self.invocation_hash = self.data_context.object_hasher.hash_to_hex( self.identity_structure(()), prefix_hasher_id=True ) + tag_types, _ = self.types(include_system_tags=True) + self.tag_schema_hash = self.data_context.object_hasher.hash_to_hex( + tag_types, prefix_hasher_id=True + ) @property def contained_kernel(self) -> dp.Kernel: @@ -56,7 +60,12 @@ def pipeline_path(self) -> tuple[str, ...]: Return the path to the pipeline run records. This is used to store the run-associated tag info. """ - return self.pipeline_path_prefix + self.kernel_id + (self.invocation_hash,) + # TODO: include output tag hash! + return ( + self.pipeline_path_prefix + + self.kernel_id + + (self.invocation_hash, self.tag_schema_hash) + ) def forward(self, *streams: dp.Stream) -> dp.Stream: if len(streams) > 0: @@ -67,12 +76,16 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: # super().validate_inputs(*self.input_streams) return super().forward(*self.input_streams) # type: ignore[return-value] - def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """ Return the output types of the node. This is used to determine the types of the output streams. """ - return self.contained_kernel.output_types(*self.input_streams) + return self.contained_kernel.output_types( + *self.input_streams, include_system_tags=include_system_tags + ) def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None @@ -140,10 +153,12 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An """ # construct identity structure from the node's information and the # contained kernel - if streams is not None and len(streams) > 0: - raise NotImplementedError( - "At this moment, Node does not yet support handling additional input streams." - ) + if streams is not None: + if len(streams) > 0: + raise NotImplementedError( + "At this moment, Node does not yet support handling additional input streams." + ) + return None return self.kernel.identity_structure(self.input_streams) def forward(self, *streams: dp.Stream) -> dp.Stream: diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index c70fec4..7273095 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1332,7 +1332,7 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ ... 
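The tag schema-based record separation introduced in this patch hinges on hashing the tag typespec (system tags included) and appending that digest to the pipeline storage path, alongside the invocation hash. A rough sketch of the idea, with hashlib standing in for orcapod's object_hasher and made-up path components:

import hashlib


def hash_typespec(typespec: dict[str, type]) -> str:
    # Stand-in for data_context.object_hasher.hash_to_hex: hash a stable rendering of the schema.
    rendering = ",".join(f"{name}:{tp.__name__}" for name, tp in sorted(typespec.items()))
    return hashlib.sha256(rendering.encode()).hexdigest()[:16]


tag_types = {"subject_id": str, "system_tag_entry_id": str}  # hypothetical tag schema with one system tag

pipeline_path_prefix = ("my_pipeline",)     # hypothetical
kernel_id = ("process_recording", "v1")     # hypothetical
invocation_hash = "inv-8f2c"                # hypothetical

tag_schema_hash = hash_typespec(tag_types)
pipeline_path = pipeline_path_prefix + kernel_id + (invocation_hash, tag_schema_hash)

# Runs whose output tag schema differs land under a different path,
# so their pipeline records never mix.
print(pipeline_path)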
- def types(self) -> tuple[TypeSpec, TypeSpec]: + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """ Type specifications for the stream content. @@ -1739,7 +1739,9 @@ def forward(self, *streams: Stream) -> Stream: """ ... - def output_types(self, *streams: Stream) -> tuple[TypeSpec, TypeSpec]: + def output_types( + self, *streams: Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: """ Determine output types without triggering computation. From e5b2efdf469f2e149eddaedef871736747b24e07 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 12 Aug 2025 20:47:15 +0000 Subject: [PATCH 185/224] build: add mkdocs and minor cleanups --- pyproject.toml | 2 + src/orcapod/data/system_constants.py | 5 + src/orcapod/pipeline/legacy_pipeline.py | 257 ------------------------ uv.lock | 193 ++++++++++++++++++ 4 files changed, 200 insertions(+), 257 deletions(-) delete mode 100644 src/orcapod/pipeline/legacy_pipeline.py diff --git a/pyproject.toml b/pyproject.toml index 7c48254..5ac2066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ version_file = "src/orcapod/_version.py" [dependency-groups] dev = [ "adlfs>=2024.12.0", + "boto3>=1.39.11", "deltalake>=1.0.2", "gcsfs>=2025.7.0", "httpie>=3.2.4", @@ -51,6 +52,7 @@ dev = [ "ipywidgets>=8.1.7", "jsonschema>=4.25.0", "minio>=7.2.16", + "mkdocs>=1.6.1", "pyarrow-stubs>=20.0.0.20250716", "pyiceberg>=0.9.1", "pytest>=8.3.5", diff --git a/src/orcapod/data/system_constants.py b/src/orcapod/data/system_constants.py index 527022b..edd3503 100644 --- a/src/orcapod/data/system_constants.py +++ b/src/orcapod/data/system_constants.py @@ -9,6 +9,7 @@ SYSTEM_TAG_PREFIX = "system_tag_" POD_VERSION = "pod_version" EXECUTION_ENGINE = "execution_engine" +POD_TIMESTAMP = "pod_ts" class SystemConstant: @@ -55,5 +56,9 @@ def POD_VERSION(self) -> str: def EXECUTION_ENGINE(self) -> str: return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{EXECUTION_ENGINE}" + @property + def POD_TIMESTAMP(self) -> str: + return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}{POD_TIMESTAMP}" + constants = SystemConstant() diff --git a/src/orcapod/pipeline/legacy_pipeline.py b/src/orcapod/pipeline/legacy_pipeline.py deleted file mode 100644 index 8c931f7..0000000 --- a/src/orcapod/pipeline/legacy_pipeline.py +++ /dev/null @@ -1,257 +0,0 @@ -from collections import defaultdict -from collections.abc import Collection -import logging -import pickle -import sys -import time -from pathlib import Path -from typing import Any - - -from orcapod.core import Invocation, Kernel, SyncStream -from orcapod.core.pod import FunctionPod -from orcapod.pipeline.legacy_nodes import KernelNode, FunctionPodNode, Node - -from orcapod.core.tracker import GraphTracker -from orcapod.stores import ArrowDataStore - -logger = logging.getLogger(__name__) - - -class SerializationError(Exception): - """Raised when pipeline cannot be serialized""" - - pass - - -class Pipeline(GraphTracker): - """ - Enhanced pipeline that tracks operations and provides queryable views. - Replaces the old Tracker with better persistence and view capabilities. 
- """ - - def __init__( - self, - name: str | tuple[str, ...], - pipeline_store: ArrowDataStore, - results_store: ArrowDataStore | None = None, - auto_compile: bool = True, - ) -> None: - super().__init__() - if not isinstance(name, tuple): - name = (name,) - self.name = name - self.pipeline_store_path_prefix = self.name - self.results_store_path_prefix = () - if results_store is None: - if pipeline_store is None: - raise ValueError( - "Either pipeline_store or results_store must be provided" - ) - results_store = pipeline_store - self.results_store_path_prefix = self.name + ("_results",) - - self.pipeline_store = pipeline_store - self.results_store = results_store - self.nodes = {} - self.auto_compile = auto_compile - self._dirty = False - self._ordered_nodes = [] # Track order of invocations - - # Core Pipeline Operations - def save(self, path: Path | str) -> None: - """Save complete pipeline state - named functions only""" - path = Path(path) - - # Validate serializability first - self._validate_serializable() - - state = { - "name": self.name, - "invocation_lut": self.invocation_lut, - "metadata": { - "created_at": time.time(), - "python_version": sys.version_info[:2], - "orcapod_version": "0.1.0", # TODO: make this dynamic - }, - } - - # Atomic write - temp_path = path.with_suffix(".tmp") - try: - with open(temp_path, "wb") as f: - pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL) - temp_path.replace(path) - logger.info(f"Pipeline '{self.name}' saved to {path}") - except Exception: - if temp_path.exists(): - temp_path.unlink() - raise - - def flush(self) -> None: - """Flush all pending writes to the data store""" - self.pipeline_store.flush() - self.results_store.flush() - logger.info("Pipeline stores flushed") - - def record(self, invocation: Invocation) -> None: - """ - Record an invocation in the pipeline. - This method is called automatically by the Kernel when an operation is invoked. 
- """ - super().record(invocation) - self._dirty = True - - def wrap_invocation( - self, kernel: Kernel, input_nodes: Collection[Node], label: str | None = None - ) -> Node: - if isinstance(kernel, FunctionPod): - return FunctionPodNode( - kernel, - input_nodes, - output_store=self.results_store, - tag_store=self.pipeline_store, - output_store_path_prefix=self.results_store_path_prefix, - tag_store_path_prefix=self.pipeline_store_path_prefix, - label=label, - ) - return KernelNode( - kernel, - input_nodes, - output_store=self.pipeline_store, - store_path_prefix=self.pipeline_store_path_prefix, - label=label, - ) - - def compile(self): - import networkx as nx - - G = self.generate_graph() - - # Proposed labels for each Kernel in the graph - # If name collides, unique name is generated by appending an index - proposed_labels = defaultdict(list) - node_lut = {} - edge_lut: dict[SyncStream, Node] = {} - ordered_nodes = [] - for invocation in nx.topological_sort(G): - # map streams to the new streams based on Nodes - input_nodes = [edge_lut[stream] for stream in invocation.streams] - label = None - if invocation.has_assigned_label: - # If the invocation has a label, use it directly - label = invocation.label - new_node = self.wrap_invocation(invocation.kernel, input_nodes, label=label) - - # register the new node against the original invocation - node_lut[invocation] = new_node - ordered_nodes.append(new_node) - # register the new node in the proposed labels -- if duplicates occur, will resolve later - proposed_labels[new_node.label].append(new_node) - - for edge in G.out_edges(invocation): - edge_lut[G.edges[edge]["stream"]] = new_node - - self._ordered_nodes = ordered_nodes - - # resolve duplicates in proposed_labels - labels_to_nodes = {} - for label, nodes in proposed_labels.items(): - if len(nodes) > 1: - # If multiple nodes have the same label, append index to make it unique - for idx, node in enumerate(nodes): - node.label = f"{label}_{idx}" - labels_to_nodes[node.label] = node - else: - # If only one node, keep the original label - nodes[0].label = label - labels_to_nodes[label] = nodes[0] - - # store as pipeline's nodes attribute - self.nodes = labels_to_nodes - self._dirty = False - return node_lut, edge_lut, proposed_labels, labels_to_nodes - - def __exit__(self, exc_type, exc_val, ext_tb): - super().__exit__(exc_type, exc_val, ext_tb) - if self.auto_compile: - self.compile() - - def __getattr__(self, item: str) -> Any: - """Allow direct access to pipeline attributes""" - if item in self.nodes: - return self.nodes[item] - raise AttributeError(f"Pipeline has no attribute '{item}'") - - def __dir__(self): - # Include both regular attributes and dynamic ones - return list(super().__dir__()) + list(self.nodes.keys()) - - def rename(self, old_name: str, new_name: str) -> None: - """ - Rename a node in the pipeline. - This will update the label and the internal mapping. - """ - if old_name not in self.nodes: - raise KeyError(f"Node '{old_name}' does not exist in the pipeline.") - if new_name in self.nodes: - raise KeyError(f"Node '{new_name}' already exists in the pipeline.") - node = self.nodes[old_name] - del self.nodes[old_name] - node.label = new_name - self.nodes[new_name] = node - logger.info(f"Node '{old_name}' renamed to '{new_name}'") - - def run(self, full_sync: bool = False) -> None: - """ - Run the pipeline, compiling it if necessary. - This method is a no-op if auto_compile is False. 
- """ - if self.auto_compile and self._dirty: - self.compile() - - # Run in topological order - for node in self._ordered_nodes: - if full_sync: - node.reset_cache() - node.flow() - - self.flush() - - @classmethod - def load(cls, path: Path | str) -> "Pipeline": - """Load complete pipeline state""" - path = Path(path) - - with open(path, "rb") as f: - state = pickle.load(f) - - pipeline = cls(state["name"], state["output_store"]) - pipeline.invocation_lut = state["invocation_lut"] - - logger.info(f"Pipeline '{pipeline.name}' loaded from {path}") - return pipeline - - def _validate_serializable(self) -> None: - """Ensure pipeline contains only serializable operations""" - issues = [] - - for operation, invocations in self.invocation_lut.items(): - # Check for lambda functions - if hasattr(operation, "function"): - func = getattr(operation, "function", None) - if func and hasattr(func, "__name__") and func.__name__ == "": - issues.append(f"Lambda function in {operation.__class__.__name__}") - - # Test actual serializability - try: - pickle.dumps(operation) - except Exception as e: - issues.append(f"Non-serializable operation {operation}: {e}") - - if issues: - raise SerializationError( - "Pipeline contains non-serializable elements:\n" - + "\n".join(f" - {issue}" for issue in issues) - + "\n\nOnly named functions are supported for serialization." - ) diff --git a/uv.lock b/uv.lock index 43629fc..ec6bcb4 100644 --- a/uv.lock +++ b/uv.lock @@ -357,6 +357,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl", hash = "sha256:b6a1bd56c72f31b0a496a36cc55df6e2f475db166ad07fa4acc7e74f4c7f34c0", size = 1191340, upload-time = "2025-05-22T05:09:24.606Z" }, ] +[[package]] +name = "boto3" +version = "1.39.11" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b6/2e/ed75ea3ee0fd1afacc3379bc2b7457c67a6b0f0e554e1f7ccbdbaed2351b/boto3-1.39.11.tar.gz", hash = "sha256:3027edf20642fe1d5f9dc50a420d0fe2733073ed6a9f0f047b60fe08c3682132", size = 111869, upload-time = "2025-07-22T19:26:50.867Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/66/88566a6484e746c0b075f7c9bb248e8548eda0a486de4460d150a41e2d57/boto3-1.39.11-py3-none-any.whl", hash = "sha256:af8f1dad35eceff7658fab43b39b0f55892b6e3dd12308733521cc24dd2c9a02", size = 139900, upload-time = "2025-07-22T19:26:48.706Z" }, +] + [[package]] name = "botocore" version = "1.39.11" @@ -833,6 +847,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/21/f5/54bccbee01efbc25581db6aafefb6f6c277d880930f7a083b10052382463/gcsfs-2025.7.0-py2.py3-none-any.whl", hash = "sha256:653503331d58cb02bb34e725d4595d166e93f7f2f3ff88e4c66ef535ae66eae5", size = 36815, upload-time = "2025-07-15T16:49:20.333Z" }, ] +[[package]] +name = "ghp-import" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d9/29/d40217cbe2f6b1359e00c6c307bb3fc876ba74068cbab3dde77f03ca0dc4/ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343", size = 10943, upload-time = "2022-05-02T15:47:16.11Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, +] + [[package]] name = "google-api-core" version = "2.25.1" @@ -1123,6 +1149,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, ] +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + [[package]] name = "jmespath" version = "1.0.1" @@ -1249,6 +1287,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4c/fa/be89a49c640930180657482a74970cdcf6f7072c8d2471e1babe17a222dc/kiwisolver-1.4.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:be4816dc51c8a471749d664161b434912eee82f2ea66bd7628bd14583a833e85", size = 2349213, upload-time = "2024-12-24T18:30:40.019Z" }, ] +[[package]] +name = "markdown" +version = "3.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/c2/4ab49206c17f75cb08d6311171f2d65798988db4360c4d1485bd0eedd67c/markdown-3.8.2.tar.gz", hash = "sha256:247b9a70dd12e27f67431ce62523e675b866d254f900c4fe75ce3dda62237c45", size = 362071, upload-time = "2025-06-19T17:12:44.483Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/2b/34cc11786bc00d0f04d0f5fdc3a2b1ae0b6239eef72d3d345805f9ad92a1/markdown-3.8.2-py3-none-any.whl", hash = "sha256:5c83764dbd4e00bdd94d85a19b8d55ccca20fe35b2e678a1422b380324dd5f24", size = 106827, upload-time = "2025-06-19T17:12:42.994Z" }, +] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -1261,6 +1308,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" }, ] +[[package]] +name = "markupsafe" +version = "3.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537, upload-time = "2024-10-18T15:21:54.129Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = 
"sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274, upload-time = "2024-10-18T15:21:13.777Z" }, + { url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348, upload-time = "2024-10-18T15:21:14.822Z" }, + { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149, upload-time = "2024-10-18T15:21:15.642Z" }, + { url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118, upload-time = "2024-10-18T15:21:17.133Z" }, + { url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993, upload-time = "2024-10-18T15:21:18.064Z" }, + { url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178, upload-time = "2024-10-18T15:21:18.859Z" }, + { url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319, upload-time = "2024-10-18T15:21:19.671Z" }, + { url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352, upload-time = "2024-10-18T15:21:20.971Z" }, + { url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097, upload-time = "2024-10-18T15:21:22.646Z" }, + { url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601, upload-time = "2024-10-18T15:21:23.499Z" }, + { url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274, upload-time = "2024-10-18T15:21:24.577Z" }, + { url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352, upload-time = "2024-10-18T15:21:25.382Z" }, + { url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122, upload-time = "2024-10-18T15:21:26.199Z" }, + { url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085, upload-time = "2024-10-18T15:21:27.029Z" }, + { url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978, upload-time = "2024-10-18T15:21:27.846Z" }, + { url = "https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208, upload-time = "2024-10-18T15:21:28.744Z" }, + { url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357, upload-time = "2024-10-18T15:21:29.545Z" }, + { url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344, upload-time = "2024-10-18T15:21:30.366Z" }, + { url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101, upload-time = "2024-10-18T15:21:31.207Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603, upload-time = "2024-10-18T15:21:32.032Z" }, + { url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510, upload-time = "2024-10-18T15:21:33.625Z" }, + { url = "https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486, upload-time = "2024-10-18T15:21:34.611Z" }, + { url = "https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480, upload-time = "2024-10-18T15:21:35.398Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914, upload-time = "2024-10-18T15:21:36.231Z" }, + { url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796, upload-time = "2024-10-18T15:21:37.073Z" }, + { url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473, upload-time = "2024-10-18T15:21:37.932Z" }, + { url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114, upload-time = "2024-10-18T15:21:39.799Z" }, + { url = "https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098, upload-time = "2024-10-18T15:21:40.813Z" }, + { url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208, upload-time = "2024-10-18T15:21:41.814Z" }, + { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" }, +] + [[package]] name = "matplotlib" version = "3.10.3" @@ -1319,6 +1404,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "mergedeep" +version = "1.3.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3a/41/580bb4006e3ed0361b8151a01d324fb03f420815446c7def45d02f74c270/mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8", size = 4661, upload-time = "2021-02-05T18:55:30.623Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307", size = 6354, upload-time = "2021-02-05T18:55:29.583Z" }, +] + [[package]] name = "minio" version = "7.2.16" @@ -1335,6 +1429,44 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/89/a3/00260f8df72b51afa1f182dd609533c77fa2407918c4c2813d87b4a56725/minio-7.2.16-py3-none-any.whl", hash = "sha256:9288ab988ca57c181eb59a4c96187b293131418e28c164392186c2b89026b223", size = 95750, upload-time = "2025-07-21T20:11:14.139Z" }, ] +[[package]] +name = "mkdocs" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "ghp-import" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mergedeep" }, + { name = "mkdocs-get-deps" }, + { name = "packaging" }, + { name = "pathspec" }, + { name = "pyyaml" }, + { name = "pyyaml-env-tag" }, + { name = "watchdog" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bc/c6/bbd4f061bd16b378247f12953ffcb04786a618ce5e904b8c5a01a0309061/mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2", size = 3889159, upload-time = "2024-08-30T12:24:06.899Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" }, +] + +[[package]] +name = "mkdocs-get-deps" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mergedeep" }, + { name = "platformdirs" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/f5/ed29cd50067784976f25ed0ed6fcd3c2ce9eb90650aa3b2796ddf7b6870b/mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c", size = 10239, upload-time = "2023-11-20T17:51:09.981Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134", size = 9521, upload-time = "2023-11-20T17:51:08.587Z" }, +] + [[package]] name = "mmh3" version = "5.1.0" @@ -1675,6 +1807,7 @@ redis = [ [package.dev-dependencies] dev = [ { name = "adlfs" }, + { name = "boto3" }, { name = "deltalake" }, { name = "gcsfs" }, { name = "httpie" }, @@ -1682,6 +1815,7 @@ dev = [ { name = "ipywidgets" }, { name = "jsonschema" }, { name = "minio" }, + { name = "mkdocs" }, { name = "pyarrow-stubs" }, { name = "pyiceberg" }, { name = "pytest" }, @@ -1716,6 +1850,7 @@ provides-extras = ["redis", "ray", "all"] [package.metadata.requires-dev] dev = [ { name = "adlfs", specifier = ">=2024.12.0" }, + { name = "boto3", specifier = ">=1.39.11" }, { name = "deltalake", specifier = ">=1.0.2" }, { name = "gcsfs", specifier = ">=2025.7.0" }, { name = "httpie", specifier = ">=3.2.4" }, @@ -1723,6 +1858,7 @@ dev = [ { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "jsonschema", specifier = ">=4.25.0" }, { name = "minio", specifier = ">=7.2.16" }, + { name = "mkdocs", specifier = ">=1.6.1" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pytest", specifier = ">=8.3.5" }, @@ -1787,6 +1923,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = 
"sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, ] +[[package]] +name = "pathspec" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -2352,6 +2497,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" }, ] +[[package]] +name = "pyyaml-env-tag" +version = "1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/2e/79c822141bfd05a853236b504869ebc6b70159afc570e1d5a20641782eaa/pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff", size = 5737, upload-time = "2025-05-13T15:24:01.64Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04", size = 4722, upload-time = "2025-05-13T15:23:59.629Z" }, +] + [[package]] name = "pyzmq" version = "26.4.0" @@ -2645,6 +2802,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ff/c7/30d13b7fd4f866ca3f30e9a6e7ae038f0c45226f6e26b3cc98d6d197f93b/s3fs-2025.7.0-py3-none-any.whl", hash = "sha256:b6b2d3f84b6aa1c2ba5e62e39dd9410cf54f10a2cce1ea6db1ba0d1a6bcce685", size = 30315, upload-time = "2025-07-15T16:35:20.734Z" }, ] +[[package]] +name = "s3transfer" +version = "0.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/05/d52bf1e65044b4e5e27d4e63e8d1579dbdec54fce685908ae09bc3720030/s3transfer-0.13.1.tar.gz", hash = "sha256:c3fdba22ba1bd367922f27ec8032d6a1cf5f10c934fb5d68cf60fd5a23d936cf", size = 150589, upload-time = "2025-07-18T19:22:42.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6d/4f/d073e09df851cfa251ef7840007d04db3293a0482ce607d2b993926089be/s3transfer-0.13.1-py3-none-any.whl", hash = "sha256:a981aa7429be23fe6dfc13e80e4020057cbab622b08c0315288758d67cabc724", size = 85308, upload-time = "2025-07-18T19:22:40.947Z" }, +] + [[package]] name = "setuptools" version = "80.9.0" @@ -2829,6 +2998,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/87/b22cf40cdf7e2b2bf83f38a94d2c90c5ad6c304896e5a12d0c08a602eb59/virtualenv-20.33.0-py3-none-any.whl", hash = "sha256:106b6baa8ab1b526d5a9b71165c85c456fbd49b16976c88e2bc9352ee3bc5d3f", size = 6060205, upload-time = "2025-08-03T08:09:16.674Z" }, ] +[[package]] +name = "watchdog" +version = "6.0.0" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/ea/3930d07dafc9e286ed356a679aa02d777c06e9bfd1164fa7c19c288a5483/watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948", size = 96471, upload-time = "2024-11-01T14:06:37.745Z" }, + { url = "https://files.pythonhosted.org/packages/12/87/48361531f70b1f87928b045df868a9fd4e253d9ae087fa4cf3f7113be363/watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860", size = 88449, upload-time = "2024-11-01T14:06:39.748Z" }, + { url = "https://files.pythonhosted.org/packages/5b/7e/8f322f5e600812e6f9a31b75d242631068ca8f4ef0582dd3ae6e72daecc8/watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0", size = 89054, upload-time = "2024-11-01T14:06:41.009Z" }, + { url = "https://files.pythonhosted.org/packages/68/98/b0345cabdce2041a01293ba483333582891a3bd5769b08eceb0d406056ef/watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c", size = 96480, upload-time = "2024-11-01T14:06:42.952Z" }, + { url = "https://files.pythonhosted.org/packages/85/83/cdf13902c626b28eedef7ec4f10745c52aad8a8fe7eb04ed7b1f111ca20e/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134", size = 88451, upload-time = "2024-11-01T14:06:45.084Z" }, + { url = "https://files.pythonhosted.org/packages/fe/c4/225c87bae08c8b9ec99030cd48ae9c4eca050a59bf5c2255853e18c87b50/watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b", size = 89057, upload-time = "2024-11-01T14:06:47.324Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c7/ca4bf3e518cb57a686b2feb4f55a1892fd9a3dd13f470fca14e00f80ea36/watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13", size = 79079, upload-time = "2024-11-01T14:06:59.472Z" }, + { url = "https://files.pythonhosted.org/packages/5c/51/d46dc9332f9a647593c947b4b88e2381c8dfc0942d15b8edc0310fa4abb1/watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379", size = 79078, upload-time = "2024-11-01T14:07:01.431Z" }, + { url = "https://files.pythonhosted.org/packages/d4/57/04edbf5e169cd318d5f07b4766fee38e825d64b6913ca157ca32d1a42267/watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e", size = 79076, upload-time = "2024-11-01T14:07:02.568Z" }, + { url = "https://files.pythonhosted.org/packages/ab/cc/da8422b300e13cb187d2203f20b9253e91058aaf7db65b74142013478e66/watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f", size = 79077, upload-time = "2024-11-01T14:07:03.893Z" }, + { url = 
"https://files.pythonhosted.org/packages/2c/3b/b8964e04ae1a025c44ba8e4291f86e97fac443bca31de8bd98d3263d2fcf/watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26", size = 79078, upload-time = "2024-11-01T14:07:05.189Z" }, + { url = "https://files.pythonhosted.org/packages/62/ae/a696eb424bedff7407801c257d4b1afda455fe40821a2be430e173660e81/watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c", size = 79077, upload-time = "2024-11-01T14:07:06.376Z" }, + { url = "https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2", size = 79078, upload-time = "2024-11-01T14:07:07.547Z" }, + { url = "https://files.pythonhosted.org/packages/07/f6/d0e5b343768e8bcb4cda79f0f2f55051bf26177ecd5651f84c07567461cf/watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a", size = 79065, upload-time = "2024-11-01T14:07:09.525Z" }, + { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070, upload-time = "2024-11-01T14:07:10.686Z" }, + { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" }, +] + [[package]] name = "wcwidth" version = "0.2.13" From bff569b1d5f8de751d6960add0f853fb9060c079 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 12 Aug 2025 20:47:37 +0000 Subject: [PATCH 186/224] feat: keep track of timestamps for data addition --- src/orcapod/data/pods.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 8b4dbea..80536d6 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -740,6 +740,12 @@ def record_packet( pa.array([execution_engine_hash], type=pa.large_string()), ) + timestamp = datetime.now(timezone.utc) + data_table = data_table.append_column( + constants.POD_TIMESTAMP, + pa.array([timestamp], type=pa.timestamp("us", tz="UTC")), + ) + if record_id is None: record_id = self.get_record_id( input_packet, execution_engine_hash=execution_engine_hash From 43fe9d4958ac87ea79af4569b128e4214c3fc753 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 12 Aug 2025 20:55:36 +0000 Subject: [PATCH 187/224] test: add comprehensive tests for datagrams --- tests/__init__.py | 0 tests/test_data/__init__.py | 0 tests/test_data/test_datagrams/__init__.py | 1 + .../test_datagrams/test_arrow_datagram.py | 934 ++++++++++++++++++ .../test_datagrams/test_arrow_tag_packet.py | 844 ++++++++++++++++ .../test_datagrams/test_base_integration.py | 594 +++++++++++ .../test_datagrams/test_dict_datagram.py | 765 ++++++++++++++ .../test_datagrams/test_dict_tag_packet.py | 566 +++++++++++ 8 files changed, 3704 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/test_data/__init__.py create mode 100644 tests/test_data/test_datagrams/__init__.py create mode 100644 tests/test_data/test_datagrams/test_arrow_datagram.py create mode 100644 tests/test_data/test_datagrams/test_arrow_tag_packet.py create mode 100644 tests/test_data/test_datagrams/test_base_integration.py create mode 100644 tests/test_data/test_datagrams/test_dict_datagram.py create mode 100644 tests/test_data/test_datagrams/test_dict_tag_packet.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/__init__.py b/tests/test_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/test_datagrams/__init__.py b/tests/test_data/test_datagrams/__init__.py new file mode 100644 index 0000000..94f78e2 --- /dev/null +++ b/tests/test_data/test_datagrams/__init__.py @@ -0,0 +1 @@ +# Test package for datagrams diff --git a/tests/test_data/test_datagrams/test_arrow_datagram.py b/tests/test_data/test_datagrams/test_arrow_datagram.py new file mode 100644 index 0000000..44fe537 --- /dev/null +++ b/tests/test_data/test_datagrams/test_arrow_datagram.py @@ -0,0 +1,934 @@ +""" +Comprehensive tests for ArrowDatagram class. 
+ +This module tests all functionality of the ArrowDatagram class including: +- Initialization and validation +- Dict-like interface operations +- Structural information methods +- Format conversion methods +- Meta column operations +- Data column operations +- Context operations +- Utility operations +""" + +import pytest +import pyarrow as pa +from datetime import datetime, date + +from orcapod.data.datagrams import ArrowDatagram +from orcapod.data.system_constants import constants + + +class TestArrowDatagramInitialization: + """Test ArrowDatagram initialization and basic properties.""" + + def test_basic_initialization(self): + """Test basic initialization with PyArrow table.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5]} + ) + + datagram = ArrowDatagram(table) + + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + assert datagram["score"] == 85.5 + + def test_initialization_multiple_rows_fails(self): + """Test initialization with multiple rows fails.""" + table = pa.Table.from_pydict({"user_id": [123, 456], "name": ["Alice", "Bob"]}) + + with pytest.raises(ValueError, match="exactly one row"): + ArrowDatagram(table) + + def test_initialization_empty_table_fails(self): + """Test initialization with empty table fails.""" + table = pa.Table.from_pydict({"user_id": [], "name": []}) + + with pytest.raises(ValueError, match="exactly one row"): + ArrowDatagram(table) + + def test_initialization_with_meta_info(self): + """Test initialization with meta information.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + meta_info = {"pipeline_version": "v1.0", "timestamp": "2024-01-01"} + + datagram = ArrowDatagram(table, meta_info=meta_info) + + assert datagram["user_id"] == 123 + assert datagram.get_meta_value("pipeline_version") == "v1.0" + assert datagram.get_meta_value("timestamp") == "2024-01-01" + + def test_initialization_with_context_in_table(self): + """Test initialization when context is included in table.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + constants.CONTEXT_KEY: ["v0.1"], + } + ) + + datagram = ArrowDatagram(table) + + assert datagram.data_context_key == "std:v0.1:default" + assert constants.CONTEXT_KEY not in datagram._data_table.column_names + + def test_initialization_with_meta_columns_in_table(self): + """Test initialization when meta columns are included in table.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "__version": ["1.0"], + "__timestamp": ["2024-01-01"], + } + ) + + datagram = ArrowDatagram(table) + + assert datagram["user_id"] == 123 + assert datagram.get_meta_value("version") == "1.0" + assert datagram.get_meta_value("timestamp") == "2024-01-01" + + def test_initialization_with_explicit_context(self): + """Test initialization with explicit data context.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + + datagram = ArrowDatagram(table, data_context="v0.1") + + assert datagram.data_context_key == "std:v0.1:default" + + def test_initialization_no_data_columns_fails(self): + """Test initialization with no data columns fails.""" + table = pa.Table.from_pydict( + {"__version": ["1.0"], constants.CONTEXT_KEY: ["v0.1"]} + ) + + with pytest.raises(ValueError, match="at least one data column"): + ArrowDatagram(table) + + +class TestArrowDatagramDictInterface: + """Test dict-like interface operations.""" + + @pytest.fixture + def sample_datagram(self): + """Create a sample datagram for 
testing.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5], "active": [True]} + ) + return ArrowDatagram(table) + + def test_getitem(self, sample_datagram): + """Test __getitem__ method.""" + assert sample_datagram["user_id"] == 123 + assert sample_datagram["name"] == "Alice" + assert sample_datagram["score"] == 85.5 + assert sample_datagram["active"] is True + + def test_getitem_missing_key(self, sample_datagram): + """Test __getitem__ with missing key raises KeyError.""" + with pytest.raises(KeyError): + _ = sample_datagram["nonexistent"] + + def test_contains(self, sample_datagram): + """Test __contains__ method.""" + assert "user_id" in sample_datagram + assert "name" in sample_datagram + assert "nonexistent" not in sample_datagram + + def test_iter(self, sample_datagram): + """Test __iter__ method.""" + keys = list(sample_datagram) + expected_keys = ["user_id", "name", "score", "active"] + assert set(keys) == set(expected_keys) + + def test_get(self, sample_datagram): + """Test get method with and without default.""" + assert sample_datagram.get("user_id") == 123 + assert sample_datagram.get("nonexistent") is None + assert sample_datagram.get("nonexistent", "default") == "default" + + +class TestArrowDatagramStructuralInfo: + """Test structural information methods.""" + + @pytest.fixture + def datagram_with_meta(self): + """Create a datagram with meta columns.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "__version": ["1.0"], + "__pipeline_id": ["test_pipeline"], + } + ) + return ArrowDatagram(table) + + def test_keys_data_only(self, datagram_with_meta): + """Test keys method with data columns only.""" + keys = datagram_with_meta.keys() + expected = ("user_id", "name") + assert set(keys) == set(expected) + + def test_keys_with_meta_columns(self, datagram_with_meta): + """Test keys method including meta columns.""" + keys = datagram_with_meta.keys(include_meta_columns=True) + expected = ("user_id", "name", "__version", "__pipeline_id") + assert set(keys) == set(expected) + + def test_keys_with_context(self, datagram_with_meta): + """Test keys method including context.""" + keys = datagram_with_meta.keys(include_context=True) + expected = ("user_id", "name", constants.CONTEXT_KEY) + assert set(keys) == set(expected) + + def test_keys_with_all_info(self, datagram_with_meta): + """Test keys method including all information.""" + keys = datagram_with_meta.keys(include_all_info=True) + expected = ( + "user_id", + "name", + "__version", + "__pipeline_id", + constants.CONTEXT_KEY, + ) + assert set(keys) == set(expected) + + def test_keys_with_specific_meta_prefix(self, datagram_with_meta): + """Test keys method with specific meta column prefixes.""" + keys = datagram_with_meta.keys(include_meta_columns=["__version"]) + expected = ("user_id", "name", "__version") + assert set(keys) == set(expected) + + def test_types_data_only(self, datagram_with_meta): + """Test types method with data columns only.""" + types = datagram_with_meta.types() + expected_keys = {"user_id", "name"} + assert set(types.keys()) == expected_keys + assert types["user_id"] is int + assert types["name"] is str + + def test_types_with_meta_columns(self, datagram_with_meta): + """Test types method including meta columns.""" + types = datagram_with_meta.types(include_meta_columns=True) + expected_keys = {"user_id", "name", "__version", "__pipeline_id"} + assert set(types.keys()) == expected_keys + + def test_types_with_context(self, 
datagram_with_meta): + """Test types method including context.""" + types = datagram_with_meta.types(include_context=True) + expected_keys = {"user_id", "name", constants.CONTEXT_KEY} + assert set(types.keys()) == expected_keys + assert types[constants.CONTEXT_KEY] is str + + def test_arrow_schema_data_only(self, datagram_with_meta): + """Test arrow_schema method with data columns only.""" + schema = datagram_with_meta.arrow_schema() + expected_names = {"user_id", "name"} + assert set(schema.names) == expected_names + + def test_arrow_schema_with_meta_columns(self, datagram_with_meta): + """Test arrow_schema method including meta columns.""" + schema = datagram_with_meta.arrow_schema(include_meta_columns=True) + expected_names = {"user_id", "name", "__version", "__pipeline_id"} + assert set(schema.names) == expected_names + + def test_arrow_schema_with_context(self, datagram_with_meta): + """Test arrow_schema method including context.""" + schema = datagram_with_meta.arrow_schema(include_context=True) + expected_names = {"user_id", "name", constants.CONTEXT_KEY} + assert set(schema.names) == expected_names + + def test_content_hash(self, datagram_with_meta): + """Test content hash calculation.""" + hash1 = datagram_with_meta.content_hash() + hash2 = datagram_with_meta.content_hash() + + # Hash should be consistent + assert hash1 == hash2 + assert isinstance(hash1, str) + assert len(hash1) > 0 + + def test_content_hash_different_data(self): + """Test that different data produces different hashes.""" + table1 = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + table2 = pa.Table.from_pydict({"user_id": [456], "name": ["Bob"]}) + + datagram1 = ArrowDatagram(table1) + datagram2 = ArrowDatagram(table2) + + hash1 = datagram1.content_hash() + hash2 = datagram2.content_hash() + + assert hash1 != hash2 + + +class TestArrowDatagramFormatConversions: + """Test format conversion methods.""" + + @pytest.fixture + def datagram_with_all(self): + """Create a datagram with data, meta, and context.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "__version": ["1.0"], + constants.CONTEXT_KEY: ["v0.1"], + } + ) + return ArrowDatagram(table) + + def test_as_dict_data_only(self, datagram_with_all): + """Test as_dict method with data columns only.""" + result = datagram_with_all.as_dict() + expected = {"user_id": 123, "name": "Alice"} + assert result == expected + + def test_as_dict_with_meta_columns(self, datagram_with_all): + """Test as_dict method including meta columns.""" + result = datagram_with_all.as_dict(include_meta_columns=True) + expected = {"user_id": 123, "name": "Alice", "__version": "1.0"} + assert result == expected + + def test_as_dict_with_context(self, datagram_with_all): + """Test as_dict method including context.""" + result = datagram_with_all.as_dict(include_context=True) + expected = { + "user_id": 123, + "name": "Alice", + constants.CONTEXT_KEY: "std:v0.1:default", + } + assert result == expected + + def test_as_dict_with_all_info(self, datagram_with_all): + """Test as_dict method including all information.""" + result = datagram_with_all.as_dict(include_all_info=True) + expected = { + "user_id": 123, + "name": "Alice", + "__version": "1.0", + constants.CONTEXT_KEY: "std:v0.1:default", + } + assert result == expected + + def test_as_table_data_only(self, datagram_with_all): + """Test as_table method with data columns only.""" + table = datagram_with_all.as_table() + + assert len(table) == 1 + assert set(table.column_names) == {"user_id", 
"name"} + assert table["user_id"].to_pylist()[0] == 123 + assert table["name"].to_pylist()[0] == "Alice" + + def test_as_table_with_meta_columns(self, datagram_with_all): + """Test as_table method including meta columns.""" + table = datagram_with_all.as_table(include_meta_columns=True) + + assert len(table) == 1 + expected_columns = {"user_id", "name", "__version"} + assert set(table.column_names) == expected_columns + + def test_as_table_with_context(self, datagram_with_all): + """Test as_table method including context.""" + table = datagram_with_all.as_table(include_context=True) + + assert len(table) == 1 + expected_columns = {"user_id", "name", constants.CONTEXT_KEY} + assert set(table.column_names) == expected_columns + + def test_as_arrow_compatible_dict(self, datagram_with_all): + """Test as_arrow_compatible_dict method.""" + result = datagram_with_all.as_arrow_compatible_dict() + + # Should have same keys as as_dict + dict_result = datagram_with_all.as_dict() + assert set(result.keys()) == set(dict_result.keys()) + + +class TestArrowDatagramMetaOperations: + """Test meta column operations.""" + + @pytest.fixture + def datagram_with_meta(self): + """Create a datagram with meta columns.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "__version": ["1.0"], + "__pipeline_id": ["test"], + } + ) + return ArrowDatagram(table) + + def test_meta_columns_property(self, datagram_with_meta): + """Test meta_columns property.""" + meta_cols = datagram_with_meta.meta_columns + expected = ("__version", "__pipeline_id") + assert set(meta_cols) == set(expected) + + def test_get_meta_value(self, datagram_with_meta): + """Test get_meta_value method.""" + # With prefix + assert datagram_with_meta.get_meta_value("__version") == "1.0" + + # Without prefix + assert datagram_with_meta.get_meta_value("version") == "1.0" + + # With default + assert datagram_with_meta.get_meta_value("nonexistent", "default") == "default" + + def test_with_meta_columns(self, datagram_with_meta): + """Test with_meta_columns method.""" + updated = datagram_with_meta.with_meta_columns( + version="2.0", # Update existing + new_meta="new_value", # Add new + ) + + # Original should be unchanged + assert datagram_with_meta.get_meta_value("version") == "1.0" + + # Updated should have new values + assert updated.get_meta_value("version") == "2.0" + assert updated.get_meta_value("new_meta") == "new_value" + + # Data should be preserved + assert updated["user_id"] == 123 + assert updated["name"] == "Alice" + + def test_with_meta_columns_prefixed_keys(self, datagram_with_meta): + """Test with_meta_columns method with prefixed keys.""" + updated = datagram_with_meta.with_meta_columns(__version="2.0") + + assert updated.get_meta_value("version") == "2.0" + + def test_drop_meta_columns(self, datagram_with_meta): + """Test drop_meta_columns method.""" + updated = datagram_with_meta.drop_meta_columns("version") + + # Original should be unchanged + assert datagram_with_meta.get_meta_value("version") == "1.0" + + # Updated should not have dropped column + assert updated.get_meta_value("version") is None + assert updated.get_meta_value("pipeline_id") == "test" + + # Data should be preserved + assert updated["user_id"] == 123 + + def test_drop_meta_columns_prefixed(self, datagram_with_meta): + """Test drop_meta_columns method with prefixed keys.""" + updated = datagram_with_meta.drop_meta_columns("__version") + + assert updated.get_meta_value("version") is None + + def test_drop_meta_columns_multiple(self, 
datagram_with_meta): + """Test dropping multiple meta columns.""" + updated = datagram_with_meta.drop_meta_columns("version", "pipeline_id") + + assert updated.get_meta_value("version") is None + assert updated.get_meta_value("pipeline_id") is None + + # Data should be preserved + assert updated["user_id"] == 123 + + def test_drop_meta_columns_missing_key(self, datagram_with_meta): + """Test drop_meta_columns with missing key raises KeyError.""" + with pytest.raises(KeyError): + datagram_with_meta.drop_meta_columns("nonexistent") + + def test_drop_meta_columns_ignore_missing(self, datagram_with_meta): + """Test drop_meta_columns with ignore_missing=True.""" + updated = datagram_with_meta.drop_meta_columns( + "version", "nonexistent", ignore_missing=True + ) + + assert updated.get_meta_value("version") is None + assert updated.get_meta_value("pipeline_id") == "test" + + +class TestArrowDatagramDataOperations: + """Test data column operations.""" + + @pytest.fixture + def sample_datagram(self): + """Create a sample datagram for testing.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5], "active": [True]} + ) + return ArrowDatagram(table) + + def test_select(self, sample_datagram): + """Test select method.""" + selected = sample_datagram.select("user_id", "name") + + assert set(selected.keys()) == {"user_id", "name"} + assert selected["user_id"] == 123 + assert selected["name"] == "Alice" + + # Original should be unchanged + assert set(sample_datagram.keys()) == {"user_id", "name", "score", "active"} + + def test_select_single_column(self, sample_datagram): + """Test select method with single column.""" + selected = sample_datagram.select("user_id") + + assert set(selected.keys()) == {"user_id"} + assert selected["user_id"] == 123 + + def test_select_missing_column(self, sample_datagram): + """Test select method with missing column raises ValueError.""" + with pytest.raises(ValueError): + sample_datagram.select("user_id", "nonexistent") + + def test_drop(self, sample_datagram): + """Test drop method.""" + dropped = sample_datagram.drop("score", "active") + + assert set(dropped.keys()) == {"user_id", "name"} + assert dropped["user_id"] == 123 + assert dropped["name"] == "Alice" + + # Original should be unchanged + assert set(sample_datagram.keys()) == {"user_id", "name", "score", "active"} + + def test_drop_single_column(self, sample_datagram): + """Test drop method with single column.""" + dropped = sample_datagram.drop("score") + + assert set(dropped.keys()) == {"user_id", "name", "active"} + + def test_drop_missing_column(self, sample_datagram): + """Test drop method with missing column raises KeyError.""" + with pytest.raises(KeyError): + sample_datagram.drop("nonexistent") + + def test_drop_ignore_missing(self, sample_datagram): + """Test drop method with ignore_missing=True.""" + dropped = sample_datagram.drop("score", "nonexistent", ignore_missing=True) + + assert set(dropped.keys()) == {"user_id", "name", "active"} + + def test_rename(self, sample_datagram): + """Test rename method.""" + renamed = sample_datagram.rename({"user_id": "id", "name": "username"}) + + expected_keys = {"id", "username", "score", "active"} + assert set(renamed.keys()) == expected_keys + assert renamed["id"] == 123 + assert renamed["username"] == "Alice" + assert renamed["score"] == 85.5 + + # Original should be unchanged + assert "user_id" in sample_datagram + assert "id" not in sample_datagram + + def test_rename_empty_mapping(self, sample_datagram): + """Test rename 
method with empty mapping.""" + renamed = sample_datagram.rename({}) + + # Should be identical + assert set(renamed.keys()) == set(sample_datagram.keys()) + assert renamed["user_id"] == sample_datagram["user_id"] + + def test_update(self, sample_datagram): + """Test update method.""" + updated = sample_datagram.update(score=95.0, active=False) + + # Original should be unchanged + assert sample_datagram["score"] == 85.5 + assert sample_datagram["active"] is True + + # Updated should have new values + assert updated["score"] == 95.0 + assert not updated["active"] + assert updated["user_id"] == 123 # Unchanged columns preserved + + def test_update_missing_column(self, sample_datagram): + """Test update method with missing column raises KeyError.""" + with pytest.raises(KeyError): + sample_datagram.update(nonexistent="value") + + def test_update_empty(self, sample_datagram): + """Test update method with no updates returns same instance.""" + updated = sample_datagram.update() + + # Should return the same instance + assert updated is sample_datagram + + def test_with_columns(self, sample_datagram): + """Test with_columns method.""" + new_datagram = sample_datagram.with_columns( + department="Engineering", salary=75000 + ) + + # Original should be unchanged + assert "department" not in sample_datagram + assert "salary" not in sample_datagram + + # New datagram should have additional columns + expected_keys = {"user_id", "name", "score", "active", "department", "salary"} + assert set(new_datagram.keys()) == expected_keys + assert new_datagram["department"] == "Engineering" + assert new_datagram["salary"] == 75000 + + def test_with_columns_with_types(self, sample_datagram): + """Test with_columns method with explicit types.""" + new_datagram = sample_datagram.with_columns( + column_types={"salary": int, "rate": float}, salary=75000, rate=85.5 + ) + + types = new_datagram.types() + assert types["salary"] is int + assert types["rate"] is float + + def test_with_columns_existing_column_fails(self, sample_datagram): + """Test with_columns method with existing column raises ValueError.""" + with pytest.raises(ValueError): + sample_datagram.with_columns(user_id=456) + + def test_with_columns_empty(self, sample_datagram): + """Test with_columns method with no columns returns same instance.""" + new_datagram = sample_datagram.with_columns() + + assert new_datagram is sample_datagram + + +class TestArrowDatagramContextOperations: + """Test context operations.""" + + def test_with_context_key(self): + """Test with_context_key method.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + original_datagram = ArrowDatagram(table, data_context="v0.1") + + new_datagram = original_datagram.with_context_key("v0.1") + + # Original should be unchanged + assert original_datagram.data_context_key == "std:v0.1:default" + + # New should have updated context + assert new_datagram.data_context_key == "std:v0.1:default" + + # Data should be preserved + assert new_datagram["user_id"] == 123 + assert new_datagram["name"] == "Alice" + + +class TestArrowDatagramUtilityOperations: + """Test utility operations.""" + + @pytest.fixture + def sample_datagram(self): + """Create a sample datagram for testing.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "__version": ["1.0"]} + ) + return ArrowDatagram(table) + + def test_copy_with_cache(self, sample_datagram): + """Test copy method with cache included.""" + # Force cache creation + _ = sample_datagram.as_dict() + + copied = 
sample_datagram.copy(include_cache=True) + + # Should be different instances + assert copied is not sample_datagram + + # Should have same data + assert copied["user_id"] == sample_datagram["user_id"] + assert copied["name"] == sample_datagram["name"] + + # Should share cached values + assert copied._cached_python_dict is sample_datagram._cached_python_dict + + def test_copy_without_cache(self, sample_datagram): + """Test copy method without cache.""" + # Force cache creation + _ = sample_datagram.as_dict() + + copied = sample_datagram.copy(include_cache=False) + + # Should be different instances + assert copied is not sample_datagram + + # Should have same data + assert copied["user_id"] == sample_datagram["user_id"] + + # Should not share cached values + assert copied._cached_python_dict is None + + def test_str_representation(self, sample_datagram): + """Test string representation.""" + str_repr = str(sample_datagram) + + # Should contain data values + assert "123" in str_repr + assert "Alice" in str_repr + + # Should not contain meta columns + assert "__version" not in str_repr + + def test_repr_representation(self, sample_datagram): + """Test repr representation.""" + repr_str = repr(sample_datagram) + + # Should contain data values + assert "123" in repr_str + assert "Alice" in repr_str + + +class TestArrowDatagramEdgeCases: + """Test edge cases and error conditions.""" + + def test_none_values(self): + """Test handling of None values.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": [None], "optional": [None]} + ) + datagram = ArrowDatagram(table) + + assert datagram["user_id"] == 123 + assert datagram["name"] is None + assert datagram["optional"] is None + + def test_complex_data_types(self): + """Test handling of complex Arrow data types.""" + # Create table with various Arrow types + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array(["Alice"], type=pa.string()), + pa.array([85.5], type=pa.float64()), + pa.array([True], type=pa.bool_()), + pa.array([[1, 2, 3]], type=pa.list_(pa.int32())), + ], + names=["id", "name", "score", "active", "numbers"], + ) + + datagram = ArrowDatagram(table) + + assert datagram["id"] == 123 + assert datagram["name"] == "Alice" + assert datagram["score"] == 85.5 + assert datagram["active"] is True + assert datagram["numbers"] == [1, 2, 3] + + def test_large_string_types(self): + """Test handling of large string types.""" + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array(["A very long string " * 100], type=pa.large_string()), + ], + names=["id", "text"], + ) + + datagram = ArrowDatagram(table) + + assert datagram["id"] == 123 + assert len(datagram["text"]) > 1000 + + def test_timestamp_types(self): + """Test handling of timestamp types.""" + now = datetime.now() + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array([now], type=pa.timestamp("ns")), + ], + names=["id", "timestamp"], + ) + + datagram = ArrowDatagram(table) + + assert datagram["id"] == 123 + # Arrow timestamps are returned as pandas Timestamp objects + assert datagram["timestamp"] is not None + + def test_date_types(self): + """Test handling of date types.""" + today = date.today() + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array([today], type=pa.date32()), + ], + names=["id", "date"], + ) + + datagram = ArrowDatagram(table) + + assert datagram["id"] == 123 + assert datagram["date"] is not None + + def test_duplicate_operations(self): + """Test 
operations that shouldn't change anything.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + datagram = ArrowDatagram(table) + + # Select all columns + selected = datagram.select("user_id", "name") + assert set(selected.keys()) == set(datagram.keys()) + + # Update with same values + updated = datagram.update(user_id=123, name="Alice") + assert updated["user_id"] == datagram["user_id"] + assert updated["name"] == datagram["name"] + + # Rename with identity mapping + renamed = datagram.rename({"user_id": "user_id", "name": "name"}) + assert set(renamed.keys()) == set(datagram.keys()) + + +class TestArrowDatagramIntegration: + """Test integration between different operations.""" + + def test_chained_operations(self): + """Test chaining multiple operations.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "first_name": ["Alice"], + "last_name": ["Smith"], + "score": [85.5], + "active": [True], + "__version": ["1.0"], + } + ) + + datagram = ArrowDatagram(table) + + # Chain operations + result = ( + datagram.with_columns(full_name="Alice Smith") + .drop("first_name", "last_name") + .update(score=90.0) + .with_meta_columns(version="2.0") + ) + + # Verify final state + assert set(result.keys()) == {"user_id", "score", "active", "full_name"} + assert result["full_name"] == "Alice Smith" + assert result["score"] == 90.0 + assert result.get_meta_value("version") == "2.0" + + def test_dict_roundtrip(self): + """Test conversion to dict and back preserves data.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5]} + ) + original = ArrowDatagram(table) + + # Convert to dict + data_dict = original.as_dict() + + # Create new table from dict + new_table = pa.Table.from_pylist([data_dict]) + reconstructed = ArrowDatagram(new_table) + + # Should have same data + assert reconstructed["user_id"] == original["user_id"] + assert reconstructed["name"] == original["name"] + assert reconstructed["score"] == original["score"] + + def test_mixed_include_options(self): + """Test various combinations of include options.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "__version": ["1.0"], + "__pipeline": ["test"], + } + ) + + datagram = ArrowDatagram(table) + + # Test different combinations + dict1 = datagram.as_dict(include_meta_columns=True, include_context=True) + dict2 = datagram.as_dict(include_all_info=True) + + # Should be equivalent + assert dict1 == dict2 + + # Test specific meta prefixes + dict3 = datagram.as_dict(include_meta_columns=["__version"]) + expected_keys = {"user_id", "name", "__version"} + assert set(dict3.keys()) == expected_keys + + def test_arrow_table_schema_preservation(self): + """Test that Arrow table schemas are preserved through operations.""" + # Create table with specific Arrow types + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int32()), # Specific int type + pa.array(["Alice"], type=pa.large_string()), # Large string + pa.array([85.5], type=pa.float32()), # Specific float type + ], + names=["id", "name", "score"], + ) + + datagram = ArrowDatagram(table) + + # Get schema + schema = datagram.arrow_schema() + + # Types should be preserved + assert schema.field("id").type == pa.int32() + assert schema.field("name").type == pa.large_string() + assert schema.field("score").type == pa.float32() + + # Operations should preserve types - but this might not be implemented yet + # For now, let's just test that the basic schema is correct + # updated = 
datagram.update(score=90.0) + # updated_schema = updated.arrow_schema() + # assert updated_schema.field("score").type == pa.float32() + + +class TestArrowDatagramPerformance: + """Test performance-related aspects.""" + + def test_caching_behavior(self): + """Test that caching works as expected.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + datagram = ArrowDatagram(table) + + # First call should populate cache + dict1 = datagram.as_dict() + assert datagram._cached_python_dict is not None + cached_dict_id = id(datagram._cached_python_dict) + + # Second call should use same cache (not create new one) + dict2 = datagram.as_dict() + assert id(datagram._cached_python_dict) == cached_dict_id # Same cached object + # Returned dicts are copies for safety, so they're not identical + assert dict1 == dict2 # Same content + assert dict1 is not dict2 # Different objects (copies) + + # Operations should invalidate cache + updated = datagram.update(name="Bob") + assert updated._cached_python_dict is None + + def test_lazy_evaluation(self): + """Test that expensive operations are performed lazily.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + datagram = ArrowDatagram(table) + + # Hash should not be calculated until requested + assert datagram._cached_content_hash is None + + # First hash call should calculate + hash1 = datagram.content_hash() + assert datagram._cached_content_hash is not None + + # Second call should use cache + hash2 = datagram.content_hash() + assert hash1 == hash2 + assert hash1 is hash2 # Should be same object diff --git a/tests/test_data/test_datagrams/test_arrow_tag_packet.py b/tests/test_data/test_datagrams/test_arrow_tag_packet.py new file mode 100644 index 0000000..6fbe9d9 --- /dev/null +++ b/tests/test_data/test_datagrams/test_arrow_tag_packet.py @@ -0,0 +1,844 @@ +""" +Comprehensive tests for ArrowTag and ArrowPacket classes. 
+ +This module tests all functionality of the Arrow-based tag and packet classes including: +- Tag-specific functionality (system tags) +- Packet-specific functionality (source info) +- Integration with Arrow datagram functionality +- Conversion operations +- Arrow-specific optimizations +""" + +import pytest +import pyarrow as pa +from datetime import datetime, date + +from orcapod.data.datagrams import ArrowTag, ArrowPacket +from orcapod.data.system_constants import constants + + +class TestArrowTagInitialization: + """Test ArrowTag initialization and basic properties.""" + + def test_basic_initialization(self): + """Test basic initialization with PyArrow table.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5]} + ) + + tag = ArrowTag(table) + + assert tag["user_id"] == 123 + assert tag["name"] == "Alice" + assert tag["score"] == 85.5 + + def test_initialization_multiple_rows_fails(self): + """Test initialization with multiple rows fails.""" + table = pa.Table.from_pydict({"user_id": [123, 456], "name": ["Alice", "Bob"]}) + + with pytest.raises(ValueError, match="single row"): + ArrowTag(table) + + def test_initialization_with_system_tags(self): + """Test initialization with system tags.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + system_tags = {"tag_type": "user", "created_by": "system"} + + tag = ArrowTag(table, system_tags=system_tags) + + assert tag["user_id"] == 123 + system_tag_dict = tag.system_tags() + assert system_tag_dict["tag_type"] == "user" + assert system_tag_dict["created_by"] == "system" + + def test_initialization_with_system_tags_in_table(self): + """Test initialization when system tags are included in table.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + f"{constants.SYSTEM_TAG_PREFIX}tag_type": ["user"], + f"{constants.SYSTEM_TAG_PREFIX}version": ["1.0"], + } + ) + + tag = ArrowTag(table) + + assert tag["user_id"] == 123 + assert tag["name"] == "Alice" + + system_tags = tag.system_tags() + assert system_tags[f"{constants.SYSTEM_TAG_PREFIX}tag_type"] == "user" + assert system_tags[f"{constants.SYSTEM_TAG_PREFIX}version"] == "1.0" + + def test_initialization_mixed_system_tags(self): + """Test initialization with both embedded and explicit system tags.""" + table = pa.Table.from_pydict( + {"user_id": [123], f"{constants.SYSTEM_TAG_PREFIX}embedded": ["value1"]} + ) + system_tags = {"explicit": "value2"} + + tag = ArrowTag(table, system_tags=system_tags) + + system_tag_dict = tag.system_tags() + assert system_tag_dict[f"{constants.SYSTEM_TAG_PREFIX}embedded"] == "value1" + assert system_tag_dict["explicit"] == "value2" + + +class TestArrowTagSystemTagOperations: + """Test system tag specific operations.""" + + @pytest.fixture + def sample_tag(self): + """Create a sample tag for testing.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + system_tags = {"tag_type": "user", "version": "1.0"} + return ArrowTag(table, system_tags=system_tags) + + def test_system_tags_method(self, sample_tag): + """Test system_tags method.""" + system_tags = sample_tag.system_tags() + + assert isinstance(system_tags, dict) + assert system_tags["tag_type"] == "user" + assert system_tags["version"] == "1.0" + + def test_keys_with_system_tags(self, sample_tag): + """Test keys method including system tags.""" + keys_data_only = sample_tag.keys() + keys_with_system = sample_tag.keys(include_system_tags=True) + + assert "user_id" in keys_data_only + assert "name" in 
keys_data_only + assert len(keys_with_system) > len(keys_data_only) + assert "tag_type" in keys_with_system + assert "version" in keys_with_system + + def test_types_with_system_tags(self, sample_tag): + """Test types method including system tags.""" + types_data_only = sample_tag.types() + types_with_system = sample_tag.types(include_system_tags=True) + + assert len(types_with_system) > len(types_data_only) + assert "tag_type" in types_with_system + assert "version" in types_with_system + + def test_arrow_schema_with_system_tags(self, sample_tag): + """Test arrow_schema method including system tags.""" + schema_data_only = sample_tag.arrow_schema() + schema_with_system = sample_tag.arrow_schema(include_system_tags=True) + + assert len(schema_with_system) > len(schema_data_only) + assert "tag_type" in schema_with_system.names + assert "version" in schema_with_system.names + + def test_as_dict_with_system_tags(self, sample_tag): + """Test as_dict method including system tags.""" + dict_data_only = sample_tag.as_dict() + dict_with_system = sample_tag.as_dict(include_system_tags=True) + + assert "user_id" in dict_data_only + assert "name" in dict_data_only + assert "tag_type" not in dict_data_only + + assert "user_id" in dict_with_system + assert "tag_type" in dict_with_system + assert "version" in dict_with_system + + def test_as_table_with_system_tags(self, sample_tag): + """Test as_table method including system tags.""" + table_data_only = sample_tag.as_table() + table_with_system = sample_tag.as_table(include_system_tags=True) + + assert len(table_with_system.column_names) > len(table_data_only.column_names) + assert "tag_type" in table_with_system.column_names + assert "version" in table_with_system.column_names + + def test_as_datagram_conversion(self, sample_tag): + """Test conversion to datagram.""" + datagram = sample_tag.as_datagram() + + # Should preserve data + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + + # Should not include system tags by default + assert "tag_type" not in datagram.keys() + + def test_as_datagram_with_system_tags(self, sample_tag): + """Test conversion to datagram including system tags.""" + datagram = sample_tag.as_datagram(include_system_tags=True) + + # Should preserve data and include system tags + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + assert "tag_type" in datagram.keys() + + +class TestArrowPacketInitialization: + """Test ArrowPacket initialization and basic properties.""" + + def test_basic_initialization(self): + """Test basic initialization with PyArrow table.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5]} + ) + + packet = ArrowPacket(table) + + assert packet["user_id"] == 123 + assert packet["name"] == "Alice" + assert packet["score"] == 85.5 + + def test_initialization_multiple_rows_fails(self): + """Test initialization with multiple rows fails.""" + table = pa.Table.from_pydict({"user_id": [123, 456], "name": ["Alice", "Bob"]}) + + with pytest.raises(ValueError, match="single row"): + ArrowPacket(table) + + def test_initialization_with_source_info(self): + """Test initialization with source info.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + source_info = {"user_id": "database", "name": "user_input"} + + packet = ArrowPacket(table, source_info=source_info) + + assert packet["user_id"] == 123 + source_dict = packet.source_info() + assert source_dict["user_id"] == "database" + assert source_dict["name"] == 
"user_input" + + def test_initialization_with_source_info_in_table(self): + """Test initialization when source info is included in table.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + f"{constants.SOURCE_PREFIX}user_id": ["database"], + f"{constants.SOURCE_PREFIX}name": ["user_input"], + } + ) + + packet = ArrowPacket(table) + + assert packet["user_id"] == 123 + assert packet["name"] == "Alice" + + source_info = packet.source_info() + assert source_info["user_id"] == "database" + assert source_info["name"] == "user_input" + + def test_initialization_mixed_source_info(self): + """Test initialization with both embedded and explicit source info.""" + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + f"{constants.SOURCE_PREFIX}user_id": ["embedded_source"], + } + ) + source_info = {"name": "explicit_source"} + + packet = ArrowPacket(table, source_info=source_info) + + source_dict = packet.source_info() + assert source_dict["user_id"] == "embedded_source" + assert source_dict["name"] == "explicit_source" + + def test_initialization_with_recordbatch(self): + """Test initialization with RecordBatch instead of Table.""" + batch = pa.RecordBatch.from_pydict({"user_id": [123], "name": ["Alice"]}) + + packet = ArrowPacket(batch) + + assert packet["user_id"] == 123 + assert packet["name"] == "Alice" + + +class TestArrowPacketSourceInfoOperations: + """Test source info specific operations.""" + + @pytest.fixture + def sample_packet(self): + """Create a sample packet for testing.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5]} + ) + source_info = { + "user_id": "database", + "name": "user_input", + "score": "calculation", + } + return ArrowPacket(table, source_info=source_info) + + def test_source_info_method(self, sample_packet): + """Test source_info method.""" + source_info = sample_packet.source_info() + + assert isinstance(source_info, dict) + assert source_info["user_id"] == "database" + assert source_info["name"] == "user_input" + assert source_info["score"] == "calculation" + + def test_source_info_with_missing_keys(self): + """Test source_info method when some keys are missing.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5]} + ) + source_info = {"user_id": "database"} # Only partial source info + + packet = ArrowPacket(table, source_info=source_info) + full_source_info = packet.source_info() + + assert full_source_info["user_id"] == "database" + assert full_source_info["name"] is None + assert full_source_info["score"] is None + + def test_with_source_info(self, sample_packet): + """Test with_source_info method.""" + updated = sample_packet.with_source_info( + user_id="new_database", name="new_input" + ) + + # Original should be unchanged + original_source = sample_packet.source_info() + assert original_source["user_id"] == "database" + + # Updated should have new values + updated_source = updated.source_info() + assert updated_source["user_id"] == "new_database" + assert updated_source["name"] == "new_input" + assert updated_source["score"] == "calculation" # Unchanged + + def test_keys_with_source_info(self, sample_packet): + """Test keys method including source info.""" + keys_data_only = sample_packet.keys() + keys_with_source = sample_packet.keys(include_source=True) + + assert "user_id" in keys_data_only + assert "name" in keys_data_only + assert len(keys_with_source) > len(keys_data_only) + + # Should include prefixed source columns + 
source_keys = [ + k for k in keys_with_source if k.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_keys) > 0 + + def test_types_with_source_info(self, sample_packet): + """Test types method including source info.""" + types_data_only = sample_packet.types() + types_with_source = sample_packet.types(include_source=True) + + assert len(types_with_source) > len(types_data_only) + + # Source columns should be string type + source_keys = [ + k for k in types_with_source.keys() if k.startswith(constants.SOURCE_PREFIX) + ] + for key in source_keys: + assert types_with_source[key] is str + + def test_arrow_schema_with_source_info(self, sample_packet): + """Test arrow_schema method including source info.""" + schema_data_only = sample_packet.arrow_schema() + schema_with_source = sample_packet.arrow_schema(include_source=True) + + assert len(schema_with_source) > len(schema_data_only) + + source_columns = [ + name + for name in schema_with_source.names + if name.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_columns) > 0 + + def test_as_dict_with_source_info(self, sample_packet): + """Test as_dict method including source info.""" + dict_data_only = sample_packet.as_dict() + dict_with_source = sample_packet.as_dict(include_source=True) + + assert "user_id" in dict_data_only + assert "name" in dict_data_only + assert not any( + k.startswith(constants.SOURCE_PREFIX) for k in dict_data_only.keys() + ) + + assert "user_id" in dict_with_source + source_keys = [ + k for k in dict_with_source.keys() if k.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_keys) > 0 + + def test_as_table_with_source_info(self, sample_packet): + """Test as_table method including source info.""" + table_data_only = sample_packet.as_table() + table_with_source = sample_packet.as_table(include_source=True) + + assert len(table_with_source.column_names) > len(table_data_only.column_names) + + source_columns = [ + name + for name in table_with_source.column_names + if name.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_columns) > 0 + + def test_as_datagram_conversion(self, sample_packet): + """Test conversion to datagram.""" + datagram = sample_packet.as_datagram() + + # Should preserve data + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + + # Should not include source info by default + assert not any(k.startswith(constants.SOURCE_PREFIX) for k in datagram.keys()) + + def test_as_datagram_with_source_info(self, sample_packet): + """Test conversion to datagram including source info.""" + datagram = sample_packet.as_datagram(include_source=True) + + # Should preserve data and include source info + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + source_keys = [ + k for k in datagram.keys() if k.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_keys) > 0 + + +class TestArrowPacketDataOperations: + """Test data operations specific to packets.""" + + @pytest.fixture + def sample_packet(self): + """Create a sample packet for testing.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5]} + ) + source_info = { + "user_id": "database", + "name": "user_input", + "score": "calculation", + } + return ArrowPacket(table, source_info=source_info) + + def test_rename_preserves_source_info(self, sample_packet): + """Test that rename operation preserves source info mapping.""" + renamed = sample_packet.rename({"user_id": "id", "name": "username"}) + + # Data should be renamed + assert "id" in renamed.keys() 
+ assert "username" in renamed.keys() + assert "user_id" not in renamed.keys() + assert "name" not in renamed.keys() + + # Source info should follow the rename + source_info = renamed.source_info() + assert source_info["id"] == "database" + assert source_info["username"] == "user_input" + assert source_info["score"] == "calculation" + + def test_with_columns_creates_source_info_columns(self, sample_packet): + """Test that with_columns() creates corresponding source info columns with correct data types.""" + # Add new columns + updated = sample_packet.with_columns( + full_name="Alice Smith", age=30, is_active=True + ) + + # Verify new data columns exist + assert "full_name" in updated.keys() + assert "age" in updated.keys() + assert "is_active" in updated.keys() + assert updated["full_name"] == "Alice Smith" + assert updated["age"] == 30 + assert updated["is_active"] is True + + # Verify corresponding source info columns are created + source_info = updated.source_info() + assert "full_name" in source_info + assert "age" in source_info + assert "is_active" in source_info + + # New source info columns should be initialized as None + assert source_info["full_name"] is None + assert source_info["age"] is None + assert source_info["is_active"] is None + + # Verify existing source info is preserved + assert source_info["user_id"] == "database" + assert source_info["name"] == "user_input" + assert source_info["score"] == "calculation" + + # Verify Arrow schema has correct data types for source info columns + schema = updated.arrow_schema(include_source=True) + + # All source info columns should be large_string type + source_columns = [col for col in schema if col.name.startswith("_source_")] + assert len(source_columns) == 6 # 3 original + 3 new + + for field in source_columns: + assert field.type == pa.large_string(), ( + f"Source column {field.name} should be large_string, got {field.type}" + ) + + # Verify we can set source info for new columns + with_source = updated.with_source_info( + full_name="calculated", age="user_input", is_active="default" + ) + + final_source_info = with_source.source_info() + assert final_source_info["full_name"] == "calculated" + assert final_source_info["age"] == "user_input" + assert final_source_info["is_active"] == "default" + + +class TestArrowTagPacketIntegration: + """Test integration between tags, packets, and base functionality.""" + + def test_tag_to_packet_conversion(self): + """Test converting a tag to a packet-like structure.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + system_tags = {"tag_type": "user", "version": "1.0"} + tag = ArrowTag(table, system_tags=system_tags) + + # Convert to full dictionary + full_dict = tag.as_dict(include_all_info=True) + + # Should include data, system tags, meta columns, and context + assert "user_id" in full_dict + assert "tag_type" in full_dict + assert constants.CONTEXT_KEY in full_dict + + def test_packet_comprehensive_dict(self): + """Test packet with all information types.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "__meta_field": ["meta_value"]} + ) + source_info = {"user_id": "database", "name": "user_input"} + + packet = ArrowPacket(table, source_info=source_info) + + # Get comprehensive dictionary + full_dict = packet.as_dict(include_all_info=True) + + # Should include data, source info, meta columns, and context + assert "user_id" in full_dict + assert f"{constants.SOURCE_PREFIX}user_id" in full_dict + assert "__meta_field" in full_dict + assert 
constants.CONTEXT_KEY in full_dict + + def test_chained_operations_tag(self): + """Test chaining operations on tags.""" + table = pa.Table.from_pydict( + {"user_id": [123], "first_name": ["Alice"], "last_name": ["Smith"]} + ) + system_tags = {"tag_type": "user"} + + tag = ArrowTag(table, system_tags=system_tags) + + # Chain operations + result = ( + tag.with_columns(full_name="Alice Smith") + .drop("first_name", "last_name") + .update(user_id=456) + ) + + # Verify final state + assert set(result.keys()) == {"user_id", "full_name"} + assert result["user_id"] == 456 + assert result["full_name"] == "Alice Smith" + + # System tags should be preserved + system_tags = result.system_tags() + assert system_tags["tag_type"] == "user" + + def test_chained_operations_packet(self): + """Test chaining operations on packets.""" + table = pa.Table.from_pydict( + {"user_id": [123], "first_name": ["Alice"], "last_name": ["Smith"]} + ) + source_info = {"user_id": "database", "first_name": "form", "last_name": "form"} + + packet = ArrowPacket(table, source_info=source_info) + + # Chain operations + result = ( + packet.with_columns(full_name="Alice Smith") + .drop("first_name", "last_name") + .update(user_id=456) + .with_source_info(full_name="calculated") + ) + + # Verify final state + assert set(result.keys()) == {"user_id", "full_name"} + assert result["user_id"] == 456 + assert result["full_name"] == "Alice Smith" + + # Source info should be updated + source_info = result.source_info() + assert source_info["user_id"] == "database" + assert source_info["full_name"] == "calculated" + + def test_copy_operations(self): + """Test copy operations preserve all information.""" + # Test tag copy + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + system_tags = {"tag_type": "user"} + tag = ArrowTag(table, system_tags=system_tags) + + tag_copy = tag.copy() + assert tag_copy is not tag + assert tag_copy["user_id"] == tag["user_id"] + assert tag_copy.system_tags() == tag.system_tags() + + # Test packet copy + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + source_info = {"user_id": "database"} + packet = ArrowPacket(table, source_info=source_info) + + packet_copy = packet.copy() + assert packet_copy is not packet + assert packet_copy["user_id"] == packet["user_id"] + assert packet_copy.source_info() == packet.source_info() + + +class TestArrowTagPacketArrowSpecific: + """Test Arrow-specific functionality and optimizations.""" + + def test_tag_arrow_schema_preservation(self): + """Test that Arrow schemas are preserved in tags.""" + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int32()), + pa.array(["Alice"], type=pa.large_string()), + ], + names=["id", "name"], + ) + + tag = ArrowTag(table) + + schema = tag.arrow_schema() + assert schema.field("id").type == pa.int32() + assert schema.field("name").type == pa.large_string() + + def test_packet_arrow_schema_preservation(self): + """Test that Arrow schemas are preserved in packets.""" + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array([85.5], type=pa.float32()), + ], + names=["id", "score"], + ) + + packet = ArrowPacket(table) + + schema = packet.arrow_schema() + assert schema.field("id").type == pa.int64() + assert schema.field("score").type == pa.float32() + + def test_tag_complex_arrow_types(self): + """Test tags with complex Arrow data types.""" + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array([[1, 2, 3]], type=pa.list_(pa.int32())), + pa.array( + 
[{"nested": "value"}], + type=pa.struct([pa.field("nested", pa.string())]), + ), + ], + names=["id", "numbers", "struct_field"], + ) + + tag = ArrowTag(table) + + assert tag["id"] == 123 + assert tag["numbers"] == [1, 2, 3] + assert tag["struct_field"]["nested"] == "value" + + def test_packet_complex_arrow_types(self): + """Test packets with complex Arrow data types.""" + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array([[1, 2, 3]], type=pa.list_(pa.int32())), + pa.array( + [{"nested": "value"}], + type=pa.struct([pa.field("nested", pa.string())]), + ), + ], + names=["id", "numbers", "struct_field"], + ) + + packet = ArrowPacket(table) + + assert packet["id"] == 123 + assert packet["numbers"] == [1, 2, 3] + assert packet["struct_field"]["nested"] == "value" + + def test_tag_timestamp_handling(self): + """Test tag handling of timestamp types.""" + now = datetime.now() + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array([now], type=pa.timestamp("ns")), + ], + names=["id", "timestamp"], + ) + + tag = ArrowTag(table) + + assert tag["id"] == 123 + assert tag["timestamp"] is not None + + def test_packet_date_handling(self): + """Test packet handling of date types.""" + today = date.today() + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int64()), + pa.array([today], type=pa.date32()), + ], + names=["id", "date"], + ) + + packet = ArrowPacket(table) + + assert packet["id"] == 123 + assert packet["date"] is not None + + def test_tag_arrow_memory_efficiency(self): + """Test that tags share Arrow memory efficiently.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + + tag = ArrowTag(table) + + # The important thing is that underlying arrays are shared for memory efficiency + # Whether the table object itself is the same depends on whether system tag columns needed extraction + original_array = table["user_id"] + tag_array = tag._data_table["user_id"] + assert tag_array.to_pylist() == original_array.to_pylist() + + # Test with a table that has system tag columns to ensure processing works + table_with_system = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "_system_tag_type": ["user"]} + ) + tag_with_system = ArrowTag(table_with_system) + # This should create a different table since system columns are extracted + assert tag_with_system._data_table is not table_with_system + assert set(tag_with_system._data_table.column_names) == {"user_id", "name"} + + def test_packet_arrow_memory_efficiency(self): + """Test that packets handle Arrow memory efficiently.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + + packet = ArrowPacket(table) + + # Should efficiently handle memory + assert ( + packet._data_table is not table + ) # Different due to source info processing + + # But data should be preserved + assert packet["user_id"] == 123 + assert packet["name"] == "Alice" + + +class TestArrowTagPacketEdgeCases: + """Test edge cases and error conditions.""" + + def test_tag_empty_system_tags(self): + """Test tag with empty system tags.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + tag = ArrowTag(table, system_tags={}) + + assert tag["user_id"] == 123 + assert tag.system_tags() == {} + + def test_packet_empty_source_info(self): + """Test packet with empty source info.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + packet = ArrowPacket(table, source_info={}) + + assert packet["user_id"] == 123 + source_info = 
packet.source_info() + assert all(v is None for v in source_info.values()) + + def test_tag_none_system_tags(self): + """Test tag with None system tags.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + tag = ArrowTag(table, system_tags=None) + + assert tag["user_id"] == 123 + assert tag.system_tags() == {} + + def test_packet_none_source_info(self): + """Test packet with None source info.""" + table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) + packet = ArrowPacket(table, source_info=None) + + assert packet["user_id"] == 123 + source_info = packet.source_info() + assert all(v is None for v in source_info.values()) + + def test_tag_with_meta_and_system_tags(self): + """Test tag with both meta columns and system tags.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "__meta_field": ["meta_value"]} + ) + system_tags = {"tag_type": "user"} + + tag = ArrowTag(table, system_tags=system_tags) + + # All information should be accessible + full_dict = tag.as_dict(include_all_info=True) + assert "user_id" in full_dict + assert "__meta_field" in full_dict + assert "tag_type" in full_dict + assert constants.CONTEXT_KEY in full_dict + + def test_packet_with_meta_and_source_info(self): + """Test packet with both meta columns and source info.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "__meta_field": ["meta_value"]} + ) + source_info = {"user_id": "database"} + + packet = ArrowPacket(table, source_info=source_info) + + # All information should be accessible + full_dict = packet.as_dict(include_all_info=True) + assert "user_id" in full_dict + assert "__meta_field" in full_dict + assert f"{constants.SOURCE_PREFIX}user_id" in full_dict + assert constants.CONTEXT_KEY in full_dict + + def test_tag_large_system_tags(self): + """Test tag with many system tags.""" + table = pa.Table.from_pydict({"user_id": [123]}) + system_tags = {f"tag_{i}": f"value_{i}" for i in range(100)} + + tag = ArrowTag(table, system_tags=system_tags) + + assert tag["user_id"] == 123 + retrieved_tags = tag.system_tags() + assert len(retrieved_tags) == 100 + assert retrieved_tags["tag_50"] == "value_50" + + def test_packet_large_source_info(self): + """Test packet with source info for many columns.""" + data = {f"col_{i}": [i] for i in range(50)} + table = pa.Table.from_pydict(data) + source_info = {f"col_{i}": f"source_{i}" for i in range(50)} + + packet = ArrowPacket(table, source_info=source_info) + + assert packet["col_25"] == 25 + retrieved_source = packet.source_info() + assert len(retrieved_source) == 50 + assert retrieved_source["col_25"] == "source_25" diff --git a/tests/test_data/test_datagrams/test_base_integration.py b/tests/test_data/test_datagrams/test_base_integration.py new file mode 100644 index 0000000..dae53c4 --- /dev/null +++ b/tests/test_data/test_datagrams/test_base_integration.py @@ -0,0 +1,594 @@ +""" +Comprehensive tests for base datagram functionality and integration tests. 
+ +This module tests: +- Base datagram abstract interface +- Integration between different datagram implementations +- Cross-format conversions +- Performance and memory considerations +""" + +import pytest +import pyarrow as pa + +from orcapod.data.datagrams import ( + DictDatagram, + ArrowDatagram, + DictTag, + DictPacket, + ArrowTag, + ArrowPacket, +) +from orcapod.data.datagrams.base import ( + BaseDatagram, + ImmutableDict, + contains_prefix_from, +) +from orcapod.data.system_constants import constants + + +class TestImmutableDict: + """Test ImmutableDict utility class.""" + + def test_basic_functionality(self): + """Test basic ImmutableDict operations.""" + data = {"a": 1, "b": 2, "c": 3} + immutable = ImmutableDict(data) + + assert immutable["a"] == 1 + assert immutable["b"] == 2 + assert immutable["c"] == 3 + assert len(immutable) == 3 + + def test_iteration(self): + """Test iteration over ImmutableDict.""" + data = {"a": 1, "b": 2, "c": 3} + immutable = ImmutableDict(data) + + keys = list(immutable) + assert set(keys) == {"a", "b", "c"} + + items = list(immutable.items()) + assert set(items) == {("a", 1), ("b", 2), ("c", 3)} + + def test_merge_operation(self): + """Test merge operation with | operator.""" + data1 = {"a": 1, "b": 2} + data2 = {"c": 3, "d": 4} + + immutable1 = ImmutableDict(data1) + immutable2 = ImmutableDict(data2) + + merged = immutable1 | immutable2 + + assert len(merged) == 4 + assert merged["a"] == 1 + assert merged["c"] == 3 + + def test_merge_with_dict(self): + """Test merge operation with regular dict.""" + data1 = {"a": 1, "b": 2} + data2 = {"c": 3, "d": 4} + + immutable = ImmutableDict(data1) + merged = immutable | data2 + + assert len(merged) == 4 + assert merged["a"] == 1 + assert merged["c"] == 3 + + def test_string_representations(self): + """Test string representations.""" + data = {"a": 1, "b": 2} + immutable = ImmutableDict(data) + + str_repr = str(immutable) + repr_str = repr(immutable) + + assert "a" in str_repr and "1" in str_repr + assert "a" in repr_str and "1" in repr_str + + +class TestUtilityFunctions: + """Test utility functions.""" + + def test_contains_prefix_from(self): + """Test contains_prefix_from function.""" + prefixes = ["__", "_source_", "_system_"] + + assert contains_prefix_from("__version", prefixes) + assert contains_prefix_from("_source_file", prefixes) + assert contains_prefix_from("_system_tag", prefixes) + assert not contains_prefix_from("regular_column", prefixes) + assert not contains_prefix_from("_other_prefix", prefixes) + + def test_contains_prefix_from_empty(self): + """Test contains_prefix_from with empty prefixes.""" + assert not contains_prefix_from("any_column", []) + + def test_contains_prefix_from_edge_cases(self): + """Test contains_prefix_from edge cases.""" + prefixes = ["__"] + + assert contains_prefix_from("__", prefixes) + assert not contains_prefix_from("_", prefixes) + assert not contains_prefix_from("", prefixes) + + +class TestBaseDatagram: + """Test BaseDatagram abstract interface.""" + + def test_is_abstract(self): + """Test that BaseDatagram cannot be instantiated directly.""" + try: + # This should raise TypeError for abstract class + BaseDatagram() + pytest.fail("Expected TypeError for abstract class instantiation") + except TypeError as e: + # Expected behavior - BaseDatagram is abstract + assert "abstract" in str(e).lower() or "instantiate" in str(e).lower() + + def test_abstract_methods(self): + """Test that all abstract methods are defined.""" + # Get all abstract methods + abstract_methods = 
BaseDatagram.__abstractmethods__ + + # Verify key abstract methods exist + expected_methods = { + "__getitem__", + "__contains__", + "__iter__", + "get", + "keys", + "types", + "arrow_schema", + "content_hash", + "as_dict", + "as_table", + "meta_columns", + "get_meta_value", + "with_meta_columns", + "drop_meta_columns", + "select", + "drop", + "rename", + "update", + "with_columns", + } + + assert expected_methods.issubset(abstract_methods) + + +class TestCrossFormatConversions: + """Test conversions between different datagram formats.""" + + @pytest.fixture + def sample_data(self): + """Sample data for conversion tests.""" + return { + "user_id": 123, + "name": "Alice", + "score": 85.5, + "active": True, + "__version": "1.0", + "__pipeline": "test", + } + + def test_dict_to_arrow_conversion(self, sample_data): + """Test converting DictDatagram to ArrowDatagram.""" + dict_datagram = DictDatagram(sample_data) + + # Convert via table + table = dict_datagram.as_table(include_all_info=True) + arrow_datagram = ArrowDatagram(table) + + # Data should be preserved + assert arrow_datagram["user_id"] == dict_datagram["user_id"] + assert arrow_datagram["name"] == dict_datagram["name"] + assert arrow_datagram["score"] == dict_datagram["score"] + assert arrow_datagram["active"] == dict_datagram["active"] + + # Meta columns should be preserved + assert arrow_datagram.get_meta_value("version") == dict_datagram.get_meta_value( + "version" + ) + assert arrow_datagram.get_meta_value( + "pipeline" + ) == dict_datagram.get_meta_value("pipeline") + + def test_arrow_to_dict_conversion(self, sample_data): + """Test converting ArrowDatagram to DictDatagram.""" + table = pa.Table.from_pylist([sample_data]) + arrow_datagram = ArrowDatagram(table) + + # Convert via dict + data_dict = arrow_datagram.as_dict(include_all_info=True) + dict_datagram = DictDatagram(data_dict) + + # Data should be preserved + assert dict_datagram["user_id"] == arrow_datagram["user_id"] + assert dict_datagram["name"] == arrow_datagram["name"] + assert dict_datagram["score"] == arrow_datagram["score"] + assert dict_datagram["active"] == arrow_datagram["active"] + + # Meta columns should be preserved + assert dict_datagram.get_meta_value("version") == arrow_datagram.get_meta_value( + "version" + ) + + def test_tag_conversions(self): + """Test conversions between tag formats.""" + data = {"user_id": 123, "name": "Alice"} + system_tags = {"tag_type": "user", "version": "1.0"} + + # Dict to Arrow tag + dict_tag = DictTag(data, system_tags=system_tags) + table = dict_tag.as_table(include_all_info=True) + arrow_tag = ArrowTag(table) + + # Data and system tags should be preserved + assert arrow_tag["user_id"] == dict_tag["user_id"] + assert arrow_tag["name"] == dict_tag["name"] + + # Arrow to Dict tag + full_dict = arrow_tag.as_dict(include_all_info=True) + reconstructed_dict_tag = DictTag(full_dict) + + assert reconstructed_dict_tag["user_id"] == arrow_tag["user_id"] + assert reconstructed_dict_tag["name"] == arrow_tag["name"] + + def test_packet_conversions(self): + """Test conversions between packet formats.""" + data = {"user_id": 123, "name": "Alice"} + source_info = {"user_id": "database", "name": "user_input"} + + # Dict to Arrow packet + dict_packet = DictPacket(data, source_info=source_info) + table = dict_packet.as_table(include_all_info=True) + arrow_packet = ArrowPacket(table) + + # Data and source info should be preserved + assert arrow_packet["user_id"] == dict_packet["user_id"] + assert arrow_packet["name"] == dict_packet["name"] + + 
# Arrow to Dict packet + full_dict = arrow_packet.as_dict(include_all_info=True) + reconstructed_dict_packet = DictPacket(full_dict) + + assert reconstructed_dict_packet["user_id"] == arrow_packet["user_id"] + assert reconstructed_dict_packet["name"] == arrow_packet["name"] + + +class TestDatagramIntegration: + """Test integration between different datagram types.""" + + def test_mixed_operations(self): + """Test operations that mix different datagram types.""" + # Start with dict datagram + dict_data = {"user_id": 123, "name": "Alice", "score": 85.5} + dict_datagram = DictDatagram(dict_data) + + # Convert to arrow + table = dict_datagram.as_table() + arrow_datagram = ArrowDatagram(table) + + # Perform operations on arrow datagram + modified_arrow = arrow_datagram.update(score=90.0).with_columns(grade="A") + + # Convert back to dict + modified_dict = DictDatagram(modified_arrow.as_dict()) + + # Verify final state + assert modified_dict["user_id"] == 123 + assert modified_dict["score"] == 90.0 + assert modified_dict["grade"] == "A" + + def test_tag_packet_interoperability(self): + """Test interoperability between tags and packets.""" + # Create a tag + tag_data = {"entity_id": "user_123", "entity_type": "user"} + system_tags = {"created_by": "system", "version": "1.0"} + tag = DictTag(tag_data, system_tags=system_tags) + + # Convert tag to packet-like structure + tag_as_dict = tag.as_dict(include_system_tags=True) + packet = DictPacket(tag_as_dict, source_info={"entity_id": "tag_system"}) + + # Verify data preservation + assert packet["entity_id"] == tag["entity_id"] + assert packet["entity_type"] == tag["entity_type"] + + # Source info should be available + source_info = packet.source_info() + assert source_info["entity_id"] == "tag_system" + + def test_comprehensive_roundtrip(self): + """Test comprehensive roundtrip through all formats.""" + original_data = { + "user_id": 123, + "name": "Alice", + "score": 85.5, + "active": True, + "__version": "1.0", + constants.CONTEXT_KEY: "v0.1", + } + + # Start with DictDatagram + dict_datagram = DictDatagram(original_data) + + # Convert to ArrowDatagram + table = dict_datagram.as_table(include_all_info=True) + arrow_datagram = ArrowDatagram(table) + + # Convert to DictTag with some system tags + tag_dict = arrow_datagram.as_dict(include_all_info=True) + dict_tag = DictTag(tag_dict, system_tags={"tag_type": "test", "version": "1.0"}) + + # Convert to ArrowTag + tag_table = dict_tag.as_table(include_all_info=True) + arrow_tag = ArrowTag(tag_table) + + # Convert to DictPacket with some source info + packet_dict = arrow_tag.as_dict(include_all_info=True) + dict_packet = DictPacket( + packet_dict, source_info={"source": "test", "timestamp": "2024-01-01"} + ) + + # Convert to ArrowPacket + packet_table = dict_packet.as_table(include_all_info=True) + arrow_packet = ArrowPacket(packet_table) + + # Convert back to DictDatagram + final_dict = arrow_packet.as_dict(include_all_info=True) + final_datagram = DictDatagram(final_dict) + + # Verify data preservation through the entire journey + assert final_datagram["user_id"] == original_data["user_id"] + assert final_datagram["name"] == original_data["name"] + assert final_datagram["score"] == original_data["score"] + assert final_datagram["active"] == original_data["active"] + assert final_datagram.get_meta_value("version") == "1.0" + assert final_datagram.data_context_key == "std:v0.1:default" + + +class TestDatagramConsistency: + """Test consistency across different datagram implementations.""" + + 
@pytest.fixture + def equivalent_datagrams(self): + """Create equivalent datagrams in different formats.""" + data = { + "user_id": 123, + "name": "Alice", + "score": 85.5, + "active": True, + "__version": "1.0", + } + + dict_datagram = DictDatagram(data) + table = pa.Table.from_pylist([data]) + arrow_datagram = ArrowDatagram(table) + + return dict_datagram, arrow_datagram + + def test_consistent_dict_interface(self, equivalent_datagrams): + """Test that dict-like interface is consistent.""" + dict_dg, arrow_dg = equivalent_datagrams + + # __getitem__ + assert dict_dg["user_id"] == arrow_dg["user_id"] + assert dict_dg["name"] == arrow_dg["name"] + assert dict_dg["score"] == arrow_dg["score"] + assert dict_dg["active"] == arrow_dg["active"] + + # __contains__ + assert ("user_id" in dict_dg) == ("user_id" in arrow_dg) + assert ("nonexistent" in dict_dg) == ("nonexistent" in arrow_dg) + + # get + assert dict_dg.get("user_id") == arrow_dg.get("user_id") + assert dict_dg.get("nonexistent", "default") == arrow_dg.get( + "nonexistent", "default" + ) + + def test_consistent_structural_info(self, equivalent_datagrams): + """Test that structural information is consistent.""" + dict_dg, arrow_dg = equivalent_datagrams + + # keys + assert set(dict_dg.keys()) == set(arrow_dg.keys()) + assert set(dict_dg.keys(include_meta_columns=True)) == set( + arrow_dg.keys(include_meta_columns=True) + ) + + # meta_columns + assert set(dict_dg.meta_columns) == set(arrow_dg.meta_columns) + + # types (basic structure, not exact types due to inference differences) + dict_types = dict_dg.types() + arrow_types = arrow_dg.types() + assert set(dict_types.keys()) == set(arrow_types.keys()) + + def test_consistent_meta_operations(self, equivalent_datagrams): + """Test that meta operations are consistent.""" + dict_dg, arrow_dg = equivalent_datagrams + + # get_meta_value + assert dict_dg.get_meta_value("version") == arrow_dg.get_meta_value("version") + assert dict_dg.get_meta_value( + "nonexistent", "default" + ) == arrow_dg.get_meta_value("nonexistent", "default") + + def test_consistent_data_operations(self, equivalent_datagrams): + """Test that data operations produce consistent results.""" + dict_dg, arrow_dg = equivalent_datagrams + + # select + dict_selected = dict_dg.select("user_id", "name") + arrow_selected = arrow_dg.select("user_id", "name") + + assert set(dict_selected.keys()) == set(arrow_selected.keys()) + assert dict_selected["user_id"] == arrow_selected["user_id"] + assert dict_selected["name"] == arrow_selected["name"] + + # update + dict_updated = dict_dg.update(score=95.0) + arrow_updated = arrow_dg.update(score=95.0) + + assert dict_updated["score"] == arrow_updated["score"] + assert dict_updated["user_id"] == arrow_updated["user_id"] # Unchanged + + def test_consistent_format_conversions(self, equivalent_datagrams): + """Test that format conversions are consistent.""" + dict_dg, arrow_dg = equivalent_datagrams + + # as_dict + dict_as_dict = dict_dg.as_dict() + arrow_as_dict = arrow_dg.as_dict() + + assert dict_as_dict == arrow_as_dict + + # as_table + dict_as_table = dict_dg.as_table() + arrow_as_table = arrow_dg.as_table() + + assert dict_as_table.column_names == arrow_as_table.column_names + assert len(dict_as_table) == len(arrow_as_table) + + +class TestDatagramPerformance: + """Test performance characteristics of different implementations.""" + + def test_memory_efficiency(self): + """Test memory efficiency considerations.""" + # Create large-ish data + n_cols = 100 + data = {f"col_{i}": [i * 1.5] for 
i in range(n_cols)} + + # Dict implementation + dict_datagram = DictDatagram(data) + + # Arrow implementation - get data in correct format from dict datagram + arrow_data = dict_datagram.as_dict() + # Convert scalar values to single-element lists for PyArrow + arrow_data_lists = {k: [v] for k, v in arrow_data.items()} + table = pa.Table.from_pydict(arrow_data_lists) + arrow_datagram = ArrowDatagram(table) + + # Both should handle the data efficiently + assert len(dict_datagram.keys()) == n_cols + assert len(arrow_datagram.keys()) == n_cols + + # Verify data integrity - both should have consistent data + # Note: The original data has lists, so both implementations should handle lists consistently + assert dict_datagram["col_50"] == [ + 75.0 + ] # DictDatagram preserves list structure + assert arrow_datagram["col_50"] == [ + 75.0 + ] # ArrowDatagram also preserves list structure + + def test_caching_behavior(self): + """Test caching behavior across implementations.""" + data = {"user_id": [123], "name": ["Alice"]} # Lists for PyArrow + + # Test dict caching + dict_datagram = DictDatagram(data) + dict1 = dict_datagram.as_dict() + dict2 = dict_datagram.as_dict() + # Dict implementation may or may not cache, but should be consistent + assert dict1 == dict2 + + # Test arrow caching + table = pa.Table.from_pydict(data) + arrow_datagram = ArrowDatagram(table) + arrow_dict1 = arrow_datagram.as_dict() + arrow_dict2 = arrow_datagram.as_dict() + # Arrow implementation should cache + assert arrow_dict1 == arrow_dict2 # Same content + # Note: ArrowDatagram returns copies for safety, not identical objects + + def test_operation_efficiency(self): + """Test efficiency of common operations.""" + # Create moderately sized data + data = {f"col_{i}": [i] for i in range(50)} # Lists for PyArrow + + dict_datagram = DictDatagram(data) + table = pa.Table.from_pydict(data) + arrow_datagram = ArrowDatagram(table) + + # Select operations should be efficient + dict_selected = dict_datagram.select("col_0", "col_25", "col_49") + arrow_selected = arrow_datagram.select("col_0", "col_25", "col_49") + + assert len(dict_selected.keys()) == 3 + assert len(arrow_selected.keys()) == 3 + + # Update operations should be efficient + dict_updated = dict_datagram.update(col_25=999) + arrow_updated = arrow_datagram.update(col_25=999) + + assert dict_updated["col_25"] == 999 + assert arrow_updated["col_25"] == 999 + + +class TestDatagramErrorHandling: + """Test error handling consistency across implementations.""" + + def test_consistent_key_errors(self): + """Test that KeyError handling is consistent.""" + data = {"user_id": [123], "name": ["Alice"]} # Lists for PyArrow + + dict_datagram = DictDatagram(data) + table = pa.Table.from_pydict(data) + arrow_datagram = ArrowDatagram(table) + + # Both should raise KeyError for missing keys + with pytest.raises(KeyError): + _ = dict_datagram["nonexistent"] + + with pytest.raises(KeyError): + _ = arrow_datagram["nonexistent"] + + def test_consistent_operation_errors(self): + """Test that operation errors are consistent.""" + data = {"user_id": [123], "name": ["Alice"]} # Lists for PyArrow + + dict_datagram = DictDatagram(data) + table = pa.Table.from_pydict(data) + arrow_datagram = ArrowDatagram(table) + + # Both should raise appropriate errors for invalid operations + with pytest.raises((KeyError, ValueError)): + dict_datagram.select("nonexistent") + + with pytest.raises((KeyError, ValueError)): + arrow_datagram.select("nonexistent") + + with pytest.raises(KeyError): + 
dict_datagram.update(nonexistent="value") + + with pytest.raises(KeyError): + arrow_datagram.update(nonexistent="value") + + def test_consistent_validation(self): + """Test that validation is consistent.""" + data = {"user_id": [123], "name": ["Alice"]} # Lists for PyArrow + + dict_datagram = DictDatagram(data) + table = pa.Table.from_pydict(data) + arrow_datagram = ArrowDatagram(table) + + # Both should handle edge cases consistently + # Test that empty select behavior is consistent (may select all or raise error) + try: + dict_result = dict_datagram.select() + arrow_result = arrow_datagram.select() + # If both succeed, they should have the same keys + assert set(dict_result.keys()) == set(arrow_result.keys()) + except (ValueError, TypeError): + # If one raises an error, both should raise similar errors + with pytest.raises((ValueError, TypeError)): + arrow_datagram.select() + + # Note: The important thing is that both behave the same way diff --git a/tests/test_data/test_datagrams/test_dict_datagram.py b/tests/test_data/test_datagrams/test_dict_datagram.py new file mode 100644 index 0000000..0e1af69 --- /dev/null +++ b/tests/test_data/test_datagrams/test_dict_datagram.py @@ -0,0 +1,765 @@ +""" +Comprehensive tests for DictDatagram class. + +This module tests all functionality of the DictDatagram class including: +- Initialization and validation +- Dict-like interface operations +- Structural information methods +- Format conversion methods +- Meta column operations +- Data column operations +- Context operations +- Utility operations +""" + +import pytest +import pyarrow as pa + +from orcapod.data.datagrams import DictDatagram +from orcapod.data.system_constants import constants + + +class TestDictDatagramInitialization: + """Test DictDatagram initialization and basic properties.""" + + def test_basic_initialization(self): + """Test basic initialization with simple data.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + datagram = DictDatagram(data) + + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + assert datagram["score"] == 85.5 + + def test_initialization_with_meta_info(self): + """Test initialization with meta information.""" + data = {"user_id": 123, "name": "Alice"} + meta_info = {"__pipeline_version": "v1.0", "__timestamp": "2024-01-01"} + + datagram = DictDatagram(data, meta_info=meta_info) + + assert datagram["user_id"] == 123 + assert datagram.get_meta_value("pipeline_version") == "v1.0" + assert datagram.get_meta_value("timestamp") == "2024-01-01" + + def test_initialization_with_context_in_data(self): + """Test initialization when context is included in data.""" + data = {"user_id": 123, "name": "Alice", constants.CONTEXT_KEY: "v0.1"} + + datagram = DictDatagram(data) + + # The context key is transformed to include full context path + assert "v0.1" in datagram.data_context_key + assert constants.CONTEXT_KEY not in datagram._data + + def test_initialization_with_meta_columns_in_data(self): + """Test initialization when meta columns are included in data.""" + data = { + "user_id": 123, + "name": "Alice", + "__version": "1.0", + "__timestamp": "2024-01-01", + } + + datagram = DictDatagram(data) + + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + assert datagram.get_meta_value("version") == "1.0" + assert datagram.get_meta_value("timestamp") == "2024-01-01" + # Meta columns should not be in regular data + assert "__version" not in datagram._data + assert "__timestamp" not in datagram._data + + def 
test_initialization_with_python_schema(self): + """Test initialization with explicit Python schema.""" + data = {"user_id": "123", "score": "85.5"} # String values + python_schema = {"user_id": int, "score": float} + + datagram = DictDatagram(data, python_schema=python_schema) + + # Data should be stored as provided (conversion happens during export) + assert datagram["user_id"] == "123" + assert datagram["score"] == "85.5" + + def test_empty_data_initialization(self): + """Test initialization with empty data succeeds.""" + # Empty data should be allowed in OrcaPod + data = {} + datagram = DictDatagram(data) + + assert len(datagram.keys()) == 0 + assert datagram.as_dict() == {} + + +class TestDictDatagramDictInterface: + """Test dict-like interface methods.""" + + @pytest.fixture + def sample_datagram(self): + """Create a sample datagram for testing.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5, "active": True} + return DictDatagram(data) + + def test_getitem(self, sample_datagram): + """Test __getitem__ method.""" + assert sample_datagram["user_id"] == 123 + assert sample_datagram["name"] == "Alice" + assert sample_datagram["score"] == 85.5 + assert sample_datagram["active"] is True + + def test_getitem_missing_key(self, sample_datagram): + """Test __getitem__ with missing key raises KeyError.""" + with pytest.raises(KeyError): + _ = sample_datagram["nonexistent"] + + def test_contains(self, sample_datagram): + """Test __contains__ method.""" + assert "user_id" in sample_datagram + assert "name" in sample_datagram + assert "nonexistent" not in sample_datagram + + def test_iter(self, sample_datagram): + """Test __iter__ method.""" + keys = list(sample_datagram) + expected_keys = {"user_id", "name", "score", "active"} + assert set(keys) == expected_keys + + def test_get(self, sample_datagram): + """Test get method.""" + assert sample_datagram.get("user_id") == 123 + assert sample_datagram.get("nonexistent") is None + assert sample_datagram.get("nonexistent", "default") == "default" + + +class TestDictDatagramStructuralInfo: + """Test structural information methods.""" + + @pytest.fixture + def datagram_with_meta(self): + """Create a datagram with meta data for testing.""" + data = { + "user_id": 123, + "name": "Alice", + "__version": "1.0", + "__pipeline_id": "test_pipeline", + } + return DictDatagram(data, data_context="v0.1") + + def test_keys_data_only(self, datagram_with_meta): + """Test keys method with data columns only.""" + keys = datagram_with_meta.keys() + expected_keys = {"user_id", "name"} + assert set(keys) == expected_keys + + def test_keys_with_meta_columns(self, datagram_with_meta): + """Test keys method including meta columns.""" + keys = datagram_with_meta.keys(include_meta_columns=True) + expected_keys = {"user_id", "name", "__version", "__pipeline_id"} + assert set(keys) == expected_keys + + def test_keys_with_context(self, datagram_with_meta): + """Test keys method including context.""" + keys = datagram_with_meta.keys(include_context=True) + expected_keys = {"user_id", "name", constants.CONTEXT_KEY} + assert set(keys) == expected_keys + + def test_keys_with_all_info(self, datagram_with_meta): + """Test keys method including all information.""" + keys = datagram_with_meta.keys(include_meta_columns=True, include_context=True) + expected_keys = { + "user_id", + "name", + "__version", + "__pipeline_id", + constants.CONTEXT_KEY, + } + assert set(keys) == expected_keys + + def test_keys_with_specific_meta_prefix(self, datagram_with_meta): + """Test keys method 
with specific meta columns.""" + # Test selecting specific meta columns by getting all first + all_keys_with_meta = datagram_with_meta.keys(include_meta_columns=True) + + # Should include data columns and meta columns + expected_keys = {"user_id", "name", "__version", "__pipeline_id"} + assert set(all_keys_with_meta) == expected_keys + + def test_types_data_only(self, datagram_with_meta): + """Test types method with data columns only.""" + types = datagram_with_meta.types() + expected_keys = {"user_id", "name"} + assert set(types.keys()) == expected_keys + assert types["user_id"] is int + assert types["name"] is str + + def test_types_with_meta_columns(self, datagram_with_meta): + """Test types method including meta columns.""" + types = datagram_with_meta.types(include_meta_columns=True) + expected_keys = {"user_id", "name", "__version", "__pipeline_id"} + assert set(types.keys()) == expected_keys + assert types["__version"] is str + assert types["__pipeline_id"] is str + + def test_types_with_context(self, datagram_with_meta): + """Test types method including context.""" + types = datagram_with_meta.types(include_context=True) + expected_keys = {"user_id", "name", constants.CONTEXT_KEY} + assert set(types.keys()) == expected_keys + assert types[constants.CONTEXT_KEY] is str + + def test_arrow_schema_data_only(self, datagram_with_meta): + """Test arrow_schema method with data columns only.""" + schema = datagram_with_meta.arrow_schema() + expected_names = {"user_id", "name"} + assert set(schema.names) == expected_names + # Access field by name, not index + assert schema.field("user_id").type == pa.int64() + assert schema.field("name").type == pa.large_string() + + def test_arrow_schema_with_meta_columns(self, datagram_with_meta): + """Test arrow_schema method including meta columns.""" + schema = datagram_with_meta.arrow_schema(include_meta_columns=True) + expected_names = {"user_id", "name", "__version", "__pipeline_id"} + assert set(schema.names) == expected_names + assert schema.field("__version").type == pa.large_string() + assert schema.field("__pipeline_id").type == pa.large_string() + + def test_arrow_schema_with_context(self, datagram_with_meta): + """Test arrow_schema method including context.""" + schema = datagram_with_meta.arrow_schema(include_context=True) + expected_names = {"user_id", "name", constants.CONTEXT_KEY} + assert set(schema.names) == expected_names + + def test_content_hash(self, datagram_with_meta): + """Test content hash calculation.""" + hash1 = datagram_with_meta.content_hash() + hash2 = datagram_with_meta.content_hash() + + # Hash should be consistent + assert hash1 == hash2 + assert isinstance(hash1, str) + assert len(hash1) > 0 + + def test_content_hash_different_data(self): + """Test content hash is different for different data.""" + datagram1 = DictDatagram({"user_id": 123, "name": "Alice"}) + datagram2 = DictDatagram({"user_id": 456, "name": "Bob"}) + + hash1 = datagram1.content_hash() + hash2 = datagram2.content_hash() + + assert hash1 != hash2 + + +class TestDictDatagramFormatConversions: + """Test format conversion methods.""" + + @pytest.fixture + def datagram_with_all(self): + """Create a datagram with data, meta, and context.""" + data = { + "user_id": 123, + "name": "Alice", + "__version": "1.0", + constants.CONTEXT_KEY: "v0.1", + } + return DictDatagram(data) + + def test_as_dict_data_only(self, datagram_with_all): + """Test as_dict method with data columns only.""" + result = datagram_with_all.as_dict() + expected = {"user_id": 123, "name": 
"Alice"} + assert result == expected + + def test_as_dict_with_meta_columns(self, datagram_with_all): + """Test as_dict method including meta columns.""" + result = datagram_with_all.as_dict(include_meta_columns=True) + expected = {"user_id": 123, "name": "Alice", "__version": "1.0"} + assert result == expected + + def test_as_dict_with_context(self, datagram_with_all): + """Test as_dict method including context.""" + result = datagram_with_all.as_dict(include_context=True) + expected_user_id = 123 + expected_name = "Alice" + + assert result["user_id"] == expected_user_id + assert result["name"] == expected_name + # Context key should be present but value might be transformed + assert constants.CONTEXT_KEY in result + + def test_as_dict_with_all_info(self, datagram_with_all): + """Test as_dict method including all information.""" + result = datagram_with_all.as_dict( + include_meta_columns=True, include_context=True + ) + + assert result["user_id"] == 123 + assert result["name"] == "Alice" + assert result["__version"] == "1.0" + # Context key should be present but value might be transformed + assert constants.CONTEXT_KEY in result + + def test_as_table_data_only(self, datagram_with_all): + """Test as_table method with data columns only.""" + table = datagram_with_all.as_table() + + assert table.num_rows == 1 + assert set(table.column_names) == {"user_id", "name"} + assert table["user_id"].to_pylist() == [123] + assert table["name"].to_pylist() == ["Alice"] + + def test_as_table_with_meta_columns(self, datagram_with_all): + """Test as_table method including meta columns.""" + table = datagram_with_all.as_table(include_meta_columns=True) + + assert table.num_rows == 1 + expected_columns = {"user_id", "name", "__version"} + assert set(table.column_names) == expected_columns + assert table["__version"].to_pylist() == ["1.0"] + + def test_as_table_with_context(self, datagram_with_all): + """Test as_table method including context.""" + table = datagram_with_all.as_table(include_context=True) + + assert table.num_rows == 1 + expected_columns = {"user_id", "name", constants.CONTEXT_KEY} + assert set(table.column_names) == expected_columns + # Context value might be transformed, just check it exists + assert len(table[constants.CONTEXT_KEY].to_pylist()) == 1 + + def test_as_arrow_compatible_dict(self, datagram_with_all): + """Test as_arrow_compatible_dict method.""" + result = datagram_with_all.as_arrow_compatible_dict() + + # Should be dict with list values suitable for PyArrow + assert isinstance(result, dict) + # The method returns single values, not lists for single-row data + assert result["user_id"] == 123 + assert result["name"] == "Alice" + + +class TestDictDatagramMetaOperations: + """Test meta column operations.""" + + @pytest.fixture + def datagram_with_meta(self): + """Create a datagram with meta columns.""" + data = { + "user_id": 123, + "name": "Alice", + "__version": "1.0", + "__pipeline_id": "test_pipeline", + "__timestamp": "2024-01-01", + } + return DictDatagram(data) + + def test_meta_columns_property(self, datagram_with_meta): + """Test meta_columns property.""" + meta_columns = datagram_with_meta.meta_columns + expected = {"__version", "__pipeline_id", "__timestamp"} + assert set(meta_columns) == expected + + def test_get_meta_value(self, datagram_with_meta): + """Test get_meta_value method.""" + assert datagram_with_meta.get_meta_value("version") == "1.0" + assert datagram_with_meta.get_meta_value("pipeline_id") == "test_pipeline" + assert 
datagram_with_meta.get_meta_value("timestamp") == "2024-01-01" + assert datagram_with_meta.get_meta_value("nonexistent") is None + assert datagram_with_meta.get_meta_value("nonexistent", "default") == "default" + + def test_with_meta_columns(self, datagram_with_meta): + """Test with_meta_columns method.""" + new_datagram = datagram_with_meta.with_meta_columns( + new_meta="new_value", updated_version="2.0" + ) + + # Original should be unchanged + assert datagram_with_meta.get_meta_value("version") == "1.0" + assert datagram_with_meta.get_meta_value("new_meta") is None + + # New datagram should have updates + assert new_datagram.get_meta_value("version") == "1.0" # unchanged + assert new_datagram.get_meta_value("updated_version") == "2.0" # new + assert new_datagram.get_meta_value("new_meta") == "new_value" # new + + def test_with_meta_columns_prefixed_keys(self, datagram_with_meta): + """Test with_meta_columns method with already prefixed keys.""" + new_datagram = datagram_with_meta.with_meta_columns( + **{"__direct_meta": "direct_value"} + ) + + assert new_datagram.get_meta_value("direct_meta") == "direct_value" + + def test_drop_meta_columns(self, datagram_with_meta): + """Test drop_meta_columns method.""" + new_datagram = datagram_with_meta.drop_meta_columns("version", "timestamp") + + # Original should be unchanged + assert datagram_with_meta.get_meta_value("version") == "1.0" + assert datagram_with_meta.get_meta_value("timestamp") == "2024-01-01" + + # New datagram should have dropped columns + assert new_datagram.get_meta_value("version") is None + assert new_datagram.get_meta_value("timestamp") is None + assert ( + new_datagram.get_meta_value("pipeline_id") == "test_pipeline" + ) # unchanged + + def test_drop_meta_columns_prefixed(self, datagram_with_meta): + """Test drop_meta_columns method with prefixed keys.""" + new_datagram = datagram_with_meta.drop_meta_columns("__version") + + assert new_datagram.get_meta_value("version") is None + assert ( + new_datagram.get_meta_value("pipeline_id") == "test_pipeline" + ) # unchanged + + def test_drop_meta_columns_multiple(self, datagram_with_meta): + """Test dropping multiple meta columns.""" + new_datagram = datagram_with_meta.drop_meta_columns("version", "pipeline_id") + + assert new_datagram.get_meta_value("version") is None + assert new_datagram.get_meta_value("pipeline_id") is None + assert new_datagram.get_meta_value("timestamp") == "2024-01-01" # unchanged + + def test_drop_meta_columns_missing_key(self, datagram_with_meta): + """Test drop_meta_columns with missing key raises KeyError.""" + with pytest.raises(KeyError): + datagram_with_meta.drop_meta_columns("nonexistent") + + def test_drop_meta_columns_ignore_missing(self, datagram_with_meta): + """Test drop_meta_columns with ignore_missing=True.""" + new_datagram = datagram_with_meta.drop_meta_columns( + "version", "nonexistent", ignore_missing=True + ) + + assert new_datagram.get_meta_value("version") is None + assert ( + new_datagram.get_meta_value("pipeline_id") == "test_pipeline" + ) # unchanged + + +class TestDictDatagramDataOperations: + """Test data column operations.""" + + @pytest.fixture + def sample_datagram(self): + """Create a sample datagram for testing.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5, "active": True} + return DictDatagram(data) + + def test_select(self, sample_datagram): + """Test select method.""" + new_datagram = sample_datagram.select("user_id", "name") + + assert set(new_datagram.keys()) == {"user_id", "name"} + assert 
new_datagram["user_id"] == 123 + assert new_datagram["name"] == "Alice" + + # Original should be unchanged + assert len(sample_datagram.keys()) == 4 + + def test_select_single_column(self, sample_datagram): + """Test select method with single column.""" + new_datagram = sample_datagram.select("user_id") + + assert list(new_datagram.keys()) == ["user_id"] + assert new_datagram["user_id"] == 123 + + def test_select_missing_column(self, sample_datagram): + """Test select method with missing column raises KeyError.""" + with pytest.raises(KeyError): + sample_datagram.select("user_id", "nonexistent") + + def test_drop(self, sample_datagram): + """Test drop method.""" + new_datagram = sample_datagram.drop("score", "active") + + assert set(new_datagram.keys()) == {"user_id", "name"} + assert new_datagram["user_id"] == 123 + assert new_datagram["name"] == "Alice" + + # Original should be unchanged + assert len(sample_datagram.keys()) == 4 + + def test_drop_single_column(self, sample_datagram): + """Test drop method with single column.""" + new_datagram = sample_datagram.drop("score") + + expected_keys = {"user_id", "name", "active"} + assert set(new_datagram.keys()) == expected_keys + + def test_drop_missing_column(self, sample_datagram): + """Test drop method with missing column raises KeyError.""" + with pytest.raises(KeyError): + sample_datagram.drop("nonexistent") + + def test_drop_ignore_missing(self, sample_datagram): + """Test drop method with ignore_missing=True.""" + new_datagram = sample_datagram.drop("score", "nonexistent", ignore_missing=True) + + expected_keys = {"user_id", "name", "active"} + assert set(new_datagram.keys()) == expected_keys + + def test_drop_all_columns_fails(self, sample_datagram): + """Test dropping all columns raises appropriate error.""" + with pytest.raises(ValueError): + sample_datagram.drop("user_id", "name", "score", "active") + + def test_rename(self, sample_datagram): + """Test rename method.""" + new_datagram = sample_datagram.rename({"user_id": "id", "name": "full_name"}) + + expected_keys = {"id", "full_name", "score", "active"} + assert set(new_datagram.keys()) == expected_keys + assert new_datagram["id"] == 123 + assert new_datagram["full_name"] == "Alice" + + # Original should be unchanged + assert "user_id" in sample_datagram.keys() + assert "name" in sample_datagram.keys() + + def test_rename_empty_mapping(self, sample_datagram): + """Test rename method with empty mapping returns new instance.""" + new_datagram = sample_datagram.rename({}) + + # Should return new instance with same data + assert new_datagram is not sample_datagram + assert new_datagram.as_dict() == sample_datagram.as_dict() + + def test_update(self, sample_datagram): + """Test update method.""" + new_datagram = sample_datagram.update(score=95.0, active=False) + + assert new_datagram["score"] == 95.0 + assert new_datagram["active"] is False + assert new_datagram["user_id"] == 123 # unchanged + assert new_datagram["name"] == "Alice" # unchanged + + # Original should be unchanged + assert sample_datagram["score"] == 85.5 + assert sample_datagram["active"] is True + + def test_update_missing_column(self, sample_datagram): + """Test update method with missing column raises KeyError.""" + with pytest.raises(KeyError): + sample_datagram.update(nonexistent="value") + + def test_update_empty(self, sample_datagram): + """Test update method with no updates returns same instance.""" + new_datagram = sample_datagram.update() + + assert new_datagram is sample_datagram + + def 
test_with_columns(self, sample_datagram): + """Test with_columns method.""" + new_datagram = sample_datagram.with_columns(grade="A", rank=1) + + expected_keys = {"user_id", "name", "score", "active", "grade", "rank"} + assert set(new_datagram.keys()) == expected_keys + assert new_datagram["grade"] == "A" + assert new_datagram["rank"] == 1 + assert new_datagram["user_id"] == 123 # unchanged + + # Original should be unchanged + assert len(sample_datagram.keys()) == 4 + + def test_with_columns_with_types(self, sample_datagram): + """Test with_columns method with type specification.""" + new_datagram = sample_datagram.with_columns( + grade="A", rank=1, python_schema={"grade": str, "rank": int} + ) + + assert new_datagram["grade"] == "A" + assert new_datagram["rank"] == 1 + + def test_with_columns_existing_column_fails(self, sample_datagram): + """Test with_columns method with existing column raises ValueError.""" + with pytest.raises(ValueError): + sample_datagram.with_columns(user_id=456) + + def test_with_columns_empty(self, sample_datagram): + """Test with_columns method with no columns returns same instance.""" + new_datagram = sample_datagram.with_columns() + + assert new_datagram is sample_datagram + + +class TestDictDatagramContextOperations: + """Test context operations.""" + + def test_with_context_key(self): + """Test with_context_key method.""" + data = {"user_id": 123, "name": "Alice"} + original_datagram = DictDatagram(data, data_context="v0.1") + + new_datagram = original_datagram.with_context_key("v0.1") + + # Both should have the full context key + assert "v0.1" in original_datagram.data_context_key + assert "v0.1" in new_datagram.data_context_key + assert new_datagram["user_id"] == 123 # data unchanged + + +class TestDictDatagramUtilityOperations: + """Test utility operations.""" + + @pytest.fixture + def sample_datagram(self): + """Create a sample datagram for testing.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + return DictDatagram(data) + + def test_copy_with_cache(self, sample_datagram): + """Test copy method preserves cache.""" + # Access something to populate cache + _ = sample_datagram.as_dict() + + copied = sample_datagram.copy() + + assert copied is not sample_datagram + assert copied.as_dict() == sample_datagram.as_dict() + + def test_copy_without_cache(self, sample_datagram): + """Test copy method without cache.""" + copied = sample_datagram.copy() + + assert copied is not sample_datagram + assert copied.as_dict() == sample_datagram.as_dict() + + def test_str_representation(self, sample_datagram): + """Test string representation.""" + str_repr = str(sample_datagram) + + # The string representation might be the dict itself + assert "user_id" in str_repr + assert "123" in str_repr + + def test_repr_representation(self, sample_datagram): + """Test repr representation.""" + repr_str = repr(sample_datagram) + + # The repr might be the dict itself + assert "user_id" in repr_str + assert "123" in repr_str + + +class TestDictDatagramEdgeCases: + """Test edge cases and error conditions.""" + + def test_none_values(self): + """Test handling of None values.""" + data = {"user_id": 123, "name": None, "score": 85.5} + datagram = DictDatagram(data) + + assert datagram["user_id"] == 123 + assert datagram["name"] is None + assert datagram["score"] == 85.5 + + def test_complex_data_types(self): + """Test handling of complex data types.""" + data = { + "user_id": 123, + "tags": ["tag1", "tag2"], + "metadata": {"key": "value"}, + "score": 85.5, + } + datagram = 
DictDatagram(data) + + assert datagram["user_id"] == 123 + assert datagram["tags"] == ["tag1", "tag2"] + assert datagram["metadata"] == {"key": "value"} + + def test_unicode_strings(self): + """Test handling of Unicode strings.""" + data = {"user_id": 123, "name": "Алиса", "emoji": "😊"} + datagram = DictDatagram(data) + + assert datagram["name"] == "Алиса" + assert datagram["emoji"] == "😊" + + def test_large_numbers(self): + """Test handling of large numbers.""" + data = { + "user_id": 123, + "large_int": 9223372036854775807, # Max int64 + "large_float": 1.7976931348623157e308, # Near max float64 + } + datagram = DictDatagram(data) + + assert datagram["large_int"] == 9223372036854775807 + assert datagram["large_float"] == 1.7976931348623157e308 + + def test_duplicate_operations(self): + """Test that duplicate operations are idempotent.""" + data = {"user_id": 123, "name": "Alice"} + datagram = DictDatagram(data) + + # Multiple selects should be the same + selected1 = datagram.select("user_id") + selected2 = datagram.select("user_id") + + assert selected1.as_dict() == selected2.as_dict() + + +class TestDictDatagramIntegration: + """Test integration with other components.""" + + def test_chained_operations(self): + """Test chaining multiple operations.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5, "active": True} + datagram = DictDatagram(data) + + result = ( + datagram.update(score=95.0) + .with_columns(grade="A") + .drop("active") + .rename({"user_id": "id"}) + ) + + expected_keys = {"id", "name", "score", "grade"} + assert set(result.keys()) == expected_keys + assert result["id"] == 123 + assert result["score"] == 95.0 + assert result["grade"] == "A" + + def test_arrow_roundtrip(self): + """Test conversion to Arrow and back.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + original = DictDatagram(data) + + # Convert to Arrow table and back + table = original.as_table() + arrow_dict = table.to_pydict() + + # Convert dict format back to DictDatagram compatible format + converted_dict = {k: v[0] for k, v in arrow_dict.items()} + reconstructed = DictDatagram(converted_dict) + + # Should preserve data + assert reconstructed["user_id"] == original["user_id"] + assert reconstructed["name"] == original["name"] + assert reconstructed["score"] == original["score"] + + def test_mixed_include_options(self): + """Test various combinations of include options.""" + data = { + "user_id": 123, + "name": "Alice", + "__version": "1.0", + constants.CONTEXT_KEY: "v0.1", + } + datagram = DictDatagram(data) + + # Test all combinations + data_only = datagram.as_dict() + with_meta = datagram.as_dict(include_meta_columns=True) + with_context = datagram.as_dict(include_context=True) + with_all = datagram.as_dict(include_meta_columns=True, include_context=True) + + assert len(data_only) == 2 # user_id, name + assert len(with_meta) == 3 # + __version + assert len(with_context) == 3 # + context + assert len(with_all) == 4 # + both diff --git a/tests/test_data/test_datagrams/test_dict_tag_packet.py b/tests/test_data/test_datagrams/test_dict_tag_packet.py new file mode 100644 index 0000000..57933c2 --- /dev/null +++ b/tests/test_data/test_datagrams/test_dict_tag_packet.py @@ -0,0 +1,566 @@ +""" +Comprehensive tests for DictTag and DictPacket classes. 
+ +This module tests all functionality of the dictionary-based tag and packet classes including: +- Tag-specific functionality (system tags) +- Packet-specific functionality (source info) +- Integration with base datagram functionality +- Conversion operations +""" + +import pytest + +from orcapod.data.datagrams import DictTag, DictPacket +from orcapod.data.system_constants import constants + + +class TestDictTagInitialization: + """Test DictTag initialization and basic properties.""" + + def test_basic_initialization(self): + """Test basic initialization with simple data.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + tag = DictTag(data) + + assert tag["user_id"] == 123 + assert tag["name"] == "Alice" + assert tag["score"] == 85.5 + + def test_initialization_with_system_tags(self): + """Test initialization with system tags.""" + data = {"user_id": 123, "name": "Alice"} + system_tags = {"tag_type": "user", "created_by": "system"} + + tag = DictTag(data, system_tags=system_tags) + + assert tag["user_id"] == 123 + system_tag_dict = tag.system_tags() + assert system_tag_dict["tag_type"] == "user" + assert system_tag_dict["created_by"] == "system" + + def test_initialization_with_system_tags_in_data(self): + """Test initialization when system tags are included in data.""" + data = { + "user_id": 123, + "name": "Alice", + f"{constants.SYSTEM_TAG_PREFIX}tag_type": "user", + f"{constants.SYSTEM_TAG_PREFIX}version": "1.0", + } + + tag = DictTag(data) + + assert tag["user_id"] == 123 + assert tag["name"] == "Alice" + + system_tags = tag.system_tags() + assert system_tags[f"{constants.SYSTEM_TAG_PREFIX}tag_type"] == "user" + assert system_tags[f"{constants.SYSTEM_TAG_PREFIX}version"] == "1.0" + + def test_initialization_mixed_system_tags(self): + """Test initialization with both embedded and explicit system tags.""" + data = {"user_id": 123, f"{constants.SYSTEM_TAG_PREFIX}embedded": "value1"} + system_tags = {"explicit": "value2"} + + tag = DictTag(data, system_tags=system_tags) + + system_tag_dict = tag.system_tags() + assert system_tag_dict[f"{constants.SYSTEM_TAG_PREFIX}embedded"] == "value1" + assert system_tag_dict["explicit"] == "value2" + + +class TestDictTagSystemTagOperations: + """Test system tag specific operations.""" + + @pytest.fixture + def sample_tag(self): + """Create a sample tag for testing.""" + data = {"user_id": 123, "name": "Alice"} + system_tags = {"tag_type": "user", "version": "1.0"} + return DictTag(data, system_tags=system_tags) + + def test_system_tags_method(self, sample_tag): + """Test system_tags method.""" + system_tags = sample_tag.system_tags() + + assert isinstance(system_tags, dict) + assert system_tags["tag_type"] == "user" + assert system_tags["version"] == "1.0" + + def test_keys_with_system_tags(self, sample_tag): + """Test keys method including system tags.""" + keys_data_only = sample_tag.keys() + keys_with_system = sample_tag.keys(include_system_tags=True) + + assert "user_id" in keys_data_only + assert "name" in keys_data_only + assert len(keys_with_system) > len(keys_data_only) + assert "tag_type" in keys_with_system + assert "version" in keys_with_system + + def test_types_with_system_tags(self, sample_tag): + """Test types method including system tags.""" + types_data_only = sample_tag.types() + types_with_system = sample_tag.types(include_system_tags=True) + + assert len(types_with_system) > len(types_data_only) + assert "tag_type" in types_with_system + assert "version" in types_with_system + + def 
test_arrow_schema_with_system_tags(self, sample_tag): + """Test arrow_schema method including system tags.""" + schema_data_only = sample_tag.arrow_schema() + schema_with_system = sample_tag.arrow_schema(include_system_tags=True) + + assert len(schema_with_system) > len(schema_data_only) + assert "tag_type" in schema_with_system.names + assert "version" in schema_with_system.names + + def test_as_dict_with_system_tags(self, sample_tag): + """Test as_dict method including system tags.""" + dict_data_only = sample_tag.as_dict() + dict_with_system = sample_tag.as_dict(include_system_tags=True) + + assert "user_id" in dict_data_only + assert "name" in dict_data_only + assert "tag_type" not in dict_data_only + + assert "user_id" in dict_with_system + assert "tag_type" in dict_with_system + assert "version" in dict_with_system + + def test_as_table_with_system_tags(self, sample_tag): + """Test as_table method including system tags.""" + table_data_only = sample_tag.as_table() + table_with_system = sample_tag.as_table(include_system_tags=True) + + assert len(table_with_system.column_names) > len(table_data_only.column_names) + assert "tag_type" in table_with_system.column_names + assert "version" in table_with_system.column_names + + def test_as_datagram_conversion(self, sample_tag): + """Test conversion to datagram.""" + datagram = sample_tag.as_datagram() + + # Should preserve data + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + + # Should not include system tags by default + assert "tag_type" not in datagram.keys() + + def test_as_datagram_with_system_tags(self, sample_tag): + """Test conversion to datagram including system tags.""" + datagram = sample_tag.as_datagram(include_system_tags=True) + + # Should preserve data and include system tags + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + assert "tag_type" in datagram.keys() + + +class TestDictPacketInitialization: + """Test DictPacket initialization and basic properties.""" + + def test_basic_initialization(self): + """Test basic initialization with simple data.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + packet = DictPacket(data) + + assert packet["user_id"] == 123 + assert packet["name"] == "Alice" + assert packet["score"] == 85.5 + + def test_initialization_with_source_info(self): + """Test initialization with source info.""" + data = {"user_id": 123, "name": "Alice"} + source_info = {"user_id": "database", "name": "user_input"} + + packet = DictPacket(data, source_info=source_info) + + assert packet["user_id"] == 123 + source_dict = packet.source_info() + assert source_dict["user_id"] == "database" + assert source_dict["name"] == "user_input" + + def test_initialization_with_source_info_in_data(self): + """Test initialization when source info is included in data.""" + data = { + "user_id": 123, + "name": "Alice", + f"{constants.SOURCE_PREFIX}user_id": "database", + f"{constants.SOURCE_PREFIX}name": "user_input", + } + + packet = DictPacket(data) + + assert packet["user_id"] == 123 + assert packet["name"] == "Alice" + + source_info = packet.source_info() + assert source_info["user_id"] == "database" + assert source_info["name"] == "user_input" + + def test_initialization_mixed_source_info(self): + """Test initialization with both embedded and explicit source info.""" + data = { + "user_id": 123, + "name": "Alice", + f"{constants.SOURCE_PREFIX}user_id": "embedded_source", + } + source_info = {"name": "explicit_source"} + + packet = DictPacket(data, 
source_info=source_info) + + source_dict = packet.source_info() + assert source_dict["user_id"] == "embedded_source" + assert source_dict["name"] == "explicit_source" + + +class TestDictPacketSourceInfoOperations: + """Test source info specific operations.""" + + @pytest.fixture + def sample_packet(self): + """Create a sample packet for testing.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + source_info = { + "user_id": "database", + "name": "user_input", + "score": "calculation", + } + return DictPacket(data, source_info=source_info) + + def test_source_info_method(self, sample_packet): + """Test source_info method.""" + source_info = sample_packet.source_info() + + assert isinstance(source_info, dict) + assert source_info["user_id"] == "database" + assert source_info["name"] == "user_input" + assert source_info["score"] == "calculation" + + def test_source_info_with_missing_keys(self): + """Test source_info method when some keys are missing.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + source_info = {"user_id": "database"} # Only partial source info + + packet = DictPacket(data, source_info=source_info) + full_source_info = packet.source_info() + + assert full_source_info["user_id"] == "database" + assert full_source_info["name"] is None + assert full_source_info["score"] is None + + def test_with_source_info(self, sample_packet): + """Test with_source_info method.""" + updated = sample_packet.with_source_info( + user_id="new_database", name="new_input" + ) + + # Original should be unchanged + original_source = sample_packet.source_info() + assert original_source["user_id"] == "database" + + # Updated should have new values + updated_source = updated.source_info() + assert updated_source["user_id"] == "new_database" + assert updated_source["name"] == "new_input" + assert updated_source["score"] == "calculation" # Unchanged + + def test_keys_with_source_info(self, sample_packet): + """Test keys method including source info.""" + keys_data_only = sample_packet.keys() + keys_with_source = sample_packet.keys(include_source=True) + + assert "user_id" in keys_data_only + assert "name" in keys_data_only + assert len(keys_with_source) > len(keys_data_only) + + # Should include prefixed source columns + source_keys = [ + k for k in keys_with_source if k.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_keys) > 0 + + def test_types_with_source_info(self, sample_packet): + """Test types method including source info.""" + types_data_only = sample_packet.types() + types_with_source = sample_packet.types(include_source=True) + + assert len(types_with_source) > len(types_data_only) + + # Source columns should be string type + source_keys = [ + k for k in types_with_source.keys() if k.startswith(constants.SOURCE_PREFIX) + ] + for key in source_keys: + assert types_with_source[key] is str + + def test_arrow_schema_with_source_info(self, sample_packet): + """Test arrow_schema method including source info.""" + schema_data_only = sample_packet.arrow_schema() + schema_with_source = sample_packet.arrow_schema(include_source=True) + + assert len(schema_with_source) > len(schema_data_only) + + source_columns = [ + name + for name in schema_with_source.names + if name.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_columns) > 0 + + def test_as_dict_with_source_info(self, sample_packet): + """Test as_dict method including source info.""" + dict_data_only = sample_packet.as_dict() + dict_with_source = sample_packet.as_dict(include_source=True) + + assert 
"user_id" in dict_data_only + assert "name" in dict_data_only + assert not any( + k.startswith(constants.SOURCE_PREFIX) for k in dict_data_only.keys() + ) + + assert "user_id" in dict_with_source + source_keys = [ + k for k in dict_with_source.keys() if k.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_keys) > 0 + + def test_as_table_with_source_info(self, sample_packet): + """Test as_table method including source info.""" + table_data_only = sample_packet.as_table() + table_with_source = sample_packet.as_table(include_source=True) + + assert len(table_with_source.column_names) > len(table_data_only.column_names) + + source_columns = [ + name + for name in table_with_source.column_names + if name.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_columns) > 0 + + def test_as_datagram_conversion(self, sample_packet): + """Test conversion to datagram.""" + datagram = sample_packet.as_datagram() + + # Should preserve data + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + + # Should not include source info by default + assert not any(k.startswith(constants.SOURCE_PREFIX) for k in datagram.keys()) + + def test_as_datagram_with_source_info(self, sample_packet): + """Test conversion to datagram including source info.""" + datagram = sample_packet.as_datagram(include_source=True) + + # Should preserve data and include source info + assert datagram["user_id"] == 123 + assert datagram["name"] == "Alice" + source_keys = [ + k for k in datagram.keys() if k.startswith(constants.SOURCE_PREFIX) + ] + assert len(source_keys) > 0 + + +class TestDictPacketDataOperations: + """Test data operations specific to packets.""" + + @pytest.fixture + def sample_packet(self): + """Create a sample packet for testing.""" + data = {"user_id": 123, "name": "Alice", "score": 85.5} + source_info = { + "user_id": "database", + "name": "user_input", + "score": "calculation", + } + return DictPacket(data, source_info=source_info) + + def test_rename_preserves_source_info(self, sample_packet): + """Test that rename operation preserves source info mapping.""" + renamed = sample_packet.rename({"user_id": "id", "name": "username"}) + + # Data should be renamed + assert "id" in renamed.keys() + assert "username" in renamed.keys() + assert "user_id" not in renamed.keys() + assert "name" not in renamed.keys() + + # Source info should follow the rename + source_info = renamed.source_info() + assert source_info["id"] == "database" + assert source_info["username"] == "user_input" + assert source_info["score"] == "calculation" + + +class TestDictTagPacketIntegration: + """Test integration between tags, packets, and base functionality.""" + + def test_tag_to_packet_conversion(self): + """Test converting a tag to a packet-like structure.""" + data = {"user_id": 123, "name": "Alice"} + system_tags = {"tag_type": "user", "version": "1.0"} + tag = DictTag(data, system_tags=system_tags) + + # Convert to full dictionary + full_dict = tag.as_dict(include_all_info=True) + + # Should include data, system tags, meta columns, and context + assert "user_id" in full_dict + assert "tag_type" in full_dict + assert constants.CONTEXT_KEY in full_dict + + def test_packet_comprehensive_dict(self): + """Test packet with all information types.""" + data = {"user_id": 123, "name": "Alice", "__meta_field": "meta_value"} + source_info = {"user_id": "database", "name": "user_input"} + + packet = DictPacket(data, source_info=source_info) + + # Get comprehensive dictionary + full_dict = 
packet.as_dict(include_all_info=True) + + # Should include data, source info, meta columns, and context + assert "user_id" in full_dict + assert f"{constants.SOURCE_PREFIX}user_id" in full_dict + assert "__meta_field" in full_dict + assert constants.CONTEXT_KEY in full_dict + + def test_chained_operations_tag(self): + """Test chaining operations on tags.""" + data = {"user_id": 123, "first_name": "Alice", "last_name": "Smith"} + system_tags = {"tag_type": "user"} + + tag = DictTag(data, system_tags=system_tags) + + # Chain operations + result = ( + tag.with_columns(full_name="Alice Smith") + .drop("first_name", "last_name") + .update(user_id=456) + ) + + # Verify final state + assert set(result.keys()) == {"user_id", "full_name"} + assert result["user_id"] == 456 + assert result["full_name"] == "Alice Smith" + + # System tags should be preserved + system_tags = result.system_tags() + assert system_tags["tag_type"] == "user" + + def test_chained_operations_packet(self): + """Test chaining operations on packets.""" + data = {"user_id": 123, "first_name": "Alice", "last_name": "Smith"} + source_info = {"user_id": "database", "first_name": "form", "last_name": "form"} + + packet = DictPacket(data, source_info=source_info) + + # Chain operations + result = ( + packet.with_columns(full_name="Alice Smith") + .drop("first_name", "last_name") + .update(user_id=456) + .with_source_info(full_name="calculated") + ) + + # Verify final state + assert set(result.keys()) == {"user_id", "full_name"} + assert result["user_id"] == 456 + assert result["full_name"] == "Alice Smith" + + # Source info should be updated + source_info = result.source_info() + assert source_info["user_id"] == "database" + assert source_info["full_name"] == "calculated" + + def test_copy_operations(self): + """Test copy operations preserve all information.""" + # Test tag copy + tag_data = {"user_id": 123, "name": "Alice"} + system_tags = {"tag_type": "user"} + tag = DictTag(tag_data, system_tags=system_tags) + + tag_copy = tag.copy() + assert tag_copy is not tag + assert tag_copy["user_id"] == tag["user_id"] + assert tag_copy.system_tags() == tag.system_tags() + + # Test packet copy + packet_data = {"user_id": 123, "name": "Alice"} + source_info = {"user_id": "database"} + packet = DictPacket(packet_data, source_info=source_info) + + packet_copy = packet.copy() + assert packet_copy is not packet + assert packet_copy["user_id"] == packet["user_id"] + assert packet_copy.source_info() == packet.source_info() + + +class TestDictTagPacketEdgeCases: + """Test edge cases and error conditions.""" + + def test_tag_empty_system_tags(self): + """Test tag with empty system tags.""" + data = {"user_id": 123, "name": "Alice"} + tag = DictTag(data, system_tags={}) + + assert tag["user_id"] == 123 + assert tag.system_tags() == {} + + def test_packet_empty_source_info(self): + """Test packet with empty source info.""" + data = {"user_id": 123, "name": "Alice"} + packet = DictPacket(data, source_info={}) + + assert packet["user_id"] == 123 + source_info = packet.source_info() + assert all(v is None for v in source_info.values()) + + def test_tag_none_system_tags(self): + """Test tag with None system tags.""" + data = {"user_id": 123, "name": "Alice"} + tag = DictTag(data, system_tags=None) + + assert tag["user_id"] == 123 + assert tag.system_tags() == {} + + def test_packet_none_source_info(self): + """Test packet with None source info.""" + data = {"user_id": 123, "name": "Alice"} + packet = DictPacket(data, source_info=None) + + assert 
packet["user_id"] == 123 + source_info = packet.source_info() + assert all(v is None for v in source_info.values()) + + def test_tag_with_meta_and_system_tags(self): + """Test tag with both meta columns and system tags.""" + data = {"user_id": 123, "name": "Alice", "__meta_field": "meta_value"} + system_tags = {"tag_type": "user"} + + tag = DictTag(data, system_tags=system_tags) + + # All information should be accessible + full_dict = tag.as_dict(include_all_info=True) + assert "user_id" in full_dict + assert "__meta_field" in full_dict + assert "tag_type" in full_dict + assert constants.CONTEXT_KEY in full_dict + + def test_packet_with_meta_and_source_info(self): + """Test packet with both meta columns and source info.""" + data = {"user_id": 123, "name": "Alice", "__meta_field": "meta_value"} + source_info = {"user_id": "database"} + + packet = DictPacket(data, source_info=source_info) + + # All information should be accessible + full_dict = packet.as_dict(include_all_info=True) + assert "user_id" in full_dict + assert "__meta_field" in full_dict + assert f"{constants.SOURCE_PREFIX}user_id" in full_dict + assert constants.CONTEXT_KEY in full_dict From 166ef8a36b49f6f08000a0d24d0e2e67cbcfcde9 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 12 Aug 2025 20:56:04 +0000 Subject: [PATCH 188/224] fix: bugs in datagrams alternation methods --- src/orcapod/data/datagrams/arrow_datagram.py | 18 ++++- .../data/datagrams/arrow_tag_packet.py | 72 +++++++++++++++++-- 2 files changed, 81 insertions(+), 9 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 1f94e2a..f77dd89 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -558,8 +558,14 @@ def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: f"Following meta columns do not exist and cannot be dropped: {sorted(missing_keys)}" ) + # Only drop columns that actually exist + existing_keys = prefixed_keys - missing_keys + new_datagram = self.copy(include_cache=False) - new_datagram._meta_table = self._meta_table.drop_columns(list(prefixed_keys)) + if existing_keys: # Only drop if there are existing columns to drop + new_datagram._meta_table = self._meta_table.drop_columns( + list(existing_keys) + ) return new_datagram @@ -603,10 +609,16 @@ def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: raise KeyError( f"Following columns do not exist and cannot be dropped: {sorted(missing)}" ) - column_names = tuple(c for c in column_names if self._data_table.columns) + # Only keep columns that actually exist + existing_columns = tuple( + c for c in column_names if c in self._data_table.column_names + ) new_datagram = self.copy(include_cache=False) - new_datagram._data_table = self._data_table.drop_columns(list(column_names)) + if existing_columns: # Only drop if there are existing columns to drop + new_datagram._data_table = self._data_table.drop_columns( + list(existing_columns) + ) # TODO: consider dropping extra semantic columns if they are no longer needed return new_datagram diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index c6ee52e..71e8104 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -59,10 +59,10 @@ def __init__( self._data_table.select(extracted_system_tag_columns) )[0] ) + self._system_tags_dict.update(system_tags or {}) 
self._system_tags_python_schema = infer_schema_from_pylist_data( [self._system_tags_dict] ) - self._system_tags_dict.update(system_tags or {}) self._system_tags_table = ( self._data_context.type_converter.python_dicts_to_arrow_table( [self._system_tags_dict], python_schema=self._system_tags_python_schema @@ -171,8 +171,8 @@ def as_table( ) if ( include_all_info or include_system_tags - ) and self._system_tags_table.num_rows > 0: - # add system_tags only for existing data columns + ) and self._system_tags_table.num_columns > 0: + # add system_tags only if there are actual system tag columns table = arrow_utils.hstack_tables(table, self._system_tags_table) return table @@ -388,8 +388,12 @@ def as_table( include_context=include_context, ) if include_all_info or include_source: - # add source_info only for existing data columns - table = arrow_utils.hstack_tables(table, self._source_info_table) + # add source_info only if there are columns and the table has meaningful data + if ( + self._source_info_table.num_columns > 0 + and self._source_info_table.num_rows > 0 + ): + table = arrow_utils.hstack_tables(table, self._source_info_table) return table def as_datagram( @@ -466,7 +470,7 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: new_names = [column_mapping.get(k, k) for k in self._data_table.column_names] new_source_info_names = [ - f"{constants.SOURCE_PREFIX}{column_mapping.get(k.removeprefix(constants.SOURCE_PREFIX), k)}" + f"{constants.SOURCE_PREFIX}{column_mapping.get(k.removeprefix(constants.SOURCE_PREFIX), k.removeprefix(constants.SOURCE_PREFIX))}" for k in self._source_info_table.column_names ] @@ -478,6 +482,62 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: return new_datagram + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """ + Create a new ArrowPacket with new data columns added. + Maintains immutability by returning a new instance. + Also adds corresponding empty source info columns for new columns. + + Args: + column_types: Optional type specifications for new columns + **updates: New data columns as keyword arguments + + Returns: + New ArrowPacket instance with new data columns and corresponding source info columns + + Raises: + ValueError: If any column already exists (use update() instead) + """ + if not updates: + return self + + # First call parent method to add the data columns + new_packet = super().with_columns(column_types=column_types, **updates) + + # Now add corresponding empty source info columns for the new columns + source_info_updates = {} + for column_name in updates.keys(): + source_key = f"{constants.SOURCE_PREFIX}{column_name}" + source_info_updates[source_key] = None # Empty source info + + # Add new source info columns to the source info table + if source_info_updates: + # Get existing source info + schema = new_packet._source_info_table.schema + existing_source_info = new_packet._source_info_table.to_pylist()[0] + + # Add the new empty source info columns + existing_source_info.update(source_info_updates) + schema_columns = list(schema) + schema_columns.extend( + [ + pa.field(name, pa.large_string()) + for name in source_info_updates.keys() + ] + ) + new_schema = pa.schema(schema_columns) + + # Update the source info table + new_packet._source_info_table = pa.Table.from_pylist( + [existing_source_info], new_schema + ) + + return new_packet + # 8. 
Utility Operations def copy(self, include_cache: bool = True) -> Self: """Return a copy of the datagram.""" From 197f3e1501344ee24e068e2d45c2a444e7026f9f Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 12 Aug 2025 20:59:50 +0000 Subject: [PATCH 189/224] fix: corner cases of system tag handling --- src/orcapod/data/datagrams/dict_datagram.py | 25 ++++++------------- src/orcapod/data/datagrams/dict_tag_packet.py | 24 ++++++++++-------- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 00ec527..103a742 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -629,14 +629,9 @@ def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: if not new_data: raise ValueError("Cannot drop all data columns") - # Reconstruct full data dict for new instance - full_data = new_data # Filtered user data - full_data.update(self._meta_data) # Keep existing meta data - - return self.__class__( - data=full_data, - data_context=self._data_context, - ) + new_datagram = self.copy(include_cache=False) + new_datagram._data = new_data + return new_datagram def rename(self, column_mapping: Mapping[str, str]) -> Self: """ @@ -681,7 +676,7 @@ def rename(self, column_mapping: Mapping[str, str]) -> Self: def update(self, **updates: DataValue) -> Self: """ Create a new DictDatagram with existing column values updated. - Maintains immutability by returning a new instance. + Maintains immutability by returning a new instance if any values are changed. Args: **updates: Column names and their new values (columns must exist) @@ -707,15 +702,9 @@ def update(self, **updates: DataValue) -> Self: new_data = dict(self._data) new_data.update(updates) - # Reconstruct full data dict for new instance - full_data = new_data # Updated user data - full_data.update(self._meta_data) # Keep existing meta data - - # TODO: transfer over python schema - return self.__class__( - data=full_data, - data_context=self._data_context, - ) + new_datagram = self.copy(include_cache=False) + new_datagram._data = new_data + return new_datagram def with_columns( self, diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index 84f9c65..e71dbac 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -81,14 +81,16 @@ def as_table( ) if include_all_info or include_system_tags: - if self._cached_system_tags_table is None: - self._cached_system_tags_table = ( - self._data_context.type_converter.python_dicts_to_arrow_table( - [self._system_tags], - python_schema=self._system_tags_python_schema, + # Only create and stack system tags table if there are actually system tags + if self._system_tags: # Check if system tags dict is not empty + if self._cached_system_tags_table is None: + self._cached_system_tags_table = ( + self._data_context.type_converter.python_dicts_to_arrow_table( + [self._system_tags], + python_schema=self._system_tags_python_schema, + ) ) - ) - table = arrow_utils.hstack_tables(table, self._cached_system_tags_table) + table = arrow_utils.hstack_tables(table, self._cached_system_tags_table) return table def as_dict( @@ -515,10 +517,10 @@ def with_source_info(self, **source_info: str | None) -> Self: current_source_info = self._source_info.copy() for key, value in source_info.items(): - if not key.startswith(constants.SOURCE_PREFIX): - key = 
f"{constants.SOURCE_PREFIX}{key}" - if key in current_source_info: - current_source_info[key] = value + # Remove prefix if it exists, since _source_info stores unprefixed keys + if key.startswith(constants.SOURCE_PREFIX): + key = key.removeprefix(constants.SOURCE_PREFIX) + current_source_info[key] = value new_packet = self.copy(include_cache=False) new_packet._source_info = current_source_info From 144ef6a4825fa079afb64302bfb75077bac92387 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 13 Aug 2025 23:36:01 +0000 Subject: [PATCH 190/224] test: add system tag persistence tests --- .gitignore | 3 + pixi.lock | 3477 +++++++++++++++++ pyproject.toml | 21 +- src/orcapod/data/datagrams/arrow_datagram.py | 33 +- src/orcapod/protocols/data_protocols.py | 16 +- .../semantic_types/universal_converter.py | 129 +- src/orcapod/utils/arrow_utils.py | 50 + .../test_datagrams/test_arrow_datagram.py | 311 +- .../test_datagrams/test_arrow_tag_packet.py | 230 +- .../test_datagrams/test_base_integration.py | 6 +- uv.lock | 33 +- 11 files changed, 4087 insertions(+), 222 deletions(-) create mode 100644 pixi.lock diff --git a/.gitignore b/.gitignore index 81e31ee..1e38613 100644 --- a/.gitignore +++ b/.gitignore @@ -210,3 +210,6 @@ cython_debug/ dj_*_conf.json # directory excluded from source control e.g. trash, scratch work, etc. .untracked +# pixi environments +.pixi/* +!.pixi/config.toml diff --git a/pixi.lock b/pixi.lock new file mode 100644 index 0000000..74e1220 --- /dev/null +++ b/pixi.lock @@ -0,0 +1,3477 @@ +version: 6 +environments: + all: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - pypi: https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl + - pypi: ./ + default: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + 
- pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: ./ + dev: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - pypi: https://files.pythonhosted.org/packages/cb/ed/d1bf75c089857d38332cf45416e419b47382b345ba5dfc4fae69397830d9/adlfs-2024.12.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e2/68/b29577197aa2e54b50d6f214524790cc1cb27d289585ad7c7bdfe5125285/aiobotocore-2.24.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/85/13/58b70a580de00893223d61de8fea167877a3aed97d4a5e1405c9159ef925/aioitertools-0.12.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4f/d3/a8b22fa575b297cd6e3e3b0155c7e25db170edf1c74783d6a31a2490b8d9/argon2_cffi-25.1.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/09/52/94108adfdd6e2ddf58be64f959a0b9c7d4ef2fa71086c38356d22dc501ea/argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d4/78/bf94897361fdd650850f0f2e405b2293e2f12808239046232bdedf554301/azure_core-1.35.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/88/2a/75f56b14f115189155cf12e46b366ad1fe3357af5a1a7c09f7446662d617/azure_datalake_store-0.0.53-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a9/74/17428cb429e8d52f6d0d69ed685f4760a545cb0156594963a9337b53b6c9/azure_identity-1.24.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5b/64/63dbfdd83b31200ac58820a7951ddfdeed1fbee9285b0f3eae12d1357155/azure_storage_blob-12.26.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/72/66/88566a6484e746c0b075f7c9bb248e8548eda0a486de4460d150a41e2d57/boto3-1.39.11-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1c/2c/8a0b02d60a1dbbae7faa5af30484b016aa3023f9833dfc0d19b0b770dd6a/botocore-1.39.11-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/ea/2f/6ae1db51dc34db499bfe340e89f79a63bd115fc32513a7bacdf17d33cd86/coverage-7.10.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/79/62/120842ab20d9150a9d3a6bdc07fe2870384e82f5266d41c53b08a3a96b34/cryptography-45.0.6-cp311-abi3-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/4d/a0/c95baae08a75bceabb79868d663a0736655e427ab9c81fb848da29edaeac/debugpy-1.8.16-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/21/f5/54bccbee01efbc25581db6aafefb6f6c277d880930f7a083b10052382463/gcsfs-2025.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/41/9d/2814a2c47429dc2e197e176de25a946d4538422b081ade8638e585e4006f/google_cloud_storage-3.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/03/b6/39bcf01e1185882f34bc9fb77d1fb4a27911a55f60ab407de34abc8a2347/httpie-3.2.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fc/c7/b445faca8deb954fe536abebff4ece5b097b923de482b26e78448c89d1dd/ipykernel-6.30.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/96/2b/34cc11786bc00d0f04d0f5fdc3a2b1ae0b6239eef72d3d345805f9ad92a1/markdown-3.8.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/89/a3/00260f8df72b51afa1f182dd609533c77fa2407918c4c2813d87b4a56725/minio-7.2.16-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/34/82/fc5ce89006389a6426ef28e326fc065b0fbaaed230373b62d14c889f47ea/mmh3-5.2.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/86/5b/fbc73e91f7727ae1e79b21ed833308e99dc11cc1cd3d4717f579775de5e9/msal-1.33.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5e/75/bd9b7bb966668920f06b200e84454c8f3566b102183bc55c5473d96cb2b9/msal_extensions-1.3.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/ca/a1/d0c333111d801c77a83a32f793222c4b9aef7de0fdb2ceb73a1980a6c98b/pyarrow_stubs-20.0.0.20250716-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5f/e9/a09476d436d0ff1402ac3867d933c61805ec2326c6ea557aeeac3825604e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/bd/6a/6c1ac381ff0b8e03a9abc2f05722f6002d7452a2c05118697b3f3910e171/pyiceberg-0.9.1.tar.gz + - pypi: https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/bc/16/4ea354101abb1287856baa4af2732be351c7bee728065aed451b678153fd/pytest_cov-6.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7e/0a/2356305c423a975000867de56888b79e44ec2192c690ff93c3109fd78081/pyzmq-27.0.1-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e8/67/0c3c9179a3ad19791ef1b8f7138aa27d4578c78700551c60d9260b2c660d/ruff-0.12.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/ff/c7/30d13b7fd4f866ca3f30e9a6e7ae038f0c45226f6e26b3cc98d6d197f93b/s3fs-2025.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/6d/4f/d073e09df851cfa251ef7840007d04db3293a0482ce607d2b993926089be/s3transfer-0.13.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/96/7c/a81ef5ef10978dd073a854e0fa93b5d8021d0594b639cc8f6453c3c78a1d/strictyaml-1.7.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/f9/41/fb15f06e33d7430ca89420283a8762a4e6b8025b800ea51796ab5e6d9559/tornado-6.5.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl + - pypi: ./ + ray: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - 
conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - pypi: https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: 
https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl + - pypi: ./ + redis: + channels: + - url: https://conda.anaconda.org/conda-forge/ + indexes: + - https://pypi.org/simple + packages: + linux-64: + - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl + - pypi: 
https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + - pypi: ./ +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 + sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 + md5: d7c89558ba9fa0495403155b64376d81 + license: None + purls: [] + size: 2562 + timestamp: 1578324546067 +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 + build_number: 16 + sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22 + md5: 73aaf86a425cc6e73fcf236a5a46396d + depends: + - _libgcc_mutex 0.1 conda_forge + - libgomp >=7.5.0 + constrains: + - openmp_impl 9999 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 23621 + timestamp: 1650670423406 +- pypi: https://files.pythonhosted.org/packages/cb/ed/d1bf75c089857d38332cf45416e419b47382b345ba5dfc4fae69397830d9/adlfs-2024.12.0-py3-none-any.whl + name: adlfs + version: 2024.12.0 + sha256: 00aab061ddec0413b2039487e656b62e01ece8ef1ca0493f76034a596cf069e3 + requires_dist: + - azure-core>=1.28.0,<2.0.0 + - azure-datalake-store>=0.0.53,<0.1 + - azure-identity + - azure-storage-blob>=12.17.0 + - fsspec>=2023.12.0 + - aiohttp>=3.7.0 + - sphinx ; extra == 'docs' + - myst-parser ; extra == 'docs' + - furo ; extra == 'docs' + - numpydoc ; extra == 'docs' + - pytest ; extra == 'tests' + - 
docker ; extra == 'tests' + - pytest-mock ; extra == 'tests' + - arrow ; extra == 'tests' + - dask[dataframe] ; extra == 'tests' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/e2/68/b29577197aa2e54b50d6f214524790cc1cb27d289585ad7c7bdfe5125285/aiobotocore-2.24.0-py3-none-any.whl + name: aiobotocore + version: 2.24.0 + sha256: 72bb1f8eb1b962779a95e1bcc9cf35bc33196ad763b622a40ae7fa9d2e95c87c + requires_dist: + - aiohttp>=3.9.2,<4.0.0 + - aioitertools>=0.5.1,<1.0.0 + - botocore>=1.39.9,<1.39.12 + - python-dateutil>=2.1,<3.0.0 + - jmespath>=0.7.1,<2.0.0 + - multidict>=6.0.0,<7.0.0 + - wrapt>=1.10.10,<2.0.0 + - awscli>=1.41.9,<1.41.12 ; extra == 'awscli' + - boto3>=1.39.9,<1.39.12 ; extra == 'boto3' + - httpx>=0.25.1,<0.29 ; extra == 'httpx' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl + name: aiohappyeyeballs + version: 2.6.1 + sha256: f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: aiohttp + version: 3.12.15 + sha256: 5346b93e62ab51ee2a9d68e8f73c7cf96ffb73568a23e683f931e52450e4148d + requires_dist: + - aiohappyeyeballs>=2.5.0 + - aiosignal>=1.4.0 + - async-timeout>=4.0,<6.0 ; python_full_version < '3.11' + - attrs>=17.3.0 + - frozenlist>=1.1.1 + - multidict>=4.5,<7.0 + - propcache>=0.2.0 + - yarl>=1.17.0,<2.0 + - aiodns>=3.3.0 ; extra == 'speedups' + - brotli ; platform_python_implementation == 'CPython' and extra == 'speedups' + - brotlicffi ; platform_python_implementation != 'CPython' and extra == 'speedups' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl + name: aiohttp-cors + version: 0.8.1 + sha256: 3180cf304c5c712d626b9162b195b1db7ddf976a2a25172b35bb2448b890a80d + requires_dist: + - aiohttp>=3.9 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/85/13/58b70a580de00893223d61de8fea167877a3aed97d4a5e1405c9159ef925/aioitertools-0.12.0-py3-none-any.whl + name: aioitertools + version: 0.12.0 + sha256: fc1f5fac3d737354de8831cbba3eb04f79dd649d8f3afb4c5b114925e662a796 + requires_dist: + - typing-extensions>=4.0 ; python_full_version < '3.10' + - attribution==1.8.0 ; extra == 'dev' + - black==24.8.0 ; extra == 'dev' + - build>=1.2 ; extra == 'dev' + - coverage==7.6.1 ; extra == 'dev' + - flake8==7.1.1 ; extra == 'dev' + - flit==3.9.0 ; extra == 'dev' + - mypy==1.11.2 ; extra == 'dev' + - usort==1.0.8.post1 ; extra == 'dev' + - ufmt==2.7.1 ; extra == 'dev' + - sphinx==8.0.2 ; extra == 'docs' + - sphinx-mdinclude==0.6.2 ; extra == 'docs' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl + name: aiosignal + version: 1.4.0 + sha256: 053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e + requires_dist: + - frozenlist>=1.1.0 + - typing-extensions>=4.2 ; python_full_version < '3.13' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl + name: annotated-types + version: 0.7.0 + sha256: 
1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53 + requires_dist: + - typing-extensions>=4.0.0 ; python_full_version < '3.9' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/4f/d3/a8b22fa575b297cd6e3e3b0155c7e25db170edf1c74783d6a31a2490b8d9/argon2_cffi-25.1.0-py3-none-any.whl + name: argon2-cffi + version: 25.1.0 + sha256: fdc8b074db390fccb6eb4a3604ae7231f219aa669a2652e0f20e16ba513d5741 + requires_dist: + - argon2-cffi-bindings + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/09/52/94108adfdd6e2ddf58be64f959a0b9c7d4ef2fa71086c38356d22dc501ea/argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl + name: argon2-cffi-bindings + version: 25.1.0 + sha256: d3e924cfc503018a714f94a49a149fdc0b644eaead5d1f089330399134fa028a + requires_dist: + - cffi>=1.0.1 ; python_full_version < '3.14' + - cffi>=2.0.0b1 ; python_full_version >= '3.14' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: arro3-core + version: 0.5.1 + sha256: c4876a3c34bd54d970c498e2f61bfb7e36306934fd6acbfa5de497f093972bf0 + requires_dist: + - typing-extensions ; python_full_version < '3.12' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl + name: asttokens + version: 3.0.0 + sha256: e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2 + requires_dist: + - astroid>=2,<4 ; extra == 'astroid' + - astroid>=2,<4 ; extra == 'test' + - pytest ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-xdist ; extra == 'test' + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda + sha256: 7304f265f146235c34e24db310a94648aa306ca0b2a4a12042bf96da1881f99c + md5: d3f195dfdbbf736e4ec178bbec2a975c + depends: + - python >=3.9 + - six >=1.6.1,<2.0 + license: BSD-3-Clause AND PSF-2.0 + purls: + - pkg:pypi/astunparse?source=hash-mapping + size: 18143 + timestamp: 1736248194225 +- pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl + name: attrs + version: 25.3.0 + sha256: 427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3 + requires_dist: + - cloudpickle ; platform_python_implementation == 'CPython' and extra == 'benchmark' + - hypothesis ; extra == 'benchmark' + - mypy>=1.11.1 ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'benchmark' + - pympler ; extra == 'benchmark' + - pytest-codspeed ; extra == 'benchmark' + - pytest-mypy-plugins ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'benchmark' + - pytest-xdist[psutil] ; extra == 'benchmark' + - pytest>=4.3.0 ; extra == 'benchmark' + - cloudpickle ; platform_python_implementation == 'CPython' and extra == 'cov' + - coverage[toml]>=5.3 ; extra == 'cov' + - hypothesis ; extra == 'cov' + - mypy>=1.11.1 ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'cov' + - pympler ; extra == 'cov' + - pytest-mypy-plugins ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'cov' + - pytest-xdist[psutil] ; extra == 'cov' + - pytest>=4.3.0 ; extra == 'cov' + - 
cloudpickle ; platform_python_implementation == 'CPython' and extra == 'dev' + - hypothesis ; extra == 'dev' + - mypy>=1.11.1 ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'dev' + - pre-commit-uv ; extra == 'dev' + - pympler ; extra == 'dev' + - pytest-mypy-plugins ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'dev' + - pytest-xdist[psutil] ; extra == 'dev' + - pytest>=4.3.0 ; extra == 'dev' + - cogapp ; extra == 'docs' + - furo ; extra == 'docs' + - myst-parser ; extra == 'docs' + - sphinx ; extra == 'docs' + - sphinx-notfound-page ; extra == 'docs' + - sphinxcontrib-towncrier ; extra == 'docs' + - towncrier ; extra == 'docs' + - cloudpickle ; platform_python_implementation == 'CPython' and extra == 'tests' + - hypothesis ; extra == 'tests' + - mypy>=1.11.1 ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'tests' + - pympler ; extra == 'tests' + - pytest-mypy-plugins ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'tests' + - pytest-xdist[psutil] ; extra == 'tests' + - pytest>=4.3.0 ; extra == 'tests' + - mypy>=1.11.1 ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'tests-mypy' + - pytest-mypy-plugins ; python_full_version >= '3.10' and platform_python_implementation == 'CPython' and extra == 'tests-mypy' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/d4/78/bf94897361fdd650850f0f2e405b2293e2f12808239046232bdedf554301/azure_core-1.35.0-py3-none-any.whl + name: azure-core + version: 1.35.0 + sha256: 8db78c72868a58f3de8991eb4d22c4d368fae226dac1002998d6c50437e7dad1 + requires_dist: + - requests>=2.21.0 + - six>=1.11.0 + - typing-extensions>=4.6.0 + - aiohttp>=3.0 ; extra == 'aio' + - opentelemetry-api~=1.26 ; extra == 'tracing' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/88/2a/75f56b14f115189155cf12e46b366ad1fe3357af5a1a7c09f7446662d617/azure_datalake_store-0.0.53-py2.py3-none-any.whl + name: azure-datalake-store + version: 0.0.53 + sha256: a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b + requires_dist: + - cffi + - msal>=1.16.0,<2 + - requests>=2.20.0 + - azure-nspkg ; python_full_version < '3' + - pathlib2 ; python_full_version < '3.4' + - futures ; python_full_version < '2.8' +- pypi: https://files.pythonhosted.org/packages/a9/74/17428cb429e8d52f6d0d69ed685f4760a545cb0156594963a9337b53b6c9/azure_identity-1.24.0-py3-none-any.whl + name: azure-identity + version: 1.24.0 + sha256: 9e04997cde0ab02ed66422c74748548e620b7b29361c72ce622acab0267ff7c4 + requires_dist: + - azure-core>=1.31.0 + - cryptography>=2.5 + - msal>=1.30.0 + - msal-extensions>=1.2.0 + - typing-extensions>=4.0.0 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/5b/64/63dbfdd83b31200ac58820a7951ddfdeed1fbee9285b0f3eae12d1357155/azure_storage_blob-12.26.0-py3-none-any.whl + name: azure-storage-blob + version: 12.26.0 + sha256: 8c5631b8b22b4f53ec5fff2f3bededf34cfef111e2af613ad42c9e6de00a77fe + requires_dist: + - azure-core>=1.30.0 + - cryptography>=2.1.4 + - typing-extensions>=4.6.0 + - isodate>=0.6.1 + - azure-core[aio]>=1.30.0 ; extra == 'aio' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl + name: beartype + version: 0.21.0 + sha256: 
b6a1bd56c72f31b0a496a36cc55df6e2f475db166ad07fa4acc7e74f4c7f34c0 + requires_dist: + - autoapi>=0.9.0 ; extra == 'dev' + - click ; extra == 'dev' + - coverage>=5.5 ; extra == 'dev' + - equinox ; sys_platform == 'linux' and extra == 'dev' + - jax[cpu] ; sys_platform == 'linux' and extra == 'dev' + - jaxtyping ; sys_platform == 'linux' and extra == 'dev' + - langchain ; extra == 'dev' + - mypy>=0.800 ; platform_python_implementation != 'PyPy' and extra == 'dev' + - nuitka>=1.2.6 ; sys_platform == 'linux' and extra == 'dev' + - numba ; python_full_version < '3.13' and extra == 'dev' + - numpy ; platform_python_implementation != 'PyPy' and sys_platform != 'darwin' and extra == 'dev' + - pandera ; extra == 'dev' + - pydata-sphinx-theme<=0.7.2 ; extra == 'dev' + - pygments ; extra == 'dev' + - pyright>=1.1.370 ; extra == 'dev' + - pytest>=4.0.0 ; extra == 'dev' + - rich-click ; extra == 'dev' + - sphinx ; extra == 'dev' + - sphinx>=4.2.0,<6.0.0 ; extra == 'dev' + - sphinxext-opengraph>=0.7.5 ; extra == 'dev' + - sqlalchemy ; extra == 'dev' + - tox>=3.20.1 ; extra == 'dev' + - typing-extensions>=3.10.0.0 ; extra == 'dev' + - xarray ; extra == 'dev' + - autoapi>=0.9.0 ; extra == 'doc-rtd' + - pydata-sphinx-theme<=0.7.2 ; extra == 'doc-rtd' + - sphinx>=4.2.0,<6.0.0 ; extra == 'doc-rtd' + - sphinxext-opengraph>=0.7.5 ; extra == 'doc-rtd' + - click ; extra == 'test' + - coverage>=5.5 ; extra == 'test' + - equinox ; sys_platform == 'linux' and extra == 'test' + - jax[cpu] ; sys_platform == 'linux' and extra == 'test' + - jaxtyping ; sys_platform == 'linux' and extra == 'test' + - langchain ; extra == 'test' + - mypy>=0.800 ; platform_python_implementation != 'PyPy' and extra == 'test' + - nuitka>=1.2.6 ; sys_platform == 'linux' and extra == 'test' + - numba ; python_full_version < '3.13' and extra == 'test' + - numpy ; platform_python_implementation != 'PyPy' and sys_platform != 'darwin' and extra == 'test' + - pandera ; extra == 'test' + - pygments ; extra == 'test' + - pyright>=1.1.370 ; extra == 'test' + - pytest>=4.0.0 ; extra == 'test' + - rich-click ; extra == 'test' + - sphinx ; extra == 'test' + - sqlalchemy ; extra == 'test' + - tox>=3.20.1 ; extra == 'test' + - typing-extensions>=3.10.0.0 ; extra == 'test' + - xarray ; extra == 'test' + - click ; extra == 'test-tox' + - equinox ; sys_platform == 'linux' and extra == 'test-tox' + - jax[cpu] ; sys_platform == 'linux' and extra == 'test-tox' + - jaxtyping ; sys_platform == 'linux' and extra == 'test-tox' + - langchain ; extra == 'test-tox' + - mypy>=0.800 ; platform_python_implementation != 'PyPy' and extra == 'test-tox' + - nuitka>=1.2.6 ; sys_platform == 'linux' and extra == 'test-tox' + - numba ; python_full_version < '3.13' and extra == 'test-tox' + - numpy ; platform_python_implementation != 'PyPy' and sys_platform != 'darwin' and extra == 'test-tox' + - pandera ; extra == 'test-tox' + - pygments ; extra == 'test-tox' + - pyright>=1.1.370 ; extra == 'test-tox' + - pytest>=4.0.0 ; extra == 'test-tox' + - rich-click ; extra == 'test-tox' + - sphinx ; extra == 'test-tox' + - sqlalchemy ; extra == 'test-tox' + - typing-extensions>=3.10.0.0 ; extra == 'test-tox' + - xarray ; extra == 'test-tox' + - coverage>=5.5 ; extra == 'test-tox-coverage' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/72/66/88566a6484e746c0b075f7c9bb248e8548eda0a486de4460d150a41e2d57/boto3-1.39.11-py3-none-any.whl + name: boto3 + version: 1.39.11 + sha256: af8f1dad35eceff7658fab43b39b0f55892b6e3dd12308733521cc24dd2c9a02 + requires_dist: + - 
botocore>=1.39.11,<1.40.0 + - jmespath>=0.7.1,<2.0.0 + - s3transfer>=0.13.0,<0.14.0 + - botocore[crt]>=1.21.0,<2.0a0 ; extra == 'crt' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/1c/2c/8a0b02d60a1dbbae7faa5af30484b016aa3023f9833dfc0d19b0b770dd6a/botocore-1.39.11-py3-none-any.whl + name: botocore + version: 1.39.11 + sha256: 1545352931a8a186f3e977b1e1a4542d7d434796e274c3c62efd0210b5ea76dc + requires_dist: + - jmespath>=0.7.1,<2.0.0 + - python-dateutil>=2.1,<3.0.0 + - urllib3>=1.25.4,<1.27 ; python_full_version < '3.10' + - urllib3>=1.25.4,!=2.2.0,<3 ; python_full_version >= '3.10' + - awscrt==0.23.8 ; extra == 'crt' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda + sha256: 5ced96500d945fb286c9c838e54fa759aa04a7129c59800f0846b4335cee770d + md5: 62ee74e96c5ebb0af99386de58cf9553 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc-ng >=12 + license: bzip2-1.0.6 + license_family: BSD + purls: [] + size: 252783 + timestamp: 1720974456583 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda + sha256: 837b795a2bb39b75694ba910c13c15fa4998d4bb2a622c214a6a5174b2ae53d1 + md5: 74784ee3d225fc3dca89edb635b4e5cc + depends: + - __unix + license: ISC + purls: [] + size: 154402 + timestamp: 1754210968730 +- pypi: https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl + name: cachetools + version: 5.5.2 + sha256: d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl + name: certifi + version: 2025.8.3 + sha256: f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5 + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: cffi + version: 1.17.1 + sha256: dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd + requires_dist: + - pycparser + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + name: charset-normalizer + version: 3.4.3 + sha256: 416175faf02e4b0810f1f38bcb54682878a4af94059a1cd63b8747244420801f + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl + name: click + version: 8.2.1 + sha256: 61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b + requires_dist: + - colorama ; sys_platform == 'win32' + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl + name: colorful + version: 0.5.7 + sha256: 495dd3a23151a9568cee8a90fc1174c902ad7ef06655f50b6bddf9e80008da69 + requires_dist: + - colorama ; sys_platform == 'win32' +- pypi: https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl + name: comm + version: 0.2.3 + sha256: c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417 + 
requires_dist: + - pytest ; extra == 'test' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + name: contourpy + version: 1.3.3 + sha256: 4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9 + requires_dist: + - numpy>=1.25 + - furo ; extra == 'docs' + - sphinx>=7.2 ; extra == 'docs' + - sphinx-copybutton ; extra == 'docs' + - bokeh ; extra == 'bokeh' + - selenium ; extra == 'bokeh' + - contourpy[bokeh,docs] ; extra == 'mypy' + - bokeh ; extra == 'mypy' + - docutils-stubs ; extra == 'mypy' + - mypy==1.17.0 ; extra == 'mypy' + - types-pillow ; extra == 'mypy' + - contourpy[test-no-images] ; extra == 'test' + - matplotlib ; extra == 'test' + - pillow ; extra == 'test' + - pytest ; extra == 'test-no-images' + - pytest-cov ; extra == 'test-no-images' + - pytest-rerunfailures ; extra == 'test-no-images' + - pytest-xdist ; extra == 'test-no-images' + - wurlitzer ; extra == 'test-no-images' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/ea/2f/6ae1db51dc34db499bfe340e89f79a63bd115fc32513a7bacdf17d33cd86/coverage-7.10.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + name: coverage + version: 7.10.3 + sha256: 913ceddb4289cbba3a310704a424e3fb7aac2bc0c3a23ea473193cb290cf17d4 + requires_dist: + - tomli ; python_full_version <= '3.11' and extra == 'toml' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/79/62/120842ab20d9150a9d3a6bdc07fe2870384e82f5266d41c53b08a3a96b34/cryptography-45.0.6-cp311-abi3-manylinux_2_28_x86_64.whl + name: cryptography + version: 45.0.6 + sha256: 1b7fa6a1c1188c7ee32e47590d16a5a0646270921f8020efc9a511648e1b2e08 + requires_dist: + - cffi>=1.14 ; platform_python_implementation != 'PyPy' + - bcrypt>=3.1.5 ; extra == 'ssh' + - nox>=2024.4.15 ; extra == 'nox' + - nox[uv]>=2024.3.2 ; python_full_version >= '3.8' and extra == 'nox' + - cryptography-vectors==45.0.6 ; extra == 'test' + - pytest>=7.4.0 ; extra == 'test' + - pytest-benchmark>=4.0 ; extra == 'test' + - pytest-cov>=2.10.1 ; extra == 'test' + - pytest-xdist>=3.5.0 ; extra == 'test' + - pretend>=0.7 ; extra == 'test' + - certifi>=2024 ; extra == 'test' + - pytest-randomly ; extra == 'test-randomorder' + - sphinx>=5.3.0 ; extra == 'docs' + - sphinx-rtd-theme>=3.0.0 ; python_full_version >= '3.8' and extra == 'docs' + - sphinx-inline-tabs ; python_full_version >= '3.8' and extra == 'docs' + - pyenchant>=3 ; extra == 'docstest' + - readme-renderer>=30.0 ; extra == 'docstest' + - sphinxcontrib-spelling>=7.3.1 ; extra == 'docstest' + - build>=1.0.0 ; extra == 'sdist' + - ruff>=0.3.6 ; extra == 'pep8test' + - mypy>=1.4 ; extra == 'pep8test' + - check-sdist ; python_full_version >= '3.8' and extra == 'pep8test' + - click>=8.0.1 ; extra == 'pep8test' + requires_python: '>=3.7,!=3.9.0,!=3.9.1' +- pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl + name: cycler + version: 0.12.1 + sha256: 85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30 + requires_dist: + - ipython ; extra == 'docs' + - matplotlib ; extra == 'docs' + - numpydoc ; extra == 'docs' + - sphinx ; extra == 'docs' + - pytest ; extra == 'tests' + - pytest-cov ; extra == 'tests' + - pytest-xdist ; extra == 'tests' + requires_python: '>=3.8' +- pypi: 
https://files.pythonhosted.org/packages/4d/a0/c95baae08a75bceabb79868d663a0736655e427ab9c81fb848da29edaeac/debugpy-1.8.16-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: debugpy + version: 1.8.16 + sha256: bee89e948bc236a5c43c4214ac62d28b29388453f5fd328d739035e205365f0b + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl + name: decorator + version: 5.2.1 + sha256: d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl + name: defusedxml + version: 0.7.1 + sha256: a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*' +- pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: deltalake + version: 1.1.4 + sha256: 7f28480d3a19f93a75687a1a2a4449b3a6b7355243b765e4379f501dcac03eea + requires_dist: + - arro3-core>=0.5.0 + - deprecated>=1.2.18 + - pandas ; extra == 'pandas' + - pyarrow>=16 ; extra == 'pyarrow' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl + name: deprecated + version: 1.2.18 + sha256: bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec + requires_dist: + - wrapt>=1.10,<2 + - tox ; extra == 'dev' + - pytest ; extra == 'dev' + - pytest-cov ; extra == 'dev' + - bump2version<1 ; extra == 'dev' + - setuptools ; python_full_version >= '3.12' and extra == 'dev' + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*' +- pypi: https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl + name: distlib + version: 0.4.0 + sha256: 9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16 +- pypi: https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl + name: executing + version: 2.2.0 + sha256: 11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa + requires_dist: + - asttokens>=2.1.0 ; extra == 'tests' + - ipython ; extra == 'tests' + - pytest ; extra == 'tests' + - coverage ; extra == 'tests' + - coverage-enable-subprocess ; extra == 'tests' + - littleutils ; extra == 'tests' + - rich ; python_full_version >= '3.11' and extra == 'tests' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl + name: filelock + version: 3.18.0 + sha256: c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de + requires_dist: + - furo>=2024.8.6 ; extra == 'docs' + - sphinx-autodoc-typehints>=3 ; extra == 'docs' + - sphinx>=8.1.3 ; extra == 'docs' + - covdefaults>=2.3 ; extra == 'testing' + - coverage>=7.6.10 ; extra == 'testing' + - diff-cover>=9.2.1 ; extra == 'testing' + - pytest-asyncio>=0.25.2 ; extra == 'testing' + - pytest-cov>=6 ; extra == 'testing' + - pytest-mock>=3.14 ; extra == 'testing' + - pytest-timeout>=2.3.1 ; extra == 'testing' + - pytest>=8.3.4 ; extra 
== 'testing' + - virtualenv>=20.28.1 ; extra == 'testing' + - typing-extensions>=4.12.2 ; python_full_version < '3.11' and extra == 'typing' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl + name: fonttools + version: 4.59.0 + sha256: efd7e6660674e234e29937bc1481dceb7e0336bfae75b856b4fb272b5093c5d4 + requires_dist: + - lxml>=4.0 ; extra == 'lxml' + - brotli>=1.0.1 ; platform_python_implementation == 'CPython' and extra == 'woff' + - brotlicffi>=0.8.0 ; platform_python_implementation != 'CPython' and extra == 'woff' + - zopfli>=0.1.4 ; extra == 'woff' + - unicodedata2>=15.1.0 ; python_full_version < '3.13' and extra == 'unicode' + - lz4>=1.7.4.2 ; extra == 'graphite' + - scipy ; platform_python_implementation != 'PyPy' and extra == 'interpolatable' + - munkres ; platform_python_implementation == 'PyPy' and extra == 'interpolatable' + - pycairo ; extra == 'interpolatable' + - matplotlib ; extra == 'plot' + - sympy ; extra == 'symfont' + - xattr ; sys_platform == 'darwin' and extra == 'type1' + - skia-pathops>=0.5.0 ; extra == 'pathops' + - uharfbuzz>=0.23.0 ; extra == 'repacker' + - lxml>=4.0 ; extra == 'all' + - brotli>=1.0.1 ; platform_python_implementation == 'CPython' and extra == 'all' + - brotlicffi>=0.8.0 ; platform_python_implementation != 'CPython' and extra == 'all' + - zopfli>=0.1.4 ; extra == 'all' + - unicodedata2>=15.1.0 ; python_full_version < '3.13' and extra == 'all' + - lz4>=1.7.4.2 ; extra == 'all' + - scipy ; platform_python_implementation != 'PyPy' and extra == 'all' + - munkres ; platform_python_implementation == 'PyPy' and extra == 'all' + - pycairo ; extra == 'all' + - matplotlib ; extra == 'all' + - sympy ; extra == 'all' + - xattr ; sys_platform == 'darwin' and extra == 'all' + - skia-pathops>=0.5.0 ; extra == 'all' + - uharfbuzz>=0.23.0 ; extra == 'all' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: frozenlist + version: 1.7.0 + sha256: 8bd7eb96a675f18aa5c553eb7ddc24a43c8c18f22e1f9925528128c052cdbe00 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl + name: fsspec + version: 2025.7.0 + sha256: 8b012e39f63c7d5f10474de957f3ab793b47b45ae7d39f2fb735f8bbe25c0e21 + requires_dist: + - adlfs ; extra == 'abfs' + - adlfs ; extra == 'adl' + - pyarrow>=1 ; extra == 'arrow' + - dask ; extra == 'dask' + - distributed ; extra == 'dask' + - pre-commit ; extra == 'dev' + - ruff>=0.5 ; extra == 'dev' + - numpydoc ; extra == 'doc' + - sphinx ; extra == 'doc' + - sphinx-design ; extra == 'doc' + - sphinx-rtd-theme ; extra == 'doc' + - yarl ; extra == 'doc' + - dropbox ; extra == 'dropbox' + - dropboxdrivefs ; extra == 'dropbox' + - requests ; extra == 'dropbox' + - adlfs ; extra == 'full' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'full' + - dask ; extra == 'full' + - distributed ; extra == 'full' + - dropbox ; extra == 'full' + - dropboxdrivefs ; extra == 'full' + - fusepy ; extra == 'full' + - gcsfs ; extra == 'full' + - libarchive-c ; extra == 'full' + - ocifs ; extra == 'full' + - panel ; extra == 'full' + - paramiko ; extra == 'full' + 
- pyarrow>=1 ; extra == 'full' + - pygit2 ; extra == 'full' + - requests ; extra == 'full' + - s3fs ; extra == 'full' + - smbprotocol ; extra == 'full' + - tqdm ; extra == 'full' + - fusepy ; extra == 'fuse' + - gcsfs ; extra == 'gcs' + - pygit2 ; extra == 'git' + - requests ; extra == 'github' + - gcsfs ; extra == 'gs' + - panel ; extra == 'gui' + - pyarrow>=1 ; extra == 'hdfs' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'http' + - libarchive-c ; extra == 'libarchive' + - ocifs ; extra == 'oci' + - s3fs ; extra == 's3' + - paramiko ; extra == 'sftp' + - smbprotocol ; extra == 'smb' + - paramiko ; extra == 'ssh' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'test' + - numpy ; extra == 'test' + - pytest ; extra == 'test' + - pytest-asyncio!=0.22.0 ; extra == 'test' + - pytest-benchmark ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-mock ; extra == 'test' + - pytest-recording ; extra == 'test' + - pytest-rerunfailures ; extra == 'test' + - requests ; extra == 'test' + - aiobotocore>=2.5.4,<3.0.0 ; extra == 'test-downstream' + - dask[dataframe,test] ; extra == 'test-downstream' + - moto[server]>4,<5 ; extra == 'test-downstream' + - pytest-timeout ; extra == 'test-downstream' + - xarray ; extra == 'test-downstream' + - adlfs ; extra == 'test-full' + - aiohttp!=4.0.0a0,!=4.0.0a1 ; extra == 'test-full' + - cloudpickle ; extra == 'test-full' + - dask ; extra == 'test-full' + - distributed ; extra == 'test-full' + - dropbox ; extra == 'test-full' + - dropboxdrivefs ; extra == 'test-full' + - fastparquet ; extra == 'test-full' + - fusepy ; extra == 'test-full' + - gcsfs ; extra == 'test-full' + - jinja2 ; extra == 'test-full' + - kerchunk ; extra == 'test-full' + - libarchive-c ; extra == 'test-full' + - lz4 ; extra == 'test-full' + - notebook ; extra == 'test-full' + - numpy ; extra == 'test-full' + - ocifs ; extra == 'test-full' + - pandas ; extra == 'test-full' + - panel ; extra == 'test-full' + - paramiko ; extra == 'test-full' + - pyarrow ; extra == 'test-full' + - pyarrow>=1 ; extra == 'test-full' + - pyftpdlib ; extra == 'test-full' + - pygit2 ; extra == 'test-full' + - pytest ; extra == 'test-full' + - pytest-asyncio!=0.22.0 ; extra == 'test-full' + - pytest-benchmark ; extra == 'test-full' + - pytest-cov ; extra == 'test-full' + - pytest-mock ; extra == 'test-full' + - pytest-recording ; extra == 'test-full' + - pytest-rerunfailures ; extra == 'test-full' + - python-snappy ; extra == 'test-full' + - requests ; extra == 'test-full' + - smbprotocol ; extra == 'test-full' + - tqdm ; extra == 'test-full' + - urllib3 ; extra == 'test-full' + - zarr ; extra == 'test-full' + - zstandard ; python_full_version < '3.14' and extra == 'test-full' + - tqdm ; extra == 'tqdm' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/21/f5/54bccbee01efbc25581db6aafefb6f6c277d880930f7a083b10052382463/gcsfs-2025.7.0-py2.py3-none-any.whl + name: gcsfs + version: 2025.7.0 + sha256: 653503331d58cb02bb34e725d4595d166e93f7f2f3ff88e4c66ef535ae66eae5 + requires_dist: + - aiohttp!=4.0.0a0,!=4.0.0a1 + - decorator>4.1.2 + - fsspec==2025.7.0 + - google-auth>=1.2 + - google-auth-oauthlib + - google-cloud-storage + - requests + - fusepy ; extra == 'gcsfuse' + - crcmod ; extra == 'crc' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl + name: ghp-import + version: 2.1.0 + sha256: 8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619 + requires_dist: + - 
python-dateutil>=2.8.1 + - twine ; extra == 'dev' + - markdown ; extra == 'dev' + - flake8 ; extra == 'dev' + - wheel ; extra == 'dev' +- pypi: https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl + name: google-api-core + version: 2.25.1 + sha256: 8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7 + requires_dist: + - googleapis-common-protos>=1.56.2,<2.0.0 + - protobuf>=3.19.5,!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0 + - proto-plus>=1.22.3,<2.0.0 + - proto-plus>=1.25.0,<2.0.0 ; python_full_version >= '3.13' + - google-auth>=2.14.1,<3.0.0 + - requests>=2.18.0,<3.0.0 + - google-auth[aiohttp]>=2.35.0,<3.0.0 ; extra == 'async-rest' + - grpcio>=1.33.2,<2.0.0 ; extra == 'grpc' + - grpcio>=1.49.1,<2.0.0 ; python_full_version >= '3.11' and extra == 'grpc' + - grpcio-status>=1.33.2,<2.0.0 ; extra == 'grpc' + - grpcio-status>=1.49.1,<2.0.0 ; python_full_version >= '3.11' and extra == 'grpc' + - grpcio-gcp>=0.2.2,<1.0.0 ; extra == 'grpcgcp' + - grpcio-gcp>=0.2.2,<1.0.0 ; extra == 'grpcio-gcp' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl + name: google-auth + version: 2.40.3 + sha256: 1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca + requires_dist: + - cachetools>=2.0.0,<6.0 + - pyasn1-modules>=0.2.1 + - rsa>=3.1.4,<5 + - aiohttp>=3.6.2,<4.0.0 ; extra == 'aiohttp' + - requests>=2.20.0,<3.0.0 ; extra == 'aiohttp' + - cryptography ; extra == 'enterprise-cert' + - pyopenssl ; extra == 'enterprise-cert' + - pyjwt>=2.0 ; extra == 'pyjwt' + - cryptography>=38.0.3 ; extra == 'pyjwt' + - cryptography<39.0.0 ; python_full_version < '3.8' and extra == 'pyjwt' + - pyopenssl>=20.0.0 ; extra == 'pyopenssl' + - cryptography>=38.0.3 ; extra == 'pyopenssl' + - cryptography<39.0.0 ; python_full_version < '3.8' and extra == 'pyopenssl' + - pyu2f>=0.1.5 ; extra == 'reauth' + - requests>=2.20.0,<3.0.0 ; extra == 'requests' + - grpcio ; extra == 'testing' + - flask ; extra == 'testing' + - freezegun ; extra == 'testing' + - mock ; extra == 'testing' + - oauth2client ; extra == 'testing' + - pyjwt>=2.0 ; extra == 'testing' + - cryptography>=38.0.3 ; extra == 'testing' + - pytest ; extra == 'testing' + - pytest-cov ; extra == 'testing' + - pytest-localserver ; extra == 'testing' + - pyopenssl>=20.0.0 ; extra == 'testing' + - pyu2f>=0.1.5 ; extra == 'testing' + - responses ; extra == 'testing' + - urllib3 ; extra == 'testing' + - packaging ; extra == 'testing' + - aiohttp>=3.6.2,<4.0.0 ; extra == 'testing' + - requests>=2.20.0,<3.0.0 ; extra == 'testing' + - aioresponses ; extra == 'testing' + - pytest-asyncio ; extra == 'testing' + - pyopenssl<24.3.0 ; extra == 'testing' + - aiohttp<3.10.0 ; extra == 'testing' + - cryptography<39.0.0 ; python_full_version < '3.8' and extra == 'testing' + - urllib3 ; extra == 'urllib3' + - packaging ; extra == 'urllib3' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl + name: google-auth-oauthlib + version: 1.2.2 + sha256: fd619506f4b3908b5df17b65f39ca8d66ea56986e5472eb5978fd8f3786f00a2 + requires_dist: + - google-auth>=2.15.0 + - requests-oauthlib>=0.7.0 + - click>=6.0.0 ; extra == 'tool' + requires_python: '>=3.6' +- pypi: 
https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl + name: google-cloud-core + version: 2.4.3 + sha256: 5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e + requires_dist: + - google-api-core>=1.31.6,!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0 + - google-auth>=1.25.0,<3.0.dev0 + - importlib-metadata>1.0.0 ; python_full_version < '3.8' + - grpcio>=1.38.0,<2.0.dev0 ; extra == 'grpc' + - grpcio-status>=1.38.0,<2.0.dev0 ; extra == 'grpc' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/41/9d/2814a2c47429dc2e197e176de25a946d4538422b081ade8638e585e4006f/google_cloud_storage-3.3.0-py3-none-any.whl + name: google-cloud-storage + version: 3.3.0 + sha256: 0338ecd6621b3ecacb108f1cf7513ff0d1bca7f1ff4d58e0220b59f3a725ff23 + requires_dist: + - google-auth>=2.26.1,<3.0.0 + - google-api-core>=2.15.0,<3.0.0 + - google-cloud-core>=2.4.2,<3.0.0 + - google-resumable-media>=2.7.2,<3.0.0 + - requests>=2.22.0,<3.0.0 + - google-crc32c>=1.1.3,<2.0.0 + - protobuf>=3.20.2,<7.0.0 ; extra == 'protobuf' + - opentelemetry-api>=1.1.0,<2.0.0 ; extra == 'tracing' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: google-crc32c + version: 1.7.1 + sha256: 6b211ddaf20f7ebeec5c333448582c224a7c90a9d98826fbab82c0ddc11348e6 + requires_dist: + - importlib-resources>=1.3 ; python_full_version < '3.9' and os_name == 'nt' + - pytest ; extra == 'testing' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl + name: google-resumable-media + version: 2.7.2 + sha256: 3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa + requires_dist: + - google-crc32c>=1.0,<2.0.dev0 + - aiohttp>=3.6.2,<4.0.0.dev0 ; extra == 'aiohttp' + - google-auth>=1.22.0,<2.0.dev0 ; extra == 'aiohttp' + - requests>=2.18.0,<3.0.0.dev0 ; extra == 'requests' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl + name: googleapis-common-protos + version: 1.70.0 + sha256: b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8 + requires_dist: + - protobuf>=3.20.2,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<7.0.0 + - grpcio>=1.44.0,<2.0.0 ; extra == 'grpc' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: grpcio + version: 1.74.0 + sha256: e1ea6176d7dfd5b941ea01c2ec34de9531ba494d541fe2057c904e601879f249 + requires_dist: + - grpcio-tools>=1.74.0 ; extra == 'protobuf' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/03/b6/39bcf01e1185882f34bc9fb77d1fb4a27911a55f60ab407de34abc8a2347/httpie-3.2.4-py3-none-any.whl + name: httpie + version: 3.2.4 + sha256: 4bd0435cc4b9bca59501bc65089de96f3e93b393803f32a81951db62050ebf0b + requires_dist: + - pip + - charset-normalizer>=2.0.0 + - defusedxml>=0.6.0 + - requests[socks]>=2.22.0 + - pygments>=2.5.2 + - requests-toolbelt>=0.9.1 + - multidict>=4.7.0 + - setuptools + - rich>=9.10.0 + - importlib-metadata>=1.4.0 ; 
python_full_version < '3.8' + - colorama>=0.2.4 ; sys_platform == 'win32' + - pytest ; extra == 'dev' + - pytest-httpbin>=0.0.6 ; extra == 'dev' + - responses ; extra == 'dev' + - pytest-mock ; extra == 'dev' + - werkzeug<2.1.0 ; extra == 'dev' + - flake8 ; extra == 'dev' + - flake8-comprehensions ; extra == 'dev' + - flake8-deprecated ; extra == 'dev' + - flake8-mutable ; extra == 'dev' + - flake8-tuple ; extra == 'dev' + - pyopenssl ; extra == 'dev' + - pytest-cov ; extra == 'dev' + - pyyaml ; extra == 'dev' + - twine ; extra == 'dev' + - wheel ; extra == 'dev' + - jinja2 ; extra == 'dev' + - pytest ; extra == 'test' + - pytest-httpbin>=0.0.6 ; extra == 'test' + - responses ; extra == 'test' + - pytest-mock ; extra == 'test' + - werkzeug<2.1.0 ; extra == 'test' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl + name: idna + version: '3.10' + sha256: 946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + requires_dist: + - ruff>=0.6.2 ; extra == 'all' + - mypy>=1.11.2 ; extra == 'all' + - pytest>=8.3.2 ; extra == 'all' + - flake8>=7.1.1 ; extra == 'all' + requires_python: '>=3.6' +- pypi: https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl + name: importlib-metadata + version: 8.7.0 + sha256: e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd + requires_dist: + - zipp>=3.20 + - typing-extensions>=3.6.4 ; python_full_version < '3.8' + - pytest>=6,!=8.1.* ; extra == 'test' + - importlib-resources>=1.3 ; python_full_version < '3.9' and extra == 'test' + - packaging ; extra == 'test' + - pyfakefs ; extra == 'test' + - flufl-flake8 ; extra == 'test' + - pytest-perf>=0.9.2 ; extra == 'test' + - jaraco-test>=5.4 ; extra == 'test' + - sphinx>=3.5 ; extra == 'doc' + - jaraco-packaging>=9.3 ; extra == 'doc' + - rst-linker>=1.9 ; extra == 'doc' + - furo ; extra == 'doc' + - sphinx-lint ; extra == 'doc' + - jaraco-tidelift>=1.4 ; extra == 'doc' + - ipython ; extra == 'perf' + - pytest-checkdocs>=2.4 ; extra == 'check' + - pytest-ruff>=0.2.1 ; sys_platform != 'cygwin' and extra == 'check' + - pytest-cov ; extra == 'cover' + - pytest-enabler>=2.2 ; extra == 'enabler' + - pytest-mypy ; extra == 'type' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl + name: iniconfig + version: 2.1.0 + sha256: 9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/fc/c7/b445faca8deb954fe536abebff4ece5b097b923de482b26e78448c89d1dd/ipykernel-6.30.1-py3-none-any.whl + name: ipykernel + version: 6.30.1 + sha256: aa6b9fb93dca949069d8b85b6c79b2518e32ac583ae9c7d37c51d119e18b3fb4 + requires_dist: + - appnope>=0.1.2 ; sys_platform == 'darwin' + - comm>=0.1.1 + - debugpy>=1.6.5 + - ipython>=7.23.1 + - jupyter-client>=8.0.0 + - jupyter-core>=4.12,!=5.0.* + - matplotlib-inline>=0.1 + - nest-asyncio>=1.4 + - packaging>=22 + - psutil>=5.7 + - pyzmq>=25 + - tornado>=6.2 + - traitlets>=5.4.0 + - coverage[toml] ; extra == 'cov' + - matplotlib ; extra == 'cov' + - pytest-cov ; extra == 'cov' + - trio ; extra == 'cov' + - intersphinx-registry ; extra == 'docs' + - myst-parser ; extra == 'docs' + - pydata-sphinx-theme ; extra == 'docs' + - sphinx ; extra == 'docs' + - 
sphinx-autodoc-typehints ; extra == 'docs' + - sphinxcontrib-github-alt ; extra == 'docs' + - sphinxcontrib-spelling ; extra == 'docs' + - trio ; extra == 'docs' + - pyqt5 ; extra == 'pyqt5' + - pyside6 ; extra == 'pyside6' + - flaky ; extra == 'test' + - ipyparallel ; extra == 'test' + - pre-commit ; extra == 'test' + - pytest-asyncio>=0.23.5 ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest>=7.0,<9 ; extra == 'test' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl + name: ipython + version: 9.4.0 + sha256: 25850f025a446d9b359e8d296ba175a36aedd32e83ca9b5060430fe16801f066 + requires_dist: + - colorama ; sys_platform == 'win32' + - decorator + - ipython-pygments-lexers + - jedi>=0.16 + - matplotlib-inline + - pexpect>4.3 ; sys_platform != 'emscripten' and sys_platform != 'win32' + - prompt-toolkit>=3.0.41,<3.1.0 + - pygments>=2.4.0 + - stack-data + - traitlets>=5.13.0 + - typing-extensions>=4.6 ; python_full_version < '3.12' + - black ; extra == 'black' + - docrepr ; extra == 'doc' + - exceptiongroup ; extra == 'doc' + - intersphinx-registry ; extra == 'doc' + - ipykernel ; extra == 'doc' + - ipython[test] ; extra == 'doc' + - matplotlib ; extra == 'doc' + - setuptools>=18.5 ; extra == 'doc' + - sphinx-toml==0.0.4 ; extra == 'doc' + - sphinx-rtd-theme ; extra == 'doc' + - sphinx>=1.3 ; extra == 'doc' + - typing-extensions ; extra == 'doc' + - pytest ; extra == 'test' + - pytest-asyncio<0.22 ; extra == 'test' + - testpath ; extra == 'test' + - packaging ; extra == 'test' + - ipython[test] ; extra == 'test-extra' + - curio ; extra == 'test-extra' + - jupyter-ai ; extra == 'test-extra' + - matplotlib!=3.2.0 ; extra == 'test-extra' + - nbformat ; extra == 'test-extra' + - nbclient ; extra == 'test-extra' + - ipykernel ; extra == 'test-extra' + - numpy>=1.23 ; extra == 'test-extra' + - pandas ; extra == 'test-extra' + - trio ; extra == 'test-extra' + - matplotlib ; extra == 'matplotlib' + - ipython[doc,matplotlib,test,test-extra] ; extra == 'all' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl + name: ipython-pygments-lexers + version: 1.1.1 + sha256: a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c + requires_dist: + - pygments + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl + name: ipywidgets + version: 8.1.7 + sha256: 764f2602d25471c213919b8a1997df04bef869251db4ca8efba1b76b1bd9f7bb + requires_dist: + - comm>=0.1.3 + - ipython>=6.1.0 + - traitlets>=4.3.1 + - widgetsnbextension~=4.0.14 + - jupyterlab-widgets~=3.0.15 + - jsonschema ; extra == 'test' + - ipykernel ; extra == 'test' + - pytest>=3.6.0 ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytz ; extra == 'test' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl + name: isodate + version: 0.7.2 + sha256: 28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15 + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl + name: jedi + 
version: 0.19.2 + sha256: a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9 + requires_dist: + - parso>=0.8.4,<0.9.0 + - jinja2==2.11.3 ; extra == 'docs' + - markupsafe==1.1.1 ; extra == 'docs' + - pygments==2.8.1 ; extra == 'docs' + - alabaster==0.7.12 ; extra == 'docs' + - babel==2.9.1 ; extra == 'docs' + - chardet==4.0.0 ; extra == 'docs' + - commonmark==0.8.1 ; extra == 'docs' + - docutils==0.17.1 ; extra == 'docs' + - future==0.18.2 ; extra == 'docs' + - idna==2.10 ; extra == 'docs' + - imagesize==1.2.0 ; extra == 'docs' + - mock==1.0.1 ; extra == 'docs' + - packaging==20.9 ; extra == 'docs' + - pyparsing==2.4.7 ; extra == 'docs' + - pytz==2021.1 ; extra == 'docs' + - readthedocs-sphinx-ext==2.1.4 ; extra == 'docs' + - recommonmark==0.5.0 ; extra == 'docs' + - requests==2.25.1 ; extra == 'docs' + - six==1.15.0 ; extra == 'docs' + - snowballstemmer==2.1.0 ; extra == 'docs' + - sphinx-rtd-theme==0.4.3 ; extra == 'docs' + - sphinx==1.8.5 ; extra == 'docs' + - sphinxcontrib-serializinghtml==1.1.4 ; extra == 'docs' + - sphinxcontrib-websupport==1.2.4 ; extra == 'docs' + - urllib3==1.26.4 ; extra == 'docs' + - flake8==5.0.4 ; extra == 'qa' + - mypy==0.971 ; extra == 'qa' + - types-setuptools==67.2.0.1 ; extra == 'qa' + - django ; extra == 'testing' + - attrs ; extra == 'testing' + - colorama ; extra == 'testing' + - docopt ; extra == 'testing' + - pytest<9.0.0 ; extra == 'testing' + requires_python: '>=3.6' +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda + sha256: f1ac18b11637ddadc05642e8185a851c7fab5998c6f5470d716812fae943b2af + md5: 446bd6c8cb26050d528881df495ce646 + depends: + - markupsafe >=2.0 + - python >=3.9 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/jinja2?source=hash-mapping + size: 112714 + timestamp: 1741263433881 +- pypi: https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl + name: jmespath + version: 1.0.1 + sha256: 02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980 + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl + name: jsonschema + version: 4.25.0 + sha256: 24c2e8da302de79c8b9382fee3e76b355e44d2a4364bb207159ce10b517bd716 + requires_dist: + - attrs>=22.2.0 + - jsonschema-specifications>=2023.3.6 + - referencing>=0.28.4 + - rpds-py>=0.7.1 + - fqdn ; extra == 'format' + - idna ; extra == 'format' + - isoduration ; extra == 'format' + - jsonpointer>1.13 ; extra == 'format' + - rfc3339-validator ; extra == 'format' + - rfc3987 ; extra == 'format' + - uri-template ; extra == 'format' + - webcolors>=1.11 ; extra == 'format' + - fqdn ; extra == 'format-nongpl' + - idna ; extra == 'format-nongpl' + - isoduration ; extra == 'format-nongpl' + - jsonpointer>1.13 ; extra == 'format-nongpl' + - rfc3339-validator ; extra == 'format-nongpl' + - rfc3986-validator>0.1.0 ; extra == 'format-nongpl' + - rfc3987-syntax>=1.1.0 ; extra == 'format-nongpl' + - uri-template ; extra == 'format-nongpl' + - webcolors>=24.6.0 ; extra == 'format-nongpl' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl + name: jsonschema-specifications + version: 2025.4.1 + sha256: 4653bffbd6584f7de83a67e0d620ef16900b390ddc7939d56684d6c81e33f1af + requires_dist: + - 
referencing>=0.31.0 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl + name: jupyter-client + version: 8.6.3 + sha256: e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f + requires_dist: + - importlib-metadata>=4.8.3 ; python_full_version < '3.10' + - jupyter-core>=4.12,!=5.0.* + - python-dateutil>=2.8.2 + - pyzmq>=23.0 + - tornado>=6.2 + - traitlets>=5.3 + - ipykernel ; extra == 'docs' + - myst-parser ; extra == 'docs' + - pydata-sphinx-theme ; extra == 'docs' + - sphinx-autodoc-typehints ; extra == 'docs' + - sphinx>=4 ; extra == 'docs' + - sphinxcontrib-github-alt ; extra == 'docs' + - sphinxcontrib-spelling ; extra == 'docs' + - coverage ; extra == 'test' + - ipykernel>=6.14 ; extra == 'test' + - mypy ; extra == 'test' + - paramiko ; sys_platform == 'win32' and extra == 'test' + - pre-commit ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-jupyter[client]>=0.4.1 ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest<8.2.0 ; extra == 'test' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl + name: jupyter-core + version: 5.8.1 + sha256: c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0 + requires_dist: + - platformdirs>=2.5 + - pywin32>=300 ; platform_python_implementation != 'PyPy' and sys_platform == 'win32' + - traitlets>=5.3 + - intersphinx-registry ; extra == 'docs' + - myst-parser ; extra == 'docs' + - pydata-sphinx-theme ; extra == 'docs' + - sphinx-autodoc-typehints ; extra == 'docs' + - sphinxcontrib-spelling ; extra == 'docs' + - traitlets ; extra == 'docs' + - ipykernel ; extra == 'test' + - pre-commit ; extra == 'test' + - pytest-cov ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest<9 ; extra == 'test' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl + name: jupyterlab-widgets + version: 3.0.15 + sha256: d59023d7d7ef71400d51e6fee9a88867f6e65e10a4201605d2d7f3e8f012a31c + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + name: kiwisolver + version: 1.4.9 + sha256: b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098 + requires_python: '>=3.10' +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda + sha256: 1a620f27d79217c1295049ba214c2f80372062fd251b569e9873d4a953d27554 + md5: 0be7c6e070c19105f966d3758448d018 + depends: + - __glibc >=2.17,<3.0.a0 + constrains: + - binutils_impl_linux-64 2.44 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 676044 + timestamp: 1752032747103 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda + sha256: da2080da8f0288b95dd86765c801c6e166c4619b910b11f9a8446fb852438dc2 + md5: 4211416ecba1866fab0c6470986c22d6 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + constrains: + - expat 2.7.1.* + license: MIT + license_family: MIT + purls: [] + size: 74811 + timestamp: 1752719572741 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda + sha256: 
764432d32db45466e87f10621db5b74363a9f847d2b8b1f9743746cd160f06ab + md5: ede4673863426c0883c0063d853bbd85 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: MIT + license_family: MIT + purls: [] + size: 57433 + timestamp: 1743434498161 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda + sha256: 144e35c1c2840f2dc202f6915fc41879c19eddbb8fa524e3ca4aa0d14018b26f + md5: f406dcbb2e7bef90d793e50e79a2882b + depends: + - __glibc >=2.17,<3.0.a0 + - _openmp_mutex >=4.5 + constrains: + - libgcc-ng ==15.1.0=*_4 + - libgomp 15.1.0 h767d61c_4 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 824153 + timestamp: 1753903866511 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda + sha256: 76ceac93ed98f208363d6e9c75011b0ff7b97b20f003f06461a619557e726637 + md5: 28771437ffcd9f3417c66012dc49a3be + depends: + - libgcc 15.1.0 h767d61c_4 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 29249 + timestamp: 1753903872571 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda + sha256: e0487a8fec78802ac04da0ac1139c3510992bc58a58cde66619dde3b363c2933 + md5: 3baf8976c96134738bba224e9ef6b1e5 + depends: + - __glibc >=2.17,<3.0.a0 + license: GPL-3.0-only WITH GCC-exception-3.1 + license_family: GPL + purls: [] + size: 447289 + timestamp: 1753903801049 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda + sha256: f2591c0069447bbe28d4d696b7fcb0c5bd0b4ac582769b89addbcf26fb3430d8 + md5: 1a580f7796c7bf6393fddb8bbbde58dc + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + constrains: + - xz 5.8.1.* + license: 0BSD + purls: [] + size: 112894 + timestamp: 1749230047870 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda + sha256: 3aa92d4074d4063f2a162cd8ecb45dccac93e543e565c01a787e16a43501f7ee + md5: c7e925f37e3b40d893459e625f6a53f1 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: BSD-2-Clause + license_family: BSD + purls: [] + size: 91183 + timestamp: 1748393666725 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda + sha256: 6d9c32fc369af5a84875725f7ddfbfc2ace795c28f246dc70055a79f9b2003da + md5: 0b367fad34931cb79e0d6b7e5c06bb1c + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=14 + - libzlib >=1.3.1,<2.0a0 + license: blessing + purls: [] + size: 932581 + timestamp: 1753948484112 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda + sha256: 787eb542f055a2b3de553614b25f09eefb0a0931b0c87dbcce6efdfd92f04f18 + md5: 40b61aab5c7ba9ff276c41cfffe6b80b + depends: + - libgcc-ng >=12 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 33601 + timestamp: 1680112270483 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda + sha256: d4bfe88d7cb447768e31650f06257995601f89076080e76df55e3112d4e47dc4 + md5: edb0dca6bc32e4f4789199455a1dbeb8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + constrains: + - zlib 1.3.1 *_2 + license: Zlib + license_family: Other + purls: [] + size: 60963 + timestamp: 1727963148474 +- pypi: https://files.pythonhosted.org/packages/96/2b/34cc11786bc00d0f04d0f5fdc3a2b1ae0b6239eef72d3d345805f9ad92a1/markdown-3.8.2-py3-none-any.whl + name: markdown + version: 3.8.2 + sha256: 5c83764dbd4e00bdd94d85a19b8d55ccca20fe35b2e678a1422b380324dd5f24 + requires_dist: + - importlib-metadata>=4.4 ; 
python_full_version < '3.10' + - coverage ; extra == 'testing' + - pyyaml ; extra == 'testing' + - mkdocs>=1.6 ; extra == 'docs' + - mkdocs-nature>=0.6 ; extra == 'docs' + - mdx-gh-links>=0.2 ; extra == 'docs' + - mkdocstrings[python] ; extra == 'docs' + - mkdocs-gen-files ; extra == 'docs' + - mkdocs-section-index ; extra == 'docs' + - mkdocs-literate-nav ; extra == 'docs' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl + name: markdown-it-py + version: 4.0.0 + sha256: 87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147 + requires_dist: + - mdurl~=0.1 + - psutil ; extra == 'benchmarking' + - pytest ; extra == 'benchmarking' + - pytest-benchmark ; extra == 'benchmarking' + - commonmark~=0.9 ; extra == 'compare' + - markdown~=3.4 ; extra == 'compare' + - mistletoe~=1.0 ; extra == 'compare' + - mistune~=3.0 ; extra == 'compare' + - panflute~=2.3 ; extra == 'compare' + - markdown-it-pyrs ; extra == 'compare' + - linkify-it-py>=1,<3 ; extra == 'linkify' + - mdit-py-plugins>=0.5.0 ; extra == 'plugins' + - gprof2dot ; extra == 'profiling' + - mdit-py-plugins>=0.5.0 ; extra == 'rtd' + - myst-parser ; extra == 'rtd' + - pyyaml ; extra == 'rtd' + - sphinx ; extra == 'rtd' + - sphinx-copybutton ; extra == 'rtd' + - sphinx-design ; extra == 'rtd' + - sphinx-book-theme~=1.0 ; extra == 'rtd' + - jupyter-sphinx ; extra == 'rtd' + - ipykernel ; extra == 'rtd' + - coverage ; extra == 'testing' + - pytest ; extra == 'testing' + - pytest-cov ; extra == 'testing' + - pytest-regressions ; extra == 'testing' + - requests ; extra == 'testing' + requires_python: '>=3.10' +- conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda + sha256: d812caf52efcea7c9fd0eafb21d45dadfd0516812f667b928bee50e87634fae5 + md5: 21b62c55924f01b6eef6827167b46acb + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - python >=3.13,<3.14.0a0 + - python_abi 3.13.* *_cp313 + constrains: + - jinja2 >=3.0.0 + license: BSD-3-Clause + license_family: BSD + purls: + - pkg:pypi/markupsafe?source=hash-mapping + size: 24856 + timestamp: 1733219782830 +- pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl + name: matplotlib + version: 3.10.5 + sha256: d52fd5b684d541b5a51fb276b2b97b010c75bee9aa392f96b4a07aeb491e33c7 + requires_dist: + - contourpy>=1.0.1 + - cycler>=0.10 + - fonttools>=4.22.0 + - kiwisolver>=1.3.1 + - numpy>=1.23 + - packaging>=20.0 + - pillow>=8 + - pyparsing>=2.3.1 + - python-dateutil>=2.7 + - meson-python>=0.13.1,<0.17.0 ; extra == 'dev' + - pybind11>=2.13.2,!=2.13.3 ; extra == 'dev' + - setuptools-scm>=7 ; extra == 'dev' + - setuptools>=64 ; extra == 'dev' + requires_python: '>=3.10' +- pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl + name: matplotlib-inline + version: 0.1.7 + sha256: df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca + requires_dist: + - traitlets + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl + name: mdurl + version: 0.1.2 + sha256: 84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 + requires_python: '>=3.7' +- pypi: 
https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl + name: mergedeep + version: 1.3.4 + sha256: 70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307 + requires_python: '>=3.6' +- pypi: https://files.pythonhosted.org/packages/89/a3/00260f8df72b51afa1f182dd609533c77fa2407918c4c2813d87b4a56725/minio-7.2.16-py3-none-any.whl + name: minio + version: 7.2.16 + sha256: 9288ab988ca57c181eb59a4c96187b293131418e28c164392186c2b89026b223 + requires_dist: + - argon2-cffi + - certifi + - pycryptodome + - typing-extensions + - urllib3 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl + name: mkdocs + version: 1.6.1 + sha256: db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e + requires_dist: + - click>=7.0 + - colorama>=0.4 ; sys_platform == 'win32' + - ghp-import>=1.0 + - importlib-metadata>=4.4 ; python_full_version < '3.10' + - jinja2>=2.11.1 + - markdown>=3.3.6 + - markupsafe>=2.0.1 + - mergedeep>=1.3.4 + - mkdocs-get-deps>=0.2.0 + - packaging>=20.5 + - pathspec>=0.11.1 + - pyyaml-env-tag>=0.1 + - pyyaml>=5.1 + - watchdog>=2.0 + - babel>=2.9.0 ; extra == 'i18n' + - babel==2.9.0 ; extra == 'min-versions' + - click==7.0 ; extra == 'min-versions' + - colorama==0.4 ; sys_platform == 'win32' and extra == 'min-versions' + - ghp-import==1.0 ; extra == 'min-versions' + - importlib-metadata==4.4 ; python_full_version < '3.10' and extra == 'min-versions' + - jinja2==2.11.1 ; extra == 'min-versions' + - markdown==3.3.6 ; extra == 'min-versions' + - markupsafe==2.0.1 ; extra == 'min-versions' + - mergedeep==1.3.4 ; extra == 'min-versions' + - mkdocs-get-deps==0.2.0 ; extra == 'min-versions' + - packaging==20.5 ; extra == 'min-versions' + - pathspec==0.11.1 ; extra == 'min-versions' + - pyyaml-env-tag==0.1 ; extra == 'min-versions' + - pyyaml==5.1 ; extra == 'min-versions' + - watchdog==2.0 ; extra == 'min-versions' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl + name: mkdocs-get-deps + version: 0.2.0 + sha256: 2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134 + requires_dist: + - importlib-metadata>=4.3 ; python_full_version < '3.10' + - mergedeep>=1.3.4 + - platformdirs>=2.2.0 + - pyyaml>=5.1 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/34/82/fc5ce89006389a6426ef28e326fc065b0fbaaed230373b62d14c889f47ea/mmh3-5.2.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + name: mmh3 + version: 5.2.0 + sha256: 7e5634565367b6d98dc4aa2983703526ef556b3688ba3065edb4b9b90ede1c54 + requires_dist: + - pytest==8.4.1 ; extra == 'test' + - pytest-sugar==1.0.0 ; extra == 'test' + - black==25.1.0 ; extra == 'lint' + - clang-format==20.1.8 ; extra == 'lint' + - isort==6.0.1 ; extra == 'lint' + - pylint==3.3.7 ; extra == 'lint' + - mypy==1.17.0 ; extra == 'type' + - myst-parser==4.0.1 ; extra == 'docs' + - shibuya==2025.7.24 ; extra == 'docs' + - sphinx==8.2.3 ; extra == 'docs' + - sphinx-copybutton==0.5.2 ; extra == 'docs' + - pymmh3==0.0.5 ; extra == 'benchmark' + - pyperf==2.9.0 ; extra == 'benchmark' + - xxhash==3.5.0 ; extra == 'benchmark' + - matplotlib==3.10.3 ; extra == 'plot' + - pandas==2.3.1 ; extra == 'plot' + requires_python: '>=3.9' +- pypi: 
https://files.pythonhosted.org/packages/86/5b/fbc73e91f7727ae1e79b21ed833308e99dc11cc1cd3d4717f579775de5e9/msal-1.33.0-py3-none-any.whl + name: msal + version: 1.33.0 + sha256: c0cd41cecf8eaed733ee7e3be9e040291eba53b0f262d3ae9c58f38b04244273 + requires_dist: + - requests>=2.0.0,<3 + - pyjwt[crypto]>=1.0.0,<3 + - cryptography>=2.5,<48 + - pymsalruntime>=0.14,<0.19 ; python_full_version >= '3.6' and sys_platform == 'win32' and extra == 'broker' + - pymsalruntime>=0.17,<0.19 ; python_full_version >= '3.8' and sys_platform == 'darwin' and extra == 'broker' + - pymsalruntime>=0.18,<0.19 ; python_full_version >= '3.8' and sys_platform == 'linux' and extra == 'broker' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/5e/75/bd9b7bb966668920f06b200e84454c8f3566b102183bc55c5473d96cb2b9/msal_extensions-1.3.1-py3-none-any.whl + name: msal-extensions + version: 1.3.1 + sha256: 96d3de4d034504e969ac5e85bae8106c8373b5c6568e4c8fa7af2eca9dbe6bca + requires_dist: + - msal>=1.29,<2 + - portalocker>=1.4,<4 ; extra == 'portalocker' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: msgpack + version: 1.1.1 + sha256: 9d592d06e3cc2f537ceeeb23d38799c6ad83255289bb84c2e5792e5a8dea268a + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl + name: multidict + version: 6.6.4 + sha256: 497a2954adc25c08daff36f795077f63ad33e13f19bfff7736e72c785391534f + requires_dist: + - typing-extensions>=4.1.0 ; python_full_version < '3.11' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda + sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 + md5: 47e340acb35de30501a76c7c799c41d7 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + license: X11 AND BSD-3-Clause + purls: [] + size: 891641 + timestamp: 1738195959188 +- pypi: https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl + name: nest-asyncio + version: 1.6.0 + sha256: 87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + requires_python: '>=3.5' +- pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl + name: networkx + version: '3.5' + sha256: 0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec + requires_dist: + - numpy>=1.25 ; extra == 'default' + - scipy>=1.11.2 ; extra == 'default' + - matplotlib>=3.8 ; extra == 'default' + - pandas>=2.0 ; extra == 'default' + - pre-commit>=4.1 ; extra == 'developer' + - mypy>=1.15 ; extra == 'developer' + - sphinx>=8.0 ; extra == 'doc' + - pydata-sphinx-theme>=0.16 ; extra == 'doc' + - sphinx-gallery>=0.18 ; extra == 'doc' + - numpydoc>=1.8.0 ; extra == 'doc' + - pillow>=10 ; extra == 'doc' + - texext>=0.6.7 ; extra == 'doc' + - myst-nb>=1.1 ; extra == 'doc' + - intersphinx-registry ; extra == 'doc' + - osmnx>=2.0.0 ; extra == 'example' + - momepy>=0.7.2 ; extra == 'example' + - contextily>=1.6 ; extra == 'example' + - seaborn>=0.13 ; extra == 'example' + - cairocffi>=1.7 ; extra == 'example' + - igraph>=0.11 ; extra == 'example' + - scikit-learn>=1.5 ; extra == 'example' + - 
lxml>=4.6 ; extra == 'extra' + - pygraphviz>=1.14 ; extra == 'extra' + - pydot>=3.0.1 ; extra == 'extra' + - sympy>=1.10 ; extra == 'extra' + - pytest>=7.2 ; extra == 'test' + - pytest-cov>=4.0 ; extra == 'test' + - pytest-xdist>=3.0 ; extra == 'test' + - pytest-mpl ; extra == 'test-extras' + - pytest-randomly ; extra == 'test-extras' + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + name: numpy + version: 2.3.2 + sha256: 938065908d1d869c7d75d8ec45f735a034771c6ea07088867f713d1cd3bbbe4f + requires_python: '>=3.11' +- pypi: https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl + name: oauthlib + version: 3.3.1 + sha256: 88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1 + requires_dist: + - cryptography>=3.0.0 ; extra == 'rsa' + - cryptography>=3.0.0 ; extra == 'signedtoken' + - pyjwt>=2.0.0,<3 ; extra == 'signedtoken' + - blinker>=1.4.0 ; extra == 'signals' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl + name: opencensus + version: 0.11.4 + sha256: a18487ce68bc19900336e0ff4655c5a116daf10c1b3685ece8d971bddad6a864 + requires_dist: + - opencensus-context>=0.1.3 + - six~=1.16 + - google-api-core>=1.0.0,<2.0.0 ; python_full_version < '3.6' + - google-api-core>=1.0.0,<3.0.0 ; python_full_version >= '3.6' +- pypi: https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl + name: opencensus-context + version: 0.1.3 + sha256: 073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039 + requires_dist: + - contextvars ; python_full_version == '3.6.*' +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda + sha256: c9f54d4e8212f313be7b02eb962d0cb13a8dae015683a403d3accd4add3e520e + md5: ffffb341206dd0dab0c36053c048d621 + depends: + - __glibc >=2.17,<3.0.a0 + - ca-certificates + - libgcc >=14 + license: Apache-2.0 + license_family: Apache + purls: [] + size: 3128847 + timestamp: 1754465526100 +- pypi: https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl + name: opentelemetry-api + version: 1.36.0 + sha256: 02f20bcacf666e1333b6b1f04e647dc1d5111f86b8e510238fcc56d7762cda8c + requires_dist: + - importlib-metadata>=6.0,<8.8.0 + - typing-extensions>=4.5.0 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl + name: opentelemetry-exporter-prometheus + version: 0.57b0 + sha256: c5b893d1cdd593fb022af2c7de3258c2d5a4d04402ae80d9fa35675fed77f05c + requires_dist: + - opentelemetry-api~=1.12 + - opentelemetry-sdk~=1.36.0 + - prometheus-client>=0.5.0,<1.0.0 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl + name: opentelemetry-proto + version: 1.36.0 + sha256: 151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e + requires_dist: + - protobuf>=5.0,<7.0 + requires_python: '>=3.9' +- pypi: 
https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl + name: opentelemetry-sdk + version: 1.36.0 + sha256: 19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb + requires_dist: + - opentelemetry-api==1.36.0 + - opentelemetry-semantic-conventions==0.57b0 + - typing-extensions>=4.5.0 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl + name: opentelemetry-semantic-conventions + version: 0.57b0 + sha256: 757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78 + requires_dist: + - opentelemetry-api==1.36.0 + - typing-extensions>=4.5.0 + requires_python: '>=3.9' +- pypi: ./ + name: orcapod + version: 0.0.3a2.dev29+g197f3e1.d20250813 + sha256: 9604e103255e0296954d6e36a7b1822e342d2a03c78afb0bcae28ac9d9121b24 + requires_dist: + - xxhash + - networkx + - typing-extensions + - matplotlib>=3.10.3 + - pandas>=2.2.3 + - pyyaml>=6.0.2 + - pyarrow>=20.0.0 + - polars>=1.31.0 + - beartype>=0.21.0 + - deltalake>=1.0.2 + - pdoc>=15.0.4 + - redis>=6.2.0 ; extra == 'redis' + - ray[default]==2.48.0 ; extra == 'ray' + - ipywidgets>=8.1.7 ; extra == 'ray' + - orcapod[redis] ; extra == 'all' + - orcapod[ray] ; extra == 'all' + requires_python: '>=3.12.0' + editable: true +- pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl + name: packaging + version: '25.0' + sha256: 29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: pandas + version: 2.3.1 + sha256: 2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9 + requires_dist: + - numpy>=1.22.4 ; python_full_version < '3.11' + - numpy>=1.23.2 ; python_full_version == '3.11.*' + - numpy>=1.26.0 ; python_full_version >= '3.12' + - python-dateutil>=2.8.2 + - pytz>=2020.1 + - tzdata>=2022.7 + - hypothesis>=6.46.1 ; extra == 'test' + - pytest>=7.3.2 ; extra == 'test' + - pytest-xdist>=2.2.0 ; extra == 'test' + - pyarrow>=10.0.1 ; extra == 'pyarrow' + - bottleneck>=1.3.6 ; extra == 'performance' + - numba>=0.56.4 ; extra == 'performance' + - numexpr>=2.8.4 ; extra == 'performance' + - scipy>=1.10.0 ; extra == 'computation' + - xarray>=2022.12.0 ; extra == 'computation' + - fsspec>=2022.11.0 ; extra == 'fss' + - s3fs>=2022.11.0 ; extra == 'aws' + - gcsfs>=2022.11.0 ; extra == 'gcp' + - pandas-gbq>=0.19.0 ; extra == 'gcp' + - odfpy>=1.4.1 ; extra == 'excel' + - openpyxl>=3.1.0 ; extra == 'excel' + - python-calamine>=0.1.7 ; extra == 'excel' + - pyxlsb>=1.0.10 ; extra == 'excel' + - xlrd>=2.0.1 ; extra == 'excel' + - xlsxwriter>=3.0.5 ; extra == 'excel' + - pyarrow>=10.0.1 ; extra == 'parquet' + - pyarrow>=10.0.1 ; extra == 'feather' + - tables>=3.8.0 ; extra == 'hdf5' + - pyreadstat>=1.2.0 ; extra == 'spss' + - sqlalchemy>=2.0.0 ; extra == 'postgresql' + - psycopg2>=2.9.6 ; extra == 'postgresql' + - adbc-driver-postgresql>=0.8.0 ; extra == 'postgresql' + - sqlalchemy>=2.0.0 ; extra == 'mysql' + - pymysql>=1.0.2 ; extra == 'mysql' + - sqlalchemy>=2.0.0 ; extra == 'sql-other' + - adbc-driver-postgresql>=0.8.0 ; extra == 'sql-other' + - adbc-driver-sqlite>=0.8.0 ; extra == 
'sql-other' + - beautifulsoup4>=4.11.2 ; extra == 'html' + - html5lib>=1.1 ; extra == 'html' + - lxml>=4.9.2 ; extra == 'html' + - lxml>=4.9.2 ; extra == 'xml' + - matplotlib>=3.6.3 ; extra == 'plot' + - jinja2>=3.1.2 ; extra == 'output-formatting' + - tabulate>=0.9.0 ; extra == 'output-formatting' + - pyqt5>=5.15.9 ; extra == 'clipboard' + - qtpy>=2.3.0 ; extra == 'clipboard' + - zstandard>=0.19.0 ; extra == 'compression' + - dataframe-api-compat>=0.1.7 ; extra == 'consortium-standard' + - adbc-driver-postgresql>=0.8.0 ; extra == 'all' + - adbc-driver-sqlite>=0.8.0 ; extra == 'all' + - beautifulsoup4>=4.11.2 ; extra == 'all' + - bottleneck>=1.3.6 ; extra == 'all' + - dataframe-api-compat>=0.1.7 ; extra == 'all' + - fastparquet>=2022.12.0 ; extra == 'all' + - fsspec>=2022.11.0 ; extra == 'all' + - gcsfs>=2022.11.0 ; extra == 'all' + - html5lib>=1.1 ; extra == 'all' + - hypothesis>=6.46.1 ; extra == 'all' + - jinja2>=3.1.2 ; extra == 'all' + - lxml>=4.9.2 ; extra == 'all' + - matplotlib>=3.6.3 ; extra == 'all' + - numba>=0.56.4 ; extra == 'all' + - numexpr>=2.8.4 ; extra == 'all' + - odfpy>=1.4.1 ; extra == 'all' + - openpyxl>=3.1.0 ; extra == 'all' + - pandas-gbq>=0.19.0 ; extra == 'all' + - psycopg2>=2.9.6 ; extra == 'all' + - pyarrow>=10.0.1 ; extra == 'all' + - pymysql>=1.0.2 ; extra == 'all' + - pyqt5>=5.15.9 ; extra == 'all' + - pyreadstat>=1.2.0 ; extra == 'all' + - pytest>=7.3.2 ; extra == 'all' + - pytest-xdist>=2.2.0 ; extra == 'all' + - python-calamine>=0.1.7 ; extra == 'all' + - pyxlsb>=1.0.10 ; extra == 'all' + - qtpy>=2.3.0 ; extra == 'all' + - scipy>=1.10.0 ; extra == 'all' + - s3fs>=2022.11.0 ; extra == 'all' + - sqlalchemy>=2.0.0 ; extra == 'all' + - tables>=3.8.0 ; extra == 'all' + - tabulate>=0.9.0 ; extra == 'all' + - xarray>=2022.12.0 ; extra == 'all' + - xlrd>=2.0.1 ; extra == 'all' + - xlsxwriter>=3.0.5 ; extra == 'all' + - zstandard>=0.19.0 ; extra == 'all' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl + name: parso + version: 0.8.4 + sha256: a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18 + requires_dist: + - flake8==5.0.4 ; extra == 'qa' + - mypy==0.971 ; extra == 'qa' + - types-setuptools==67.2.0.1 ; extra == 'qa' + - docopt ; extra == 'testing' + - pytest ; extra == 'testing' + requires_python: '>=3.6' +- pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl + name: pathspec + version: 0.12.1 + sha256: a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 + requires_python: '>=3.8' +- conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda + sha256: 34a0ce54796d743113ef962c0d1a61e26f4f777c80647e14fd6bea7b3350b912 + md5: 751a8b7d5f3c6f428074e6ac34a2849b + depends: + - astunparse + - jinja2 >=2.11.0 + - markupsafe + - pygments >=2.12.0 + - python >=3.9 + license: Unlicense + purls: + - pkg:pypi/pdoc?source=hash-mapping + size: 123955 + timestamp: 1755106041556 +- pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl + name: pexpect + version: 4.9.0 + sha256: 7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 + requires_dist: + - ptyprocess>=0.5 +- pypi: 
https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl + name: pillow + version: 11.3.0 + sha256: 13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8 + requires_dist: + - furo ; extra == 'docs' + - olefile ; extra == 'docs' + - sphinx>=8.2 ; extra == 'docs' + - sphinx-autobuild ; extra == 'docs' + - sphinx-copybutton ; extra == 'docs' + - sphinx-inline-tabs ; extra == 'docs' + - sphinxext-opengraph ; extra == 'docs' + - olefile ; extra == 'fpx' + - olefile ; extra == 'mic' + - pyarrow ; extra == 'test-arrow' + - check-manifest ; extra == 'tests' + - coverage>=7.4.2 ; extra == 'tests' + - defusedxml ; extra == 'tests' + - markdown2 ; extra == 'tests' + - olefile ; extra == 'tests' + - packaging ; extra == 'tests' + - pyroma ; extra == 'tests' + - pytest ; extra == 'tests' + - pytest-cov ; extra == 'tests' + - pytest-timeout ; extra == 'tests' + - pytest-xdist ; extra == 'tests' + - trove-classifiers>=2024.10.12 ; extra == 'tests' + - typing-extensions ; python_full_version < '3.10' and extra == 'typing' + - defusedxml ; extra == 'xmp' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl + name: pip + version: '25.2' + sha256: 6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl + name: platformdirs + version: 4.3.8 + sha256: ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4 + requires_dist: + - furo>=2024.8.6 ; extra == 'docs' + - proselint>=0.14 ; extra == 'docs' + - sphinx-autodoc-typehints>=3 ; extra == 'docs' + - sphinx>=8.1.3 ; extra == 'docs' + - appdirs==1.4.4 ; extra == 'test' + - covdefaults>=2.3 ; extra == 'test' + - pytest-cov>=6 ; extra == 'test' + - pytest-mock>=3.14 ; extra == 'test' + - pytest>=8.3.4 ; extra == 'test' + - mypy>=1.14.1 ; extra == 'type' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl + name: pluggy + version: 1.6.0 + sha256: e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746 + requires_dist: + - pre-commit ; extra == 'dev' + - tox ; extra == 'dev' + - pytest ; extra == 'testing' + - pytest-benchmark ; extra == 'testing' + - coverage ; extra == 'testing' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: polars + version: 1.32.2 + sha256: a711a750cfc19f1f883d2b46895dd698abf4d446ca41c3bf510ced0ff1178057 + requires_dist: + - polars-cloud>=0.0.1a1 ; extra == 'polars-cloud' + - numpy>=1.16.0 ; extra == 'numpy' + - pandas ; extra == 'pandas' + - polars[pyarrow] ; extra == 'pandas' + - pyarrow>=7.0.0 ; extra == 'pyarrow' + - pydantic ; extra == 'pydantic' + - fastexcel>=0.9 ; extra == 'calamine' + - openpyxl>=3.0.0 ; extra == 'openpyxl' + - xlsx2csv>=0.8.0 ; extra == 'xlsx2csv' + - xlsxwriter ; extra == 'xlsxwriter' + - polars[calamine,openpyxl,xlsx2csv,xlsxwriter] ; extra == 'excel' + - adbc-driver-manager[dbapi] ; extra == 'adbc' + - adbc-driver-sqlite[dbapi] ; extra == 'adbc' + - connectorx>=0.3.2 ; extra == 
'connectorx' + - sqlalchemy ; extra == 'sqlalchemy' + - polars[pandas] ; extra == 'sqlalchemy' + - polars[adbc,connectorx,sqlalchemy] ; extra == 'database' + - fsspec ; extra == 'fsspec' + - deltalake>=1.0.0 ; extra == 'deltalake' + - pyiceberg>=0.7.1 ; extra == 'iceberg' + - gevent ; extra == 'async' + - cloudpickle ; extra == 'cloudpickle' + - matplotlib ; extra == 'graph' + - altair>=5.4.0 ; extra == 'plot' + - great-tables>=0.8.0 ; extra == 'style' + - tzdata ; sys_platform == 'win32' and extra == 'timezone' + - cudf-polars-cu12 ; extra == 'gpu' + - polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone] ; extra == 'all' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl + name: prometheus-client + version: 0.22.1 + sha256: cca895342e308174341b2cbf99a56bef291fbc0ef7b9e5412a0f26d653ba7094 + requires_dist: + - twisted ; extra == 'twisted' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl + name: prompt-toolkit + version: 3.0.51 + sha256: 52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07 + requires_dist: + - wcwidth + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: propcache + version: 0.3.2 + sha256: 4c1396592321ac83157ac03a2023aa6cc4a3cc3cfdecb71090054c09e5a7cce3 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl + name: proto-plus + version: 1.26.1 + sha256: 13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66 + requires_dist: + - protobuf>=3.19.0,<7.0.0 + - google-api-core>=1.31.5 ; extra == 'testing' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl + name: protobuf + version: 6.31.1 + sha256: 4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: psutil + version: 7.0.0 + sha256: 4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34 + requires_dist: + - pytest ; extra == 'dev' + - pytest-xdist ; extra == 'dev' + - setuptools ; extra == 'dev' + - abi3audit ; extra == 'dev' + - black==24.10.0 ; extra == 'dev' + - check-manifest ; extra == 'dev' + - coverage ; extra == 'dev' + - packaging ; extra == 'dev' + - pylint ; extra == 'dev' + - pyperf ; extra == 'dev' + - pypinfo ; extra == 'dev' + - pytest-cov ; extra == 'dev' + - requests ; extra == 'dev' + - rstcheck ; extra == 'dev' + - ruff ; extra == 'dev' + - sphinx ; extra == 'dev' + - sphinx-rtd-theme ; extra == 'dev' + - toml-sort ; extra == 'dev' + - twine ; extra == 'dev' + - virtualenv ; extra == 'dev' + - vulture ; extra == 'dev' + - wheel ; extra == 'dev' + - pytest ; extra == 'test' + - pytest-xdist ; extra == 'test' + - setuptools ; extra == 'test' + 
requires_python: '>=3.6' +- pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl + name: ptyprocess + version: 0.7.0 + sha256: 4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 +- pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl + name: pure-eval + version: 0.2.3 + sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 + requires_dist: + - pytest ; extra == 'tests' +- pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl + name: py-spy + version: 0.4.1 + sha256: 6a80ec05eb8a6883863a367c6a4d4f2d57de68466f7956b6367d4edd5c61bb29 + requires_dist: + - numpy ; extra == 'test' +- pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl + name: pyarrow + version: 21.0.0 + sha256: 69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61 + requires_dist: + - pytest ; extra == 'test' + - hypothesis ; extra == 'test' + - cffi ; extra == 'test' + - pytz ; extra == 'test' + - pandas ; extra == 'test' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/ca/a1/d0c333111d801c77a83a32f793222c4b9aef7de0fdb2ceb73a1980a6c98b/pyarrow_stubs-20.0.0.20250716-py3-none-any.whl + name: pyarrow-stubs + version: 20.0.0.20250716 + sha256: 8ecfdd215af468d6b993e2290da7f3d51a32991c1d230b90682f7ee4bc5ee7cd + requires_dist: + - pyarrow>=20 + requires_python: '>=3.9,<4' +- pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl + name: pyasn1 + version: 0.6.1 + sha256: 0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl + name: pyasn1-modules + version: 0.4.2 + sha256: 29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a + requires_dist: + - pyasn1>=0.6.1,<0.7.0 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl + name: pycparser + version: '2.22' + sha256: c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/5f/e9/a09476d436d0ff1402ac3867d933c61805ec2326c6ea557aeeac3825604e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: pycryptodome + version: 3.23.0 + sha256: c8987bd3307a39bc03df5c8e0e3d8be0c4c3518b7f044b0f4c15d1aa78f52575 + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' +- pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl + name: pydantic + version: 2.11.7 + sha256: dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b + requires_dist: + - annotated-types>=0.6.0 + - pydantic-core==2.33.2 + - typing-extensions>=4.12.2 + - typing-inspection>=0.4.0 + - email-validator>=2.0.0 ; extra == 'email' + - tzdata ; python_full_version >= '3.9' and sys_platform == 'win32' and 
extra == 'timezone' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: pydantic-core + version: 2.33.2 + sha256: 9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d + requires_dist: + - typing-extensions>=4.6.0,!=4.7.0 + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda + sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a + md5: 6b6ece66ebcae2d5f326c77ef2c5a066 + depends: + - python >=3.9 + license: BSD-2-Clause + license_family: BSD + purls: + - pkg:pypi/pygments?source=hash-mapping + size: 889287 + timestamp: 1750615908735 +- pypi: https://files.pythonhosted.org/packages/bd/6a/6c1ac381ff0b8e03a9abc2f05722f6002d7452a2c05118697b3f3910e171/pyiceberg-0.9.1.tar.gz + name: pyiceberg + version: 0.9.1 + sha256: 3634134ce33859a441768b39df179b2c6f3de2bbbf506622884f553b013ee799 + requires_dist: + - adlfs>=2023.1.0 ; extra == 'adlfs' + - boto3>=1.24.59 ; extra == 'dynamodb' or extra == 'glue' or extra == 'rest-sigv4' + - cachetools>=5.5.0,<6.0.0 + - click>=7.1.1,<9.0.0 + - duckdb>=0.5.0,<2.0.0 ; extra == 'duckdb' + - fsspec>=2023.1.0 + - gcsfs>=2023.1.0 ; extra == 'gcsfs' + - getdaft>=0.2.12 ; extra == 'daft' + - kerberos>=1.3.1,<2.0.0 ; extra == 'hive-kerberos' + - mmh3>=4.0.0,<6.0.0 + - mypy-boto3-glue>=1.28.18 ; extra == 'glue' + - pandas>=1.0.0,<3.0.0 ; extra == 'pandas' or extra == 'ray' + - polars>=1.21.0,<2.0.0 ; extra == 'polars' + - psycopg2-binary>=2.9.6 ; extra == 'sql-postgres' + - pyarrow>=17.0.0,<20.0.0 ; extra == 'duckdb' or extra == 'pandas' or extra == 'pyarrow' or extra == 'ray' + - pydantic>=2.0,!=2.4.0,!=2.4.1,<3.0 + - pyiceberg-core>=0.4.0,<0.5.0 ; extra == 'pyiceberg-core' + - pyparsing>=3.1.0,<4.0.0 + - python-snappy>=0.6.0,<1.0.0 ; extra == 'snappy' + - ray==2.10.0 ; python_full_version < '3.9' and extra == 'ray' + - ray>=2.10.0,<3.0.0 ; python_full_version >= '3.9' and extra == 'ray' + - requests>=2.20.0,<3.0.0 + - rich>=10.11.0,<14.0.0 + - s3fs>=2023.1.0 ; extra == 's3fs' + - sortedcontainers==2.4.0 + - sqlalchemy>=2.0.18,<3.0.0 ; extra == 'sql-postgres' or extra == 'sql-sqlite' + - strictyaml>=1.7.0,<2.0.0 + - tenacity>=8.2.3,<10.0.0 + - thrift>=0.13.0,<1.0.0 ; extra == 'hive' or extra == 'hive-kerberos' + - thrift-sasl>=0.4.3 ; extra == 'hive-kerberos' + - zstandard>=0.13.0,<1.0.0 ; extra == 'zstandard' + requires_python: '!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9' +- pypi: https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl + name: pyjwt + version: 2.10.1 + sha256: dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb + requires_dist: + - cryptography>=3.4.0 ; extra == 'crypto' + - coverage[toml]==5.0.4 ; extra == 'dev' + - cryptography>=3.4.0 ; extra == 'dev' + - pre-commit ; extra == 'dev' + - pytest>=6.0.0,<7.0.0 ; extra == 'dev' + - sphinx ; extra == 'dev' + - sphinx-rtd-theme ; extra == 'dev' + - zope-interface ; extra == 'dev' + - sphinx ; extra == 'docs' + - sphinx-rtd-theme ; extra == 'docs' + - zope-interface ; extra == 'docs' + - coverage[toml]==5.0.4 ; extra == 'tests' + - pytest>=6.0.0,<7.0.0 ; extra == 'tests' + requires_python: '>=3.9' +- pypi: 
https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl + name: pyparsing + version: 3.2.3 + sha256: a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf + requires_dist: + - railroad-diagrams ; extra == 'diagrams' + - jinja2 ; extra == 'diagrams' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl + name: pysocks + version: 1.7.1 + sha256: 2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5 + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*' +- pypi: https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl + name: pytest + version: 8.4.1 + sha256: 539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7 + requires_dist: + - colorama>=0.4 ; sys_platform == 'win32' + - exceptiongroup>=1 ; python_full_version < '3.11' + - iniconfig>=1 + - packaging>=20 + - pluggy>=1.5,<2 + - pygments>=2.7.2 + - tomli>=1 ; python_full_version < '3.11' + - argcomplete ; extra == 'dev' + - attrs>=19.2 ; extra == 'dev' + - hypothesis>=3.56 ; extra == 'dev' + - mock ; extra == 'dev' + - requests ; extra == 'dev' + - setuptools ; extra == 'dev' + - xmlschema ; extra == 'dev' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/bc/16/4ea354101abb1287856baa4af2732be351c7bee728065aed451b678153fd/pytest_cov-6.2.1-py3-none-any.whl + name: pytest-cov + version: 6.2.1 + sha256: f5bc4c23f42f1cdd23c70b1dab1bbaef4fc505ba950d53e0081d0730dd7e86d5 + requires_dist: + - pytest>=6.2.5 + - coverage[toml]>=7.5 + - pluggy>=1.2 + - fields ; extra == 'testing' + - hunter ; extra == 'testing' + - process-tests ; extra == 'testing' + - pytest-xdist ; extra == 'testing' + - virtualenv ; extra == 'testing' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda + build_number: 102 + sha256: c2cdcc98ea3cbf78240624e4077e164dc9d5588eefb044b4097c3df54d24d504 + md5: 89e07d92cf50743886f41638d58c4328 + depends: + - __glibc >=2.17,<3.0.a0 + - bzip2 >=1.0.8,<2.0a0 + - ld_impl_linux-64 >=2.36.1 + - libexpat >=2.7.0,<3.0a0 + - libffi >=3.4.6,<3.5.0a0 + - libgcc >=13 + - liblzma >=5.8.1,<6.0a0 + - libmpdec >=4.0.0,<5.0a0 + - libsqlite >=3.50.1,<4.0a0 + - libuuid >=2.38.1,<3.0a0 + - libzlib >=1.3.1,<2.0a0 + - ncurses >=6.5,<7.0a0 + - openssl >=3.5.0,<4.0a0 + - python_abi 3.13.* *_cp313 + - readline >=8.2,<9.0a0 + - tk >=8.6.13,<8.7.0a0 + - tzdata + license: Python-2.0 + purls: [] + size: 33273132 + timestamp: 1750064035176 + python_site_packages_path: lib/python3.13/site-packages +- pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl + name: python-dateutil + version: 2.9.0.post0 + sha256: a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 + requires_dist: + - six>=1.5 + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*' +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda + build_number: 8 + sha256: 210bffe7b121e651419cb196a2a63687b087497595c9be9d20ebe97dd06060a7 + md5: 94305520c52a4aa3f6c2b1ff6008d9f8 + constrains: + - python 3.13.* *_cp313 + license: BSD-3-Clause + license_family: BSD + purls: [] + size: 7002 + timestamp: 1752805902938 +- pypi: 
https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl + name: pytz + version: '2025.2' + sha256: 5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00 +- pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: pyyaml + version: 6.0.2 + sha256: 70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl + name: pyyaml-env-tag + version: '1.1' + sha256: 17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04 + requires_dist: + - pyyaml + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/7e/0a/2356305c423a975000867de56888b79e44ec2192c690ff93c3109fd78081/pyzmq-27.0.1-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl + name: pyzmq + version: 27.0.1 + sha256: f5b6133c8d313bde8bd0d123c169d22525300ff164c2189f849de495e1344577 + requires_dist: + - cffi ; implementation_name == 'pypy' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl + name: ray + version: 2.48.0 + sha256: 25e4b79fcc8f849d72db1acc4f03f37008c5c0b745df63d8a30cd35676b6545e + requires_dist: + - click>=7.0 + - filelock + - jsonschema + - msgpack>=1.0.0,<2.0.0 + - packaging + - protobuf>=3.15.3,!=3.19.5 + - pyyaml + - requests + - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'cgraph' + - grpcio!=1.56.0 ; sys_platform == 'darwin' and extra == 'client' + - grpcio ; extra == 'client' + - numpy>=1.20 ; extra == 'data' + - pandas>=1.3 ; extra == 'data' + - pyarrow>=9.0.0 ; extra == 'data' + - fsspec ; extra == 'data' + - aiohttp>=3.7 ; extra == 'default' + - aiohttp-cors ; extra == 'default' + - colorful ; extra == 'default' + - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'default' + - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'default' + - requests ; extra == 'default' + - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'default' + - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'default' + - opencensus ; extra == 'default' + - opentelemetry-sdk>=1.30.0 ; extra == 'default' + - opentelemetry-exporter-prometheus ; extra == 'default' + - opentelemetry-proto ; extra == 'default' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'default' + - prometheus-client>=0.7.1 ; extra == 'default' + - smart-open ; extra == 'default' + - virtualenv>=20.0.24,!=20.21.1 ; extra == 'default' + - memray ; sys_platform != 'win32' and extra == 'observability' + - colorful ; extra == 'serve' + - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'serve' + - opencensus ; extra == 'serve' + - aiohttp>=3.7 ; extra == 'serve' + - prometheus-client>=0.7.1 ; extra == 'serve' + - aiohttp-cors ; extra == 'serve' + - opentelemetry-exporter-prometheus ; extra == 'serve' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'serve' + - virtualenv>=20.0.24,!=20.21.1 ; extra == 'serve' + - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'serve' + - uvicorn[standard] ; extra == 'serve' + - fastapi ; extra == 'serve' + - requests ; extra == 'serve' + - opentelemetry-sdk>=1.30.0 ; 
extra == 'serve' + - smart-open ; extra == 'serve' + - opentelemetry-proto ; extra == 'serve' + - starlette ; extra == 'serve' + - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'serve' + - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'serve' + - watchfiles ; extra == 'serve' + - pandas ; extra == 'tune' + - tensorboardx>=1.9 ; extra == 'tune' + - requests ; extra == 'tune' + - pyarrow>=9.0.0 ; extra == 'tune' + - fsspec ; extra == 'tune' + - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'adag' + - colorful ; extra == 'serve-grpc' + - opencensus ; extra == 'serve-grpc' + - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'serve-grpc' + - aiohttp>=3.7 ; extra == 'serve-grpc' + - prometheus-client>=0.7.1 ; extra == 'serve-grpc' + - aiohttp-cors ; extra == 'serve-grpc' + - opentelemetry-exporter-prometheus ; extra == 'serve-grpc' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'serve-grpc' + - virtualenv>=20.0.24,!=20.21.1 ; extra == 'serve-grpc' + - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'serve-grpc' + - uvicorn[standard] ; extra == 'serve-grpc' + - fastapi ; extra == 'serve-grpc' + - requests ; extra == 'serve-grpc' + - opentelemetry-sdk>=1.30.0 ; extra == 'serve-grpc' + - smart-open ; extra == 'serve-grpc' + - opentelemetry-proto ; extra == 'serve-grpc' + - starlette ; extra == 'serve-grpc' + - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'serve-grpc' + - pyopenssl ; extra == 'serve-grpc' + - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'serve-grpc' + - watchfiles ; extra == 'serve-grpc' + - ray-cpp==2.48.0 ; extra == 'cpp' + - pandas ; extra == 'rllib' + - tensorboardx>=1.9 ; extra == 'rllib' + - requests ; extra == 'rllib' + - pyarrow>=9.0.0 ; extra == 'rllib' + - fsspec ; extra == 'rllib' + - dm-tree ; extra == 'rllib' + - gymnasium==1.0.0 ; extra == 'rllib' + - lz4 ; extra == 'rllib' + - ormsgpack==1.7.0 ; extra == 'rllib' + - pyyaml ; extra == 'rllib' + - scipy ; extra == 'rllib' + - pandas ; extra == 'train' + - tensorboardx>=1.9 ; extra == 'train' + - requests ; extra == 'train' + - pyarrow>=9.0.0 ; extra == 'train' + - fsspec ; extra == 'train' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'train' + - colorful ; extra == 'air' + - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'air' + - opencensus ; extra == 'air' + - aiohttp>=3.7 ; extra == 'air' + - prometheus-client>=0.7.1 ; extra == 'air' + - aiohttp-cors ; extra == 'air' + - tensorboardx>=1.9 ; extra == 'air' + - opentelemetry-exporter-prometheus ; extra == 'air' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'air' + - virtualenv>=20.0.24,!=20.21.1 ; extra == 'air' + - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'air' + - pandas>=1.3 ; extra == 'air' + - uvicorn[standard] ; extra == 'air' + - fsspec ; extra == 'air' + - fastapi ; extra == 'air' + - requests ; extra == 'air' + - opentelemetry-sdk>=1.30.0 ; extra == 'air' + - smart-open ; extra == 'air' + - opentelemetry-proto ; extra == 'air' + - pyarrow>=9.0.0 ; extra == 'air' + - starlette ; extra == 'air' + - pandas ; extra == 'air' + - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'air' + - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'air' + - numpy>=1.20 ; extra == 'air' + - watchfiles ; extra == 'air' + - colorful ; extra == 'all' + - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'all' + - opencensus ; extra == 'all' + - aiohttp>=3.7 ; extra == 'all' + - 
grpcio!=1.56.0 ; sys_platform == 'darwin' and extra == 'all' + - scipy ; extra == 'all' + - prometheus-client>=0.7.1 ; extra == 'all' + - aiohttp-cors ; extra == 'all' + - opentelemetry-exporter-prometheus ; extra == 'all' + - tensorboardx>=1.9 ; extra == 'all' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'all' + - virtualenv>=20.0.24,!=20.21.1 ; extra == 'all' + - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'all' + - pandas>=1.3 ; extra == 'all' + - uvicorn[standard] ; extra == 'all' + - ormsgpack==1.7.0 ; extra == 'all' + - fsspec ; extra == 'all' + - fastapi ; extra == 'all' + - requests ; extra == 'all' + - opentelemetry-sdk>=1.30.0 ; extra == 'all' + - gymnasium==1.0.0 ; extra == 'all' + - smart-open ; extra == 'all' + - memray ; sys_platform != 'win32' and extra == 'all' + - dm-tree ; extra == 'all' + - lz4 ; extra == 'all' + - opentelemetry-proto ; extra == 'all' + - pyarrow>=9.0.0 ; extra == 'all' + - starlette ; extra == 'all' + - pandas ; extra == 'all' + - pyyaml ; extra == 'all' + - grpcio ; extra == 'all' + - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'all' + - pyopenssl ; extra == 'all' + - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'all' + - numpy>=1.20 ; extra == 'all' + - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'all' + - watchfiles ; extra == 'all' + - colorful ; extra == 'all-cpp' + - opencensus ; extra == 'all-cpp' + - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'all-cpp' + - aiohttp>=3.7 ; extra == 'all-cpp' + - grpcio!=1.56.0 ; sys_platform == 'darwin' and extra == 'all-cpp' + - ray-cpp==2.48.0 ; extra == 'all-cpp' + - scipy ; extra == 'all-cpp' + - prometheus-client>=0.7.1 ; extra == 'all-cpp' + - aiohttp-cors ; extra == 'all-cpp' + - opentelemetry-exporter-prometheus ; extra == 'all-cpp' + - tensorboardx>=1.9 ; extra == 'all-cpp' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'all-cpp' + - virtualenv>=20.0.24,!=20.21.1 ; extra == 'all-cpp' + - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'all-cpp' + - pandas>=1.3 ; extra == 'all-cpp' + - uvicorn[standard] ; extra == 'all-cpp' + - ormsgpack==1.7.0 ; extra == 'all-cpp' + - fsspec ; extra == 'all-cpp' + - fastapi ; extra == 'all-cpp' + - requests ; extra == 'all-cpp' + - opentelemetry-sdk>=1.30.0 ; extra == 'all-cpp' + - gymnasium==1.0.0 ; extra == 'all-cpp' + - smart-open ; extra == 'all-cpp' + - memray ; sys_platform != 'win32' and extra == 'all-cpp' + - dm-tree ; extra == 'all-cpp' + - lz4 ; extra == 'all-cpp' + - opentelemetry-proto ; extra == 'all-cpp' + - pyarrow>=9.0.0 ; extra == 'all-cpp' + - starlette ; extra == 'all-cpp' + - pandas ; extra == 'all-cpp' + - pyyaml ; extra == 'all-cpp' + - grpcio ; extra == 'all-cpp' + - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'all-cpp' + - pyopenssl ; extra == 'all-cpp' + - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'all-cpp' + - numpy>=1.20 ; extra == 'all-cpp' + - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'all-cpp' + - watchfiles ; extra == 'all-cpp' + - colorful ; extra == 'llm' + - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'llm' + - opencensus ; extra == 'llm' + - aiohttp>=3.7 ; extra == 'llm' + - prometheus-client>=0.7.1 ; extra == 'llm' + - aiohttp-cors ; extra == 'llm' + - opentelemetry-exporter-prometheus ; extra == 'llm' + - vllm>=0.9.2 ; extra == 'llm' + - typer ; extra == 'llm' + - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'llm' + - 
virtualenv>=20.0.24,!=20.21.1 ; extra == 'llm' + - jsonschema ; extra == 'llm' + - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'llm' + - pandas>=1.3 ; extra == 'llm' + - uvicorn[standard] ; extra == 'llm' + - ninja ; extra == 'llm' + - fsspec ; extra == 'llm' + - fastapi ; extra == 'llm' + - requests ; extra == 'llm' + - opentelemetry-sdk>=1.30.0 ; extra == 'llm' + - smart-open ; extra == 'llm' + - jsonref>=1.1.0 ; extra == 'llm' + - opentelemetry-proto ; extra == 'llm' + - pyarrow>=9.0.0 ; extra == 'llm' + - starlette ; extra == 'llm' + - async-timeout ; python_full_version < '3.11' and extra == 'llm' + - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'llm' + - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'llm' + - numpy>=1.20 ; extra == 'llm' + - watchfiles ; extra == 'llm' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda + sha256: 2d6d0c026902561ed77cd646b5021aef2d4db22e57a5b0178dfc669231e06d2c + md5: 283b96675859b20a825f8fa30f311446 + depends: + - libgcc >=13 + - ncurses >=6.5,<7.0a0 + license: GPL-3.0-only + license_family: GPL + purls: [] + size: 282480 + timestamp: 1740379431762 +- pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl + name: redis + version: 6.4.0 + sha256: f0544fa9604264e9464cdf4814e7d4830f74b165d52f2a330a760a88dd248b7f + requires_dist: + - async-timeout>=4.0.3 ; python_full_version < '3.11.3' + - hiredis>=3.2.0 ; extra == 'hiredis' + - pyjwt>=2.9.0 ; extra == 'jwt' + - cryptography>=36.0.1 ; extra == 'ocsp' + - pyopenssl>=20.0.1 ; extra == 'ocsp' + - requests>=2.31.0 ; extra == 'ocsp' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl + name: referencing + version: 0.36.2 + sha256: e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 + requires_dist: + - attrs>=22.2.0 + - rpds-py>=0.7.0 + - typing-extensions>=4.4.0 ; python_full_version < '3.13' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl + name: requests + version: 2.32.4 + sha256: 27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c + requires_dist: + - charset-normalizer>=2,<4 + - idna>=2.5,<4 + - urllib3>=1.21.1,<3 + - certifi>=2017.4.17 + - pysocks>=1.5.6,!=1.5.7 ; extra == 'socks' + - chardet>=3.0.2,<6 ; extra == 'use-chardet-on-py3' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl + name: requests-oauthlib + version: 2.0.0 + sha256: 7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36 + requires_dist: + - oauthlib>=3.0.0 + - requests>=2.0.0 + - oauthlib[signedtoken]>=3.0.0 ; extra == 'rsa' + requires_python: '>=3.4' +- pypi: https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl + name: requests-toolbelt + version: 1.0.0 + sha256: cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 + requires_dist: + - requests>=2.0.1,<3.0.0 + requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*' +- pypi: 
https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl + name: rich + version: 13.9.4 + sha256: 6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 + requires_dist: + - ipywidgets>=7.5.1,<9 ; extra == 'jupyter' + - markdown-it-py>=2.2.0 + - pygments>=2.13.0,<3.0.0 + - typing-extensions>=4.0.0,<5.0 ; python_full_version < '3.11' + requires_python: '>=3.8.0' +- pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: rpds-py + version: 0.27.0 + sha256: be0744661afbc4099fef7f4e604e7f1ea1be1dd7284f357924af12a705cc7d5c + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl + name: rsa + version: 4.9.1 + sha256: 68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 + requires_dist: + - pyasn1>=0.1.3 + requires_python: '>=3.6,<4' +- pypi: https://files.pythonhosted.org/packages/e8/67/0c3c9179a3ad19791ef1b8f7138aa27d4578c78700551c60d9260b2c660d/ruff-0.12.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: ruff + version: 0.12.8 + sha256: 560e0cd641e45591a3e42cb50ef61ce07162b9c233786663fdce2d8557d99818 + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/ff/c7/30d13b7fd4f866ca3f30e9a6e7ae038f0c45226f6e26b3cc98d6d197f93b/s3fs-2025.7.0-py3-none-any.whl + name: s3fs + version: 2025.7.0 + sha256: b6b2d3f84b6aa1c2ba5e62e39dd9410cf54f10a2cce1ea6db1ba0d1a6bcce685 + requires_dist: + - aiobotocore>=2.5.4,<3.0.0 + - fsspec==2025.7.0 + - aiohttp!=4.0.0a0,!=4.0.0a1 + - aiobotocore[awscli]>=2.5.4,<3.0.0 ; extra == 'awscli' + - aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == 'boto3' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/6d/4f/d073e09df851cfa251ef7840007d04db3293a0482ce607d2b993926089be/s3transfer-0.13.1-py3-none-any.whl + name: s3transfer + version: 0.13.1 + sha256: a981aa7429be23fe6dfc13e80e4020057cbab622b08c0315288758d67cabc724 + requires_dist: + - botocore>=1.37.4,<2.0a0 + - botocore[crt]>=1.37.4,<2.0a0 ; extra == 'crt' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl + name: setuptools + version: 80.9.0 + sha256: 062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922 + requires_dist: + - pytest>=6,!=8.1.* ; extra == 'test' + - virtualenv>=13.0.0 ; extra == 'test' + - wheel>=0.44.0 ; extra == 'test' + - pip>=19.1 ; extra == 'test' + - packaging>=24.2 ; extra == 'test' + - jaraco-envs>=2.2 ; extra == 'test' + - pytest-xdist>=3 ; extra == 'test' + - jaraco-path>=3.7.2 ; extra == 'test' + - build[virtualenv]>=1.0.3 ; extra == 'test' + - filelock>=3.4.0 ; extra == 'test' + - ini2toml[lite]>=0.14 ; extra == 'test' + - tomli-w>=1.0.0 ; extra == 'test' + - pytest-timeout ; extra == 'test' + - pytest-perf ; sys_platform != 'cygwin' and extra == 'test' + - jaraco-develop>=7.21 ; python_full_version >= '3.9' and sys_platform != 'cygwin' and extra == 'test' + - pytest-home>=0.5 ; extra == 'test' + - pytest-subprocess ; extra == 'test' + - pyproject-hooks!=1.1 ; extra == 'test' + - jaraco-test>=5.5 ; extra == 'test' + - sphinx>=3.5 ; extra == 'doc' + - jaraco-packaging>=9.3 ; extra == 'doc' + - rst-linker>=1.9 ; extra == 'doc' + - furo ; extra == 'doc' + - 
sphinx-lint ; extra == 'doc' + - jaraco-tidelift>=1.4 ; extra == 'doc' + - pygments-github-lexers==0.0.5 ; extra == 'doc' + - sphinx-favicon ; extra == 'doc' + - sphinx-inline-tabs ; extra == 'doc' + - sphinx-reredirects ; extra == 'doc' + - sphinxcontrib-towncrier ; extra == 'doc' + - sphinx-notfound-page>=1,<2 ; extra == 'doc' + - pyproject-hooks!=1.1 ; extra == 'doc' + - towncrier<24.7 ; extra == 'doc' + - packaging>=24.2 ; extra == 'core' + - more-itertools>=8.8 ; extra == 'core' + - jaraco-text>=3.7 ; extra == 'core' + - importlib-metadata>=6 ; python_full_version < '3.10' and extra == 'core' + - tomli>=2.0.1 ; python_full_version < '3.11' and extra == 'core' + - wheel>=0.43.0 ; extra == 'core' + - platformdirs>=4.2.2 ; extra == 'core' + - jaraco-functools>=4 ; extra == 'core' + - more-itertools ; extra == 'core' + - pytest-checkdocs>=2.4 ; extra == 'check' + - pytest-ruff>=0.2.1 ; sys_platform != 'cygwin' and extra == 'check' + - ruff>=0.8.0 ; sys_platform != 'cygwin' and extra == 'check' + - pytest-cov ; extra == 'cover' + - pytest-enabler>=2.2 ; extra == 'enabler' + - pytest-mypy ; extra == 'type' + - mypy==1.14.* ; extra == 'type' + - importlib-metadata>=7.0.2 ; python_full_version < '3.10' and extra == 'type' + - jaraco-develop>=7.21 ; sys_platform != 'cygwin' and extra == 'type' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda + sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d + md5: 3339e3b65d58accf4ca4fb8748ab16b3 + depends: + - python >=3.9 + - python + license: MIT + license_family: MIT + purls: + - pkg:pypi/six?source=hash-mapping + size: 18455 + timestamp: 1753199211006 +- pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl + name: smart-open + version: 7.3.0.post1 + sha256: c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4 + requires_dist: + - wrapt + - boto3 ; extra == 's3' + - google-cloud-storage>=2.6.0 ; extra == 'gcs' + - azure-storage-blob ; extra == 'azure' + - azure-common ; extra == 'azure' + - azure-core ; extra == 'azure' + - requests ; extra == 'http' + - requests ; extra == 'webhdfs' + - paramiko ; extra == 'ssh' + - zstandard ; extra == 'zst' + - smart-open[azure,gcs,http,s3,ssh,webhdfs,zst] ; extra == 'all' + - smart-open[all] ; extra == 'test' + - moto[server] ; extra == 'test' + - responses ; extra == 'test' + - pytest ; extra == 'test' + - pytest-rerunfailures ; extra == 'test' + - pytest-benchmark ; extra == 'test' + - awscli ; extra == 'test' + - pyopenssl ; extra == 'test' + - numpy ; extra == 'test' + requires_python: '>=3.8,<4.0' +- pypi: https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl + name: sortedcontainers + version: 2.4.0 + sha256: a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0 +- pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl + name: stack-data + version: 0.6.3 + sha256: d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 + requires_dist: + - executing>=1.2.0 + - asttokens>=2.1.0 + - pure-eval + - pytest ; extra == 'tests' + - typeguard ; extra == 'tests' + - pygments ; extra == 'tests' + - littleutils ; extra == 'tests' + - cython ; extra == 'tests' +- pypi: 
https://files.pythonhosted.org/packages/96/7c/a81ef5ef10978dd073a854e0fa93b5d8021d0594b639cc8f6453c3c78a1d/strictyaml-1.7.3-py3-none-any.whl + name: strictyaml + version: 1.7.3 + sha256: fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7 + requires_dist: + - python-dateutil>=2.6.0 + requires_python: '>=3.7.0' +- pypi: https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl + name: tenacity + version: 9.1.2 + sha256: f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138 + requires_dist: + - reno ; extra == 'doc' + - sphinx ; extra == 'doc' + - pytest ; extra == 'test' + - tornado>=4.5 ; extra == 'test' + - typeguard ; extra == 'test' + requires_python: '>=3.9' +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda + sha256: a84ff687119e6d8752346d1d408d5cf360dee0badd487a472aa8ddedfdc219e1 + md5: a0116df4f4ed05c303811a837d5b39d8 + depends: + - __glibc >=2.17,<3.0.a0 + - libgcc >=13 + - libzlib >=1.3.1,<2.0a0 + license: TCL + license_family: BSD + purls: [] + size: 3285204 + timestamp: 1748387766691 +- pypi: https://files.pythonhosted.org/packages/f9/41/fb15f06e33d7430ca89420283a8762a4e6b8025b800ea51796ab5e6d9559/tornado-6.5.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: tornado + version: 6.5.2 + sha256: e792706668c87709709c18b353da1f7662317b563ff69f00bab83595940c7108 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl + name: tqdm + version: 4.67.1 + sha256: 26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 + requires_dist: + - colorama ; sys_platform == 'win32' + - pytest>=6 ; extra == 'dev' + - pytest-cov ; extra == 'dev' + - pytest-timeout ; extra == 'dev' + - pytest-asyncio>=0.24 ; extra == 'dev' + - nbval ; extra == 'dev' + - requests ; extra == 'discord' + - slack-sdk ; extra == 'slack' + - requests ; extra == 'telegram' + - ipywidgets>=6 ; extra == 'notebook' + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl + name: traitlets + version: 5.14.3 + sha256: b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f + requires_dist: + - myst-parser ; extra == 'docs' + - pydata-sphinx-theme ; extra == 'docs' + - sphinx ; extra == 'docs' + - argcomplete>=3.0.3 ; extra == 'test' + - mypy>=1.7.0 ; extra == 'test' + - pre-commit ; extra == 'test' + - pytest-mock ; extra == 'test' + - pytest-mypy-testing ; extra == 'test' + - pytest>=7.0,<8.2 ; extra == 'test' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl + name: typing-extensions + version: 4.14.1 + sha256: d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl + name: typing-inspection + version: 0.4.1 + sha256: 389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 + requires_dist: + - typing-extensions>=4.12.0 + requires_python: '>=3.9' +- pypi: 
https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl + name: tzdata + version: '2025.2' + sha256: 1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 + requires_python: '>=2' +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda + sha256: 5aaa366385d716557e365f0a4e9c3fca43ba196872abbbe3d56bb610d131e192 + md5: 4222072737ccff51314b5ece9c7d6f5a + license: LicenseRef-Public-Domain + purls: [] + size: 122968 + timestamp: 1742727099393 +- pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl + name: urllib3 + version: 2.5.0 + sha256: e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc + requires_dist: + - brotli>=1.0.9 ; platform_python_implementation == 'CPython' and extra == 'brotli' + - brotlicffi>=0.8.0 ; platform_python_implementation != 'CPython' and extra == 'brotli' + - h2>=4,<5 ; extra == 'h2' + - pysocks>=1.5.6,!=1.5.7,<2.0 ; extra == 'socks' + - zstandard>=0.18.0 ; extra == 'zstd' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl + name: virtualenv + version: 20.33.1 + sha256: 07c19bc66c11acab6a5958b815cbcee30891cd1c2ccf53785a28651a0d8d8a67 + requires_dist: + - distlib>=0.3.7,<1 + - filelock>=3.12.2,<4 + - importlib-metadata>=6.6 ; python_full_version < '3.8' + - platformdirs>=3.9.1,<5 + - furo>=2023.7.26 ; extra == 'docs' + - proselint>=0.13 ; extra == 'docs' + - sphinx>=7.1.2,!=7.3 ; extra == 'docs' + - sphinx-argparse>=0.4 ; extra == 'docs' + - sphinxcontrib-towncrier>=0.2.1a0 ; extra == 'docs' + - towncrier>=23.6 ; extra == 'docs' + - covdefaults>=2.3 ; extra == 'test' + - coverage-enable-subprocess>=1 ; extra == 'test' + - coverage>=7.2.7 ; extra == 'test' + - flaky>=3.7 ; extra == 'test' + - packaging>=23.1 ; extra == 'test' + - pytest-env>=0.8.2 ; extra == 'test' + - pytest-freezer>=0.4.8 ; (python_full_version >= '3.13' and platform_python_implementation == 'CPython' and sys_platform == 'win32' and extra == 'test') or (platform_python_implementation == 'GraalVM' and extra == 'test') or (platform_python_implementation == 'PyPy' and extra == 'test') + - pytest-mock>=3.11.1 ; extra == 'test' + - pytest-randomly>=3.12 ; extra == 'test' + - pytest-timeout>=2.1 ; extra == 'test' + - pytest>=7.4 ; extra == 'test' + - setuptools>=68 ; extra == 'test' + - time-machine>=2.10 ; platform_python_implementation == 'CPython' and extra == 'test' + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl + name: watchdog + version: 6.0.0 + sha256: 20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2 + requires_dist: + - pyyaml>=3.10 ; extra == 'watchmedo' + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl + name: wcwidth + version: 0.2.13 + sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 + requires_dist: + - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' +- pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl + name: 
widgetsnbextension + version: 4.0.14 + sha256: 4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575 + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl + name: wrapt + version: 1.17.3 + sha256: 6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277 + requires_python: '>=3.8' +- pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: xxhash + version: 3.5.0 + sha256: 07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb + requires_python: '>=3.7' +- pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + name: yarl + version: 1.20.1 + sha256: d1a4fbb50e14396ba3d375f68bfe02215d8e7bc3ec49da8341fe3157f59d2ff5 + requires_dist: + - idna>=2.0 + - multidict>=4.0 + - propcache>=0.2.1 + requires_python: '>=3.9' +- pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl + name: zipp + version: 3.23.0 + sha256: 071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e + requires_dist: + - pytest>=6,!=8.1.* ; extra == 'test' + - jaraco-itertools ; extra == 'test' + - jaraco-functools ; extra == 'test' + - more-itertools ; extra == 'test' + - big-o ; extra == 'test' + - pytest-ignore-flaky ; extra == 'test' + - jaraco-test ; extra == 'test' + - sphinx>=3.5 ; extra == 'doc' + - jaraco-packaging>=9.3 ; extra == 'doc' + - rst-linker>=1.9 ; extra == 'doc' + - furo ; extra == 'doc' + - sphinx-lint ; extra == 'doc' + - jaraco-tidelift>=1.4 ; extra == 'doc' + - pytest-checkdocs>=2.4 ; extra == 'check' + - pytest-ruff>=0.2.1 ; sys_platform != 'cygwin' and extra == 'check' + - pytest-cov ; extra == 'cover' + - pytest-enabler>=2.2 ; extra == 'enabler' + - pytest-mypy ; extra == 'type' + requires_python: '>=3.9' diff --git a/pyproject.toml b/pyproject.toml index 5ac2066..8acc2cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "pyarrow>=20.0.0", "polars>=1.31.0", "beartype>=0.21.0", + "deltalake>=1.0.2", ] readme = "README.md" requires-python = ">=3.12.0" @@ -62,5 +63,23 @@ dev = [ "ruff>=0.11.11", "s3fs>=2025.7.0", "tqdm>=4.67.1", - "unitycatalog-client>=0.3.0", ] + +[tool.pixi.workspace] +channels = ["conda-forge"] +platforms = ["linux-64"] + +[tool.pixi.pypi-dependencies] +orcapod = { path = ".", editable = true } + +[tool.pixi.environments] +default = { solve-group = "default" } +all = { features = ["all", "redis", "ray"], solve-group = "default" } +dev = { features = ["dev"], solve-group = "default" } +ray = { features = ["ray"], solve-group = "default" } +redis = { features = ["redis"], solve-group = "default" } + +[tool.pixi.tasks] + +[tool.pixi.dependencies] +python = "3.13.*" diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index f77dd89..6709db2 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -7,7 +7,7 @@ from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.data.system_constants import constants -from orcapod.types import TypeSpec, typespec_utils 
+from orcapod.types import TypeSpec from orcapod.types.core import DataValue from orcapod.utils import arrow_utils @@ -76,6 +76,9 @@ def __init__( "Table must contain exactly one row to be a valid datagram." ) + # normalize the table to large data types (for Polars compatibility) + table = arrow_utils.normalize_table_to_large_types(table) + # Split table into data, meta, and context components context_columns = ( [constants.CONTEXT_KEY] @@ -527,7 +530,9 @@ def with_meta_columns(self, **meta_updates: DataValue) -> Self: # Create new meta table new_datagram._meta_table = ( - pa.Table.from_pylist([meta_dict]) if meta_dict else None + self._data_context.type_converter.python_dicts_to_arrow_table([meta_dict]) + if meta_dict + else None ) return new_datagram @@ -678,10 +683,13 @@ def update(self, **updates: DataValue) -> Self: new_datagram = self.copy(include_cache=False) - updates_typespec = {k: v for k, v in self.types().items() if k in updates} + # use existing schema + sub_schema = arrow_utils.schema_select( + new_datagram._data_table.schema, list(updates.keys()) + ) update_table = self._data_context.type_converter.python_dicts_to_arrow_table( - [updates], python_schema=updates_typespec + [updates], arrow_schema=sub_schema ) new_datagram._data_table = arrow_utils.hstack_tables( @@ -727,14 +735,10 @@ def with_columns( new_datagram = self.copy() # TODO: consider simplifying this conversion logic - # prepare update's table - typespec: dict[str, type] = typespec_utils.get_typespec_from_dict( - updates, column_types - ) # type: ignore[assignment] # TODO: cleanup the handling of typespec python schema and various conversion points new_data_table = self._data_context.type_converter.python_dicts_to_arrow_table( - [updates], python_schema=typespec + [updates], python_schema=dict(column_types) if column_types else None ) # perform in-place update @@ -758,15 +762,10 @@ def with_context_key(self, new_context_key: str) -> Self: """ # TODO: consider if there is a more efficient way to handle context # Combine all tables for reconstruction - combined_table = self._data_table - if self._meta_table is not None: - combined_table = arrow_utils.hstack_tables(combined_table, self._meta_table) - return self.__class__( - table=combined_table, - data_context=new_context_key, - # Note: semantic_converter will be rebuilt for new context - ) + new_datagram = self.copy(include_cache=False) + new_datagram._data_context = contexts.resolve_context(new_context_key) + return new_datagram # 8. Utility Operations def copy(self, include_cache: bool = True) -> Self: diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py index 7273095..1e07ece 100644 --- a/src/orcapod/protocols/data_protocols.py +++ b/src/orcapod/protocols/data_protocols.py @@ -1,6 +1,6 @@ from collections.abc import Collection, Iterator, Mapping, Callable from datetime import datetime -from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING +from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING, runtime_checkable from orcapod.protocols.hashing_protocols import ContentIdentifiable from orcapod.types import DataValue, TypeSpec @@ -11,6 +11,7 @@ import pandas as pd +@runtime_checkable class ExecutionEngine(Protocol): @property def name(self) -> str: ... @@ -30,6 +31,7 @@ async def submit_async(self, function: Callable, *args, **kwargs) -> Any: ... +@runtime_checkable class Datagram(Protocol): """ Protocol for immutable datagram containers in Orcapod. 
@@ -633,6 +635,7 @@ def __repr__(self) -> str: ... +@runtime_checkable class Tag(Datagram, Protocol): """ Metadata associated with each data item in a stream. @@ -879,6 +882,7 @@ def system_tags(self) -> dict[str, DataValue]: ... +@runtime_checkable class Packet(Datagram, Protocol): """ The actual data payload in a stream. @@ -1145,6 +1149,7 @@ def with_source_info( ... +@runtime_checkable class PodFunction(Protocol): """ A function suitable for use in a FunctionPod. @@ -1184,6 +1189,7 @@ def __call__(self, **kwargs: DataValue) -> None | DataValue: ... +@runtime_checkable class Labelable(Protocol): """ Protocol for objects that can have a human-readable label. @@ -1215,6 +1221,7 @@ def label(self) -> str | None: ... +@runtime_checkable class Stream(ContentIdentifiable, Labelable, Protocol): """ Base protocol for all streams in Orcapod. @@ -1550,6 +1557,7 @@ def map_packets( ... +@runtime_checkable class LiveStream(Stream, Protocol): """ A stream that automatically stays up-to-date with its upstream dependencies. @@ -1622,6 +1630,7 @@ def invalidate(self) -> None: ... +@runtime_checkable class Kernel(ContentIdentifiable, Labelable, Protocol): """ The fundamental unit of computation in Orcapod. @@ -1815,6 +1824,7 @@ def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: ... +@runtime_checkable class Pod(Kernel, Protocol): """ Specialized kernel for packet-level processing with advanced caching. @@ -1940,6 +1950,7 @@ def call( ... +@runtime_checkable class CachedPod(Pod, Protocol): async def async_call( self, @@ -2012,6 +2023,7 @@ def get_all_records( ... +@runtime_checkable class Source(Kernel, Stream, Protocol): """ Entry point for data into the computational graph. @@ -2078,6 +2090,7 @@ def as_polars_df(self, sort_by_tags: bool = False) -> "pl.DataFrame | None": ... def as_pandas_df(self, sort_by_tags: bool = False) -> "pd.DataFrame | None": ... +@runtime_checkable class Tracker(Protocol): """ Records kernel invocations and stream creation for computational graph tracking. @@ -2176,6 +2189,7 @@ def record_pod_invocation( ... +@runtime_checkable class TrackerManager(Protocol): """ Manages multiple trackers and coordinates their activity. diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index 412a1a9..0433646 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -233,22 +233,37 @@ def python_dicts_to_arrow_table( self, python_dicts: list[dict[str, Any]], python_schema: dict[str, type] | None = None, + arrow_schema: "pa.Schema | None" = None, ) -> pa.Table: """ Convert a list of Python dictionaries to an Arrow table. This uses the main conversion logic and caches results for performance. """ - if python_schema is None: + if python_schema is not None and arrow_schema is not None: + logger.warning( + "Both Python and Arrow schemas are provided. If they are not compatible, this may lead to unexpected behavior." 
+ ) + if python_schema is None and arrow_schema is None: + # Infer schema from data if not provided python_schema = infer_schema_from_pylist_data(python_dicts) + if arrow_schema is None: + # Convert to Arrow schema + assert python_schema is not None, "Python schema should not be None here" + arrow_schema = self.python_schema_to_arrow_schema(python_schema) + + if python_schema is None: + assert arrow_schema is not None, ( + "Arrow schema should not be None if reaching here" + ) + python_schema = self.arrow_schema_to_python_schema(arrow_schema) + struct_dicts = self.python_dicts_to_struct_dicts( python_dicts, python_schema=python_schema ) - # Convert to Arrow schema - arrow_schema = self.python_schema_to_arrow_schema(python_schema) - + # TODO: add more helpful message here return pa.Table.from_pylist(struct_dicts, schema=arrow_schema) def arrow_table_to_python_dicts( @@ -581,7 +596,8 @@ def _create_python_to_arrow_converter( """Create a cached conversion function for Python → Arrow values.""" # Get the Arrow type for this Python type - arrow_type = self.python_type_to_arrow_type(python_type) + # TODO: check if this step is necessary + _ = self.python_type_to_arrow_type(python_type) # Check for semantic type first if self.semantic_registry: @@ -785,109 +801,6 @@ def get_cache_stats(self) -> dict[str, int]: } -# def infer_schema_from_pylist_data(data: list[dict]) -> dict[str, type]: -# """ -# Infer schema from sample data (best effort). - -# Args: -# data: List of sample dictionaries - -# Returns: -# Dictionary mapping field names to inferred Python types - -# Note: This is best-effort inference and may not handle all edge cases. -# For production use, explicit schemas are recommended. -# """ -# if not data: -# return {} - -# schema = {} - -# # Get all possible field names -# # use list to preserve order of appearance as much as possible -# all_fields = [] -# for record in data: -# all_fields.extend(record.keys()) - -# all_fields = list( -# dict.fromkeys(all_fields) -# ) # Remove duplicates while preserving order - -# # Infer type for each field -# for field_name in all_fields: -# field_values = [ -# record.get(field_name) -# for record in data -# if field_name in record and record[field_name] is not None -# ] - -# if not field_values: -# schema[field_name] = Any # No non-null values found -# continue - -# # Get types of all values -# value_types = {type(v) for v in field_values} - -# if len(value_types) == 1: -# # All values have same type -# value_type = next(iter(value_types)) - -# # For containers, try to infer element types -# if value_type is list and field_values: -# # Infer list element type from first non-empty list -# for lst in field_values: -# if lst: # non-empty list -# element_types = {type(elem) for elem in lst} -# if len(element_types) == 1: -# element_type = next(iter(element_types)) -# schema[field_name] = list[element_type] -# else: -# schema[field_name] = list[Any] # Mixed types -# break -# else: -# schema[field_name] = list[Any] # All lists empty - -# elif value_type in {set, frozenset} and field_values: -# # Infer set element type from first non-empty set -# for s in field_values: -# if s: # non-empty set -# element_types = {type(elem) for elem in s} -# if len(element_types) == 1: -# element_type = next(iter(element_types)) -# schema[field_name] = set[element_type] -# else: -# schema[field_name] = set[Any] # Mixed types -# break -# else: -# schema[field_name] = set[Any] # All sets empty - -# elif value_type is dict and field_values: -# # Infer dict types from first 
non-empty dict -# for d in field_values: -# if d: # non-empty dict -# key_types = {type(k) for k in d.keys()} -# value_types = {type(v) for v in d.values()} - -# if len(key_types) == 1 and len(value_types) == 1: -# key_type = next(iter(key_types)) -# val_type = next(iter(value_types)) -# schema[field_name] = dict[key_type, val_type] -# else: -# schema[field_name] = dict[Any, Any] # Mixed types -# break -# else: -# schema[field_name] = dict[Any, Any] # All dicts empty - -# else: -# schema[field_name] = value_type - -# else: -# # Mixed types - use Union or Any -# schema[field_name] = Any - -# return schema - - # Public API functions def python_type_to_arrow_type( python_type: type, data_context: DataContext | str | None = None diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index b7be792..8e5bf9c 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -15,6 +15,56 @@ pa = LazyModule("pyarrow") +def schema_select( + arrow_schema: "pa.Schema", + column_names: Collection[str], + ignore_missing: bool = False, +) -> "pa.Schema": + """ + Select a subset of columns from a PyArrow schema. + + Args: + arrow_schema: The original PyArrow schema. + column_names: A collection of column names to select. + + Returns: + A new PyArrow schema containing only the selected columns. + """ + if not ignore_missing: + existing_columns = set(field.name for field in arrow_schema) + missing_columns = set(column_names) - existing_columns + if missing_columns: + raise KeyError(f"Missing columns in Arrow schema: {missing_columns}") + selected_fields = [field for field in arrow_schema if field.name in column_names] + return pa.schema(selected_fields) + + +def schema_drop( + arrow_schema: "pa.Schema", + column_names: Collection[str], + ignore_missing: bool = False, +) -> "pa.Schema": + """ + Drop a subset of columns from a PyArrow schema. + + Args: + arrow_schema: The original PyArrow schema. + column_names: A collection of column names to drop. + + Returns: + A new PyArrow schema containing only the remaining columns. + """ + if not ignore_missing: + existing_columns = set(field.name for field in arrow_schema) + missing_columns = set(column_names) - existing_columns + if missing_columns: + raise KeyError(f"Missing columns in Arrow schema: {missing_columns}") + remaining_fields = [ + field for field in arrow_schema if field.name not in column_names + ] + return pa.schema(remaining_fields) + + def normalize_to_large_types(arrow_type: "pa.DataType") -> "pa.DataType": """ Recursively convert Arrow types to their large variants where available. diff --git a/tests/test_data/test_datagrams/test_arrow_datagram.py b/tests/test_data/test_datagrams/test_arrow_datagram.py index 44fe537..a3da84b 100644 --- a/tests/test_data/test_datagrams/test_arrow_datagram.py +++ b/tests/test_data/test_datagrams/test_arrow_datagram.py @@ -11,18 +11,19 @@ - Context operations - Utility operations """ +# Verified by Edgar Y. 
Walker +from typing import cast import pytest import pyarrow as pa from datetime import datetime, date from orcapod.data.datagrams import ArrowDatagram from orcapod.data.system_constants import constants +from orcapod.protocols.data_protocols import Datagram class TestArrowDatagramInitialization: - """Test ArrowDatagram initialization and basic properties.""" - def test_basic_initialization(self): """Test basic initialization with PyArrow table.""" table = pa.Table.from_pydict( @@ -49,6 +50,13 @@ def test_initialization_empty_table_fails(self): with pytest.raises(ValueError, match="exactly one row"): ArrowDatagram(table) + def test_string_type_initialization(self) -> None: + """Initializing with a pa.string() table should yield a table with pa.large_string()""" + table = pa.Table.from_pydict({"name": ["John"]}) + datagram = ArrowDatagram(table) + # TODO: fix this type annotation mistake in the pyi of pyarrow-stubs + assert datagram._data_table.schema[0].type == pa.large_string() # type: ignore + def test_initialization_with_meta_info(self): """Test initialization with meta information.""" table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) @@ -59,6 +67,10 @@ def test_initialization_with_meta_info(self): assert datagram["user_id"] == 123 assert datagram.get_meta_value("pipeline_version") == "v1.0" assert datagram.get_meta_value("timestamp") == "2024-01-01" + assert ( + f"{constants.META_PREFIX}pipeline_version" + in datagram.as_table(include_meta_columns=True).column_names + ) def test_initialization_with_context_in_table(self): """Test initialization when context is included in table.""" @@ -81,8 +93,8 @@ def test_initialization_with_meta_columns_in_table(self): { "user_id": [123], "name": ["Alice"], - "__version": ["1.0"], - "__timestamp": ["2024-01-01"], + f"{constants.META_PREFIX}version": ["1.0"], + f"{constants.META_PREFIX}timestamp": ["2024-01-01"], } ) @@ -96,14 +108,17 @@ def test_initialization_with_explicit_context(self): """Test initialization with explicit data context.""" table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) - datagram = ArrowDatagram(table, data_context="v0.1") + datagram = ArrowDatagram(table, data_context="std:v0.1:default") assert datagram.data_context_key == "std:v0.1:default" def test_initialization_no_data_columns_fails(self): """Test initialization with no data columns fails.""" table = pa.Table.from_pydict( - {"__version": ["1.0"], constants.CONTEXT_KEY: ["v0.1"]} + { + f"{constants.META_PREFIX}version": ["1.0"], + constants.CONTEXT_KEY: ["std:v0.1:default"], + } ) with pytest.raises(ValueError, match="at least one data column"): @@ -117,7 +132,13 @@ class TestArrowDatagramDictInterface: def sample_datagram(self): """Create a sample datagram for testing.""" table = pa.Table.from_pydict( - {"user_id": [123], "name": ["Alice"], "score": [85.5], "active": [True]} + { + "user_id": [123], + "name": ["Alice"], + "score": [85.5], + "active": [True], + f"{constants.META_PREFIX}version": ["1.0"], + } ) return ArrowDatagram(table) @@ -142,6 +163,7 @@ def test_contains(self, sample_datagram): def test_iter(self, sample_datagram): """Test __iter__ method.""" keys = list(sample_datagram) + # this should not include the meta column expected_keys = ["user_id", "name", "score", "active"] assert set(keys) == set(expected_keys) @@ -152,6 +174,29 @@ def test_get(self, sample_datagram): assert sample_datagram.get("nonexistent", "default") == "default" +class TestArrowDatagramProtocolAdherance: + @pytest.fixture + def basic_datagram(self) -> 
ArrowDatagram: + table = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "score": [85.5], + "active": [True], + f"{constants.META_PREFIX}version": ["1.0"], + } + ) + return ArrowDatagram(table) + + def test_is_instance_of_datagram(self, basic_datagram): + # ArrowDatagram should be an instance of Datagram protocol + assert isinstance(basic_datagram, Datagram) + + # verify that it is NOT possible to check for inheritance + with pytest.raises(TypeError): + issubclass(ArrowDatagram, Datagram) + + class TestArrowDatagramStructuralInfo: """Test structural information methods.""" @@ -162,8 +207,8 @@ def datagram_with_meta(self): { "user_id": [123], "name": ["Alice"], - "__version": ["1.0"], - "__pipeline_id": ["test_pipeline"], + f"{constants.META_PREFIX}version": ["1.0"], + f"{constants.META_PREFIX}pipeline_id": ["test_pipeline"], } ) return ArrowDatagram(table) @@ -177,7 +222,12 @@ def test_keys_data_only(self, datagram_with_meta): def test_keys_with_meta_columns(self, datagram_with_meta): """Test keys method including meta columns.""" keys = datagram_with_meta.keys(include_meta_columns=True) - expected = ("user_id", "name", "__version", "__pipeline_id") + expected = ( + "user_id", + "name", + f"{constants.META_PREFIX}version", + f"{constants.META_PREFIX}pipeline_id", + ) assert set(keys) == set(expected) def test_keys_with_context(self, datagram_with_meta): @@ -192,16 +242,30 @@ def test_keys_with_all_info(self, datagram_with_meta): expected = ( "user_id", "name", - "__version", - "__pipeline_id", + f"{constants.META_PREFIX}version", + f"{constants.META_PREFIX}pipeline_id", constants.CONTEXT_KEY, ) assert set(keys) == set(expected) def test_keys_with_specific_meta_prefix(self, datagram_with_meta): """Test keys method with specific meta column prefixes.""" - keys = datagram_with_meta.keys(include_meta_columns=["__version"]) - expected = ("user_id", "name", "__version") + keys = datagram_with_meta.keys( + include_meta_columns=[f"{constants.META_PREFIX}version"] + ) + expected = ("user_id", "name", f"{constants.META_PREFIX}version") + assert set(keys) == set(expected) + + def test_keys_with_nonexistent_meta_prefix(self, datagram_with_meta): + """Test keys method when called with non-existent meta column prefixes.""" + # non-existing prefix should be ignored + keys = datagram_with_meta.keys( + include_meta_columns=[ + f"{constants.META_PREFIX}nonexistent", + f"{constants.META_PREFIX}version", + ] + ) + expected = ("user_id", "name", f"{constants.META_PREFIX}version") assert set(keys) == set(expected) def test_types_data_only(self, datagram_with_meta): @@ -215,7 +279,12 @@ def test_types_data_only(self, datagram_with_meta): def test_types_with_meta_columns(self, datagram_with_meta): """Test types method including meta columns.""" types = datagram_with_meta.types(include_meta_columns=True) - expected_keys = {"user_id", "name", "__version", "__pipeline_id"} + expected_keys = { + "user_id", + "name", + f"{constants.META_PREFIX}version", + f"{constants.META_PREFIX}pipeline_id", + } assert set(types.keys()) == expected_keys def test_types_with_context(self, datagram_with_meta): @@ -234,7 +303,12 @@ def test_arrow_schema_data_only(self, datagram_with_meta): def test_arrow_schema_with_meta_columns(self, datagram_with_meta): """Test arrow_schema method including meta columns.""" schema = datagram_with_meta.arrow_schema(include_meta_columns=True) - expected_names = {"user_id", "name", "__version", "__pipeline_id"} + expected_names = { + "user_id", + "name", + 
f"{constants.META_PREFIX}version", + f"{constants.META_PREFIX}pipeline_id", + } assert set(schema.names) == expected_names def test_arrow_schema_with_context(self, datagram_with_meta): @@ -253,6 +327,31 @@ def test_content_hash(self, datagram_with_meta): assert isinstance(hash1, str) assert len(hash1) > 0 + def test_content_hash_same_data_different_meta_data(self): + """Test that the content hash is the same for identical data with different meta data.""" + table1 = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "__version": ["1.0"], + "__pipeline_id": ["pipeline_1"], + } + ) + table2 = pa.Table.from_pydict( + { + "user_id": [123], + "name": ["Alice"], + "__version": ["1.1"], + "__pipeline_id": ["pipeline_2"], + } + ) + datagram1 = ArrowDatagram(table1) + datagram2 = ArrowDatagram(table2) + hash1 = datagram1.content_hash() + hash2 = datagram2.content_hash() + + assert hash1 == hash2 + def test_content_hash_different_data(self): """Test that different data produces different hashes.""" table1 = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) @@ -277,8 +376,8 @@ def datagram_with_all(self): { "user_id": [123], "name": ["Alice"], - "__version": ["1.0"], - constants.CONTEXT_KEY: ["v0.1"], + f"{constants.META_PREFIX}version": ["1.0"], + constants.CONTEXT_KEY: ["std:v0.1:default"], } ) return ArrowDatagram(table) @@ -292,7 +391,11 @@ def test_as_dict_data_only(self, datagram_with_all): def test_as_dict_with_meta_columns(self, datagram_with_all): """Test as_dict method including meta columns.""" result = datagram_with_all.as_dict(include_meta_columns=True) - expected = {"user_id": 123, "name": "Alice", "__version": "1.0"} + expected = { + "user_id": 123, + "name": "Alice", + f"{constants.META_PREFIX}version": "1.0", + } assert result == expected def test_as_dict_with_context(self, datagram_with_all): @@ -308,13 +411,17 @@ def test_as_dict_with_context(self, datagram_with_all): def test_as_dict_with_all_info(self, datagram_with_all): """Test as_dict method including all information.""" result = datagram_with_all.as_dict(include_all_info=True) + all_placed = datagram_with_all.as_dict( + include_meta_columns=True, include_context=True + ) expected = { "user_id": 123, "name": "Alice", - "__version": "1.0", + f"{constants.META_PREFIX}version": "1.0", constants.CONTEXT_KEY: "std:v0.1:default", } assert result == expected + assert result == all_placed def test_as_table_data_only(self, datagram_with_all): """Test as_table method with data columns only.""" @@ -330,8 +437,9 @@ def test_as_table_with_meta_columns(self, datagram_with_all): table = datagram_with_all.as_table(include_meta_columns=True) assert len(table) == 1 - expected_columns = {"user_id", "name", "__version"} + expected_columns = {"user_id", "name", f"{constants.META_PREFIX}version"} assert set(table.column_names) == expected_columns + assert table[f"{constants.META_PREFIX}version"].to_pylist() == ["1.0"] def test_as_table_with_context(self, datagram_with_all): """Test as_table method including context.""" @@ -340,10 +448,12 @@ def test_as_table_with_context(self, datagram_with_all): assert len(table) == 1 expected_columns = {"user_id", "name", constants.CONTEXT_KEY} assert set(table.column_names) == expected_columns + assert table[constants.CONTEXT_KEY].to_pylist() == ["std:v0.1:default"] def test_as_arrow_compatible_dict(self, datagram_with_all): """Test as_arrow_compatible_dict method.""" result = datagram_with_all.as_arrow_compatible_dict() + # TODO: add test case including complex data types # Should 
have same keys as as_dict dict_result = datagram_with_all.as_dict() @@ -360,8 +470,8 @@ def datagram_with_meta(self): { "user_id": [123], "name": ["Alice"], - "__version": ["1.0"], - "__pipeline_id": ["test"], + f"{constants.META_PREFIX}version": ["1.0"], + f"{constants.META_PREFIX}pipeline_id": ["test"], } ) return ArrowDatagram(table) @@ -369,13 +479,19 @@ def datagram_with_meta(self): def test_meta_columns_property(self, datagram_with_meta): """Test meta_columns property.""" meta_cols = datagram_with_meta.meta_columns - expected = ("__version", "__pipeline_id") + expected = ( + f"{constants.META_PREFIX}version", + f"{constants.META_PREFIX}pipeline_id", + ) assert set(meta_cols) == set(expected) def test_get_meta_value(self, datagram_with_meta): """Test get_meta_value method.""" # With prefix - assert datagram_with_meta.get_meta_value("__version") == "1.0" + assert ( + datagram_with_meta.get_meta_value(f"{constants.META_PREFIX}version") + == "1.0" + ) # Without prefix assert datagram_with_meta.get_meta_value("version") == "1.0" @@ -387,7 +503,7 @@ def test_with_meta_columns(self, datagram_with_meta): """Test with_meta_columns method.""" updated = datagram_with_meta.with_meta_columns( version="2.0", # Update existing - new_meta="new_value", # Add new + new_meta=3.5, # Add new ) # Original should be unchanged @@ -395,7 +511,16 @@ def test_with_meta_columns(self, datagram_with_meta): # Updated should have new values assert updated.get_meta_value("version") == "2.0" - assert updated.get_meta_value("new_meta") == "new_value" + assert updated.get_meta_value("new_meta") == 3.5 + + # meta data should be available as meta-prefixed column + table_with_meta = updated.as_table(include_meta_columns=True) + assert table_with_meta[f"{constants.META_PREFIX}version"].to_pylist() == ["2.0"] + assert table_with_meta[f"{constants.META_PREFIX}new_meta"].to_pylist() == [3.5] + + assert ( + table_with_meta[f"{constants.META_PREFIX}version"].type == pa.large_string() + ) # Data should be preserved assert updated["user_id"] == 123 @@ -403,7 +528,9 @@ def test_with_meta_columns(self, datagram_with_meta): def test_with_meta_columns_prefixed_keys(self, datagram_with_meta): """Test with_meta_columns method with prefixed keys.""" - updated = datagram_with_meta.with_meta_columns(__version="2.0") + updated = datagram_with_meta.with_meta_columns( + **{f"{constants.META_PREFIX}version": "2.0"} + ) assert updated.get_meta_value("version") == "2.0" @@ -414,7 +541,7 @@ def test_drop_meta_columns(self, datagram_with_meta): # Original should be unchanged assert datagram_with_meta.get_meta_value("version") == "1.0" - # Updated should not have dropped column + # Updated should not have dropped other metadata columns assert updated.get_meta_value("version") is None assert updated.get_meta_value("pipeline_id") == "test" @@ -423,7 +550,9 @@ def test_drop_meta_columns(self, datagram_with_meta): def test_drop_meta_columns_prefixed(self, datagram_with_meta): """Test drop_meta_columns method with prefixed keys.""" - updated = datagram_with_meta.drop_meta_columns("__version") + updated = datagram_with_meta.drop_meta_columns( + f"{constants.META_PREFIX}version" + ) assert updated.get_meta_value("version") is None @@ -431,6 +560,10 @@ def test_drop_meta_columns_multiple(self, datagram_with_meta): """Test dropping multiple meta columns.""" updated = datagram_with_meta.drop_meta_columns("version", "pipeline_id") + # original should not be modified + assert datagram_with_meta.get_meta_value("version") == "1.0" + assert 
datagram_with_meta.get_meta_value("pipeline_id") == "test" + assert updated.get_meta_value("version") is None assert updated.get_meta_value("pipeline_id") is None @@ -459,22 +592,35 @@ class TestArrowDatagramDataOperations: def sample_datagram(self): """Create a sample datagram for testing.""" table = pa.Table.from_pydict( - {"user_id": [123], "name": ["Alice"], "score": [85.5], "active": [True]} + { + "user_id": [123], + "name": ["Alice"], + "score": [85.5], + "active": [True], + f"{constants.META_PREFIX}version": ["1.0"], + f"{constants.META_PREFIX}pipeline_id": ["test"], + } ) return ArrowDatagram(table) - def test_select(self, sample_datagram): + def test_select(self, sample_datagram: ArrowDatagram): """Test select method.""" selected = sample_datagram.select("user_id", "name") assert set(selected.keys()) == {"user_id", "name"} assert selected["user_id"] == 123 assert selected["name"] == "Alice" + # meta values should be copied over + assert selected.get_meta_value("version") == "1.0" + assert selected.get_meta_value("pipeline_id") == "test" + + # context should be preserved + assert selected.data_context_key == sample_datagram.data_context_key # Original should be unchanged assert set(sample_datagram.keys()) == {"user_id", "name", "score", "active"} - def test_select_single_column(self, sample_datagram): + def test_select_single_column(self, sample_datagram: ArrowDatagram): """Test select method with single column.""" selected = sample_datagram.select("user_id") @@ -486,7 +632,7 @@ def test_select_missing_column(self, sample_datagram): with pytest.raises(ValueError): sample_datagram.select("user_id", "nonexistent") - def test_drop(self, sample_datagram): + def test_drop(self, sample_datagram: ArrowDatagram): """Test drop method.""" dropped = sample_datagram.drop("score", "active") @@ -494,27 +640,36 @@ def test_drop(self, sample_datagram): assert dropped["user_id"] == 123 assert dropped["name"] == "Alice" + # drop should preserve context and meta values + assert dropped.get_meta_value("version") == "1.0" + assert dropped.get_meta_value("pipeline_id") == "test" + assert dropped.data_context_key == sample_datagram.data_context_key + # Original should be unchanged assert set(sample_datagram.keys()) == {"user_id", "name", "score", "active"} - def test_drop_single_column(self, sample_datagram): + def test_drop_single_column(self, sample_datagram: ArrowDatagram): """Test drop method with single column.""" dropped = sample_datagram.drop("score") + # drop should preserve context and meta values + assert dropped.get_meta_value("version") == "1.0" + assert dropped.get_meta_value("pipeline_id") == "test" + assert dropped.data_context_key == sample_datagram.data_context_key assert set(dropped.keys()) == {"user_id", "name", "active"} - def test_drop_missing_column(self, sample_datagram): + def test_drop_missing_column(self, sample_datagram: ArrowDatagram): """Test drop method with missing column raises KeyError.""" with pytest.raises(KeyError): sample_datagram.drop("nonexistent") - def test_drop_ignore_missing(self, sample_datagram): + def test_drop_ignore_missing(self, sample_datagram: ArrowDatagram): """Test drop method with ignore_missing=True.""" dropped = sample_datagram.drop("score", "nonexistent", ignore_missing=True) assert set(dropped.keys()) == {"user_id", "name", "active"} - def test_rename(self, sample_datagram): + def test_rename(self, sample_datagram: ArrowDatagram): """Test rename method.""" renamed = sample_datagram.rename({"user_id": "id", "name": "username"}) @@ -524,11 +679,16 
@@ def test_rename(self, sample_datagram): assert renamed["username"] == "Alice" assert renamed["score"] == 85.5 + # meta and context should be unaffected + assert renamed.get_meta_value("version") == "1.0" + assert renamed.get_meta_value("pipeline_id") == "test" + assert renamed.data_context_key == sample_datagram.data_context_key + # Original should be unchanged assert "user_id" in sample_datagram assert "id" not in sample_datagram - def test_rename_empty_mapping(self, sample_datagram): + def test_rename_empty_mapping(self, sample_datagram: ArrowDatagram): """Test rename method with empty mapping.""" renamed = sample_datagram.rename({}) @@ -536,7 +696,7 @@ def test_rename_empty_mapping(self, sample_datagram): assert set(renamed.keys()) == set(sample_datagram.keys()) assert renamed["user_id"] == sample_datagram["user_id"] - def test_update(self, sample_datagram): + def test_update(self, sample_datagram: ArrowDatagram): """Test update method.""" updated = sample_datagram.update(score=95.0, active=False) @@ -549,19 +709,20 @@ def test_update(self, sample_datagram): assert not updated["active"] assert updated["user_id"] == 123 # Unchanged columns preserved - def test_update_missing_column(self, sample_datagram): + def test_update_missing_column(self, sample_datagram: ArrowDatagram): """Test update method with missing column raises KeyError.""" with pytest.raises(KeyError): sample_datagram.update(nonexistent="value") - def test_update_empty(self, sample_datagram): + def test_update_empty(self, sample_datagram: ArrowDatagram): """Test update method with no updates returns same instance.""" updated = sample_datagram.update() # Should return the same instance + # TODO: reconsider if this behavior is what is specified by the protocol assert updated is sample_datagram - def test_with_columns(self, sample_datagram): + def test_with_columns(self, sample_datagram: ArrowDatagram): """Test with_columns method.""" new_datagram = sample_datagram.with_columns( department="Engineering", salary=75000 @@ -577,7 +738,7 @@ def test_with_columns(self, sample_datagram): assert new_datagram["department"] == "Engineering" assert new_datagram["salary"] == 75000 - def test_with_columns_with_types(self, sample_datagram): + def test_with_columns_with_types(self, sample_datagram: ArrowDatagram): """Test with_columns method with explicit types.""" new_datagram = sample_datagram.with_columns( column_types={"salary": int, "rate": float}, salary=75000, rate=85.5 @@ -596,6 +757,7 @@ def test_with_columns_empty(self, sample_datagram): """Test with_columns method with no columns returns same instance.""" new_datagram = sample_datagram.with_columns() + # TODO: again consider if this behavior is what's specified by protocol assert new_datagram is sample_datagram @@ -605,9 +767,9 @@ class TestArrowDatagramContextOperations: def test_with_context_key(self): """Test with_context_key method.""" table = pa.Table.from_pydict({"user_id": [123], "name": ["Alice"]}) - original_datagram = ArrowDatagram(table, data_context="v0.1") + original_datagram = ArrowDatagram(table, data_context="std:v0.1:default") - new_datagram = original_datagram.with_context_key("v0.1") + new_datagram = original_datagram.with_context_key("std:v0.1:default") # Original should be unchanged assert original_datagram.data_context_key == "std:v0.1:default" @@ -627,7 +789,11 @@ class TestArrowDatagramUtilityOperations: def sample_datagram(self): """Create a sample datagram for testing.""" table = pa.Table.from_pydict( - {"user_id": [123], "name": ["Alice"], 
"__version": ["1.0"]} + { + "user_id": [123], + "name": ["Alice"], + f"{constants.META_PREFIX}version": ["1.0"], + } ) return ArrowDatagram(table) @@ -673,7 +839,7 @@ def test_str_representation(self, sample_datagram): assert "Alice" in str_repr # Should not contain meta columns - assert "__version" not in str_repr + assert f"{constants.META_PREFIX}version" not in str_repr def test_repr_representation(self, sample_datagram): """Test repr representation.""" @@ -733,7 +899,7 @@ def test_large_string_types(self): datagram = ArrowDatagram(table) assert datagram["id"] == 123 - assert len(datagram["text"]) > 1000 + assert len(cast(str, datagram["text"])) > 1000 def test_timestamp_types(self): """Test handling of timestamp types.""" @@ -786,6 +952,23 @@ def test_duplicate_operations(self): renamed = datagram.rename({"user_id": "user_id", "name": "name"}) assert set(renamed.keys()) == set(datagram.keys()) + def test_conversion_to_large_types(self): + table = pa.Table.from_arrays( + [ + pa.array([123], type=pa.int8()), + pa.array(["A very long string " * 100], type=pa.string()), + ], + names=["id", "text"], + ) + + datagram = ArrowDatagram(table) + + returned_table = datagram.as_table() + + # integer should be preserved but string should become large_string + assert returned_table["id"].type == pa.int8() + assert returned_table["text"].type == pa.large_string() + class TestArrowDatagramIntegration: """Test integration between different operations.""" @@ -799,9 +982,10 @@ def test_chained_operations(self): "last_name": ["Smith"], "score": [85.5], "active": [True], - "__version": ["1.0"], + f"{constants.META_PREFIX}version": ["1.0"], } ) + original_keys = set(table.column_names) - {f"{constants.META_PREFIX}version"} datagram = ArrowDatagram(table) @@ -813,6 +997,11 @@ def test_chained_operations(self): .with_meta_columns(version="2.0") ) + # verify original is not modified + assert set(datagram.keys()) == original_keys + assert datagram["first_name"] == "Alice" + assert datagram["score"] == 85.5 + # Verify final state assert set(result.keys()) == {"user_id", "score", "active", "full_name"} assert result["full_name"] == "Alice Smith" @@ -821,6 +1010,9 @@ def test_chained_operations(self): def test_dict_roundtrip(self): """Test conversion to dict and back preserves data.""" + + # TODO: perform this test but using semantic types + table = pa.Table.from_pydict( {"user_id": [123], "name": ["Alice"], "score": [85.5]} ) @@ -844,8 +1036,8 @@ def test_mixed_include_options(self): { "user_id": [123], "name": ["Alice"], - "__version": ["1.0"], - "__pipeline": ["test"], + f"{constants.META_PREFIX}version": ["1.0"], + f"{constants.META_PREFIX}pipeline": ["test"], } ) @@ -859,8 +1051,10 @@ def test_mixed_include_options(self): assert dict1 == dict2 # Test specific meta prefixes - dict3 = datagram.as_dict(include_meta_columns=["__version"]) - expected_keys = {"user_id", "name", "__version"} + dict3 = datagram.as_dict( + include_meta_columns=[f"{constants.META_PREFIX}version"] + ) + expected_keys = {"user_id", "name", f"{constants.META_PREFIX}version"} assert set(dict3.keys()) == expected_keys def test_arrow_table_schema_preservation(self): @@ -885,11 +1079,10 @@ def test_arrow_table_schema_preservation(self): assert schema.field("name").type == pa.large_string() assert schema.field("score").type == pa.float32() - # Operations should preserve types - but this might not be implemented yet - # For now, let's just test that the basic schema is correct - # updated = datagram.update(score=90.0) - # updated_schema = 
updated.arrow_schema() - # assert updated_schema.field("score").type == pa.float32() + # Operations should preserve types + updated = datagram.update(score=90.0) + updated_schema = updated.arrow_schema() + assert updated_schema.field("score").type == pa.float32() class TestArrowDatagramPerformance: diff --git a/tests/test_data/test_datagrams/test_arrow_tag_packet.py b/tests/test_data/test_datagrams/test_arrow_tag_packet.py index 6fbe9d9..04844cc 100644 --- a/tests/test_data/test_datagrams/test_arrow_tag_packet.py +++ b/tests/test_data/test_datagrams/test_arrow_tag_packet.py @@ -175,6 +175,232 @@ def test_as_datagram_with_system_tags(self, sample_tag): assert "tag_type" in datagram.keys() +class TestArrowTagDataOperations: + """Test that system tags are preserved across all data operations.""" + + @pytest.fixture + def sample_tag_with_system_tags(self): + """Create a sample tag with system tags for testing operations.""" + table = pa.Table.from_pydict( + {"user_id": [123], "name": ["Alice"], "score": [85.5], "active": [True]} + ) + system_tags = { + "tag_type": "user", + "version": "1.0", + "created_by": "system", + "priority": "high", + } + return ArrowTag(table, system_tags=system_tags) + + def test_select_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that select operation preserves system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Select subset of columns + selected = sample_tag_with_system_tags.select("user_id", "name") + + # System tags should be preserved + assert selected.system_tags() == original_system_tags + assert selected.system_tags()["tag_type"] == "user" + assert selected.system_tags()["version"] == "1.0" + assert selected.system_tags()["created_by"] == "system" + assert selected.system_tags()["priority"] == "high" + + # Only selected data columns should remain + assert "user_id" in selected.keys() + assert "name" in selected.keys() + assert "score" not in selected.keys() + assert "active" not in selected.keys() + + def test_drop_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that drop operation preserves system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Drop some columns + dropped = sample_tag_with_system_tags.drop("score", "active") + + # System tags should be preserved + assert dropped.system_tags() == original_system_tags + assert dropped.system_tags()["tag_type"] == "user" + assert dropped.system_tags()["version"] == "1.0" + + # Dropped columns should be gone, others should remain + assert "user_id" in dropped.keys() + assert "name" in dropped.keys() + assert "score" not in dropped.keys() + assert "active" not in dropped.keys() + + def test_rename_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that rename operation preserves system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Rename columns + renamed = sample_tag_with_system_tags.rename( + {"user_id": "id", "name": "username"} + ) + + # System tags should be preserved + assert renamed.system_tags() == original_system_tags + assert renamed.system_tags()["tag_type"] == "user" + assert renamed.system_tags()["version"] == "1.0" + + # Data columns should be renamed + assert "id" in renamed.keys() + assert "username" in renamed.keys() + assert "user_id" not in renamed.keys() + assert "name" not in renamed.keys() + + def test_update_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that update operation preserves system tags.""" + 
original_system_tags = sample_tag_with_system_tags.system_tags() + + # Update some column values + updated = sample_tag_with_system_tags.update(name="Alice Smith", score=92.0) + + # System tags should be preserved + assert updated.system_tags() == original_system_tags + assert updated.system_tags()["tag_type"] == "user" + assert updated.system_tags()["version"] == "1.0" + + # Updated values should be reflected + assert updated["name"] == "Alice Smith" + assert updated["score"] == 92.0 + assert updated["user_id"] == 123 # Unchanged + + def test_with_columns_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that with_columns operation preserves system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Add new columns + with_new_cols = sample_tag_with_system_tags.with_columns( + email="alice@example.com", age=30, department="engineering" + ) + + # System tags should be preserved + assert with_new_cols.system_tags() == original_system_tags + assert with_new_cols.system_tags()["tag_type"] == "user" + assert with_new_cols.system_tags()["version"] == "1.0" + assert with_new_cols.system_tags()["created_by"] == "system" + assert with_new_cols.system_tags()["priority"] == "high" + + # New columns should be added + assert with_new_cols["email"] == "alice@example.com" + assert with_new_cols["age"] == 30 + assert with_new_cols["department"] == "engineering" + + # Original columns should remain + assert with_new_cols["user_id"] == 123 + assert with_new_cols["name"] == "Alice" + + def test_with_meta_columns_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that with_meta_columns operation preserves system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Add meta columns + with_meta = sample_tag_with_system_tags.with_meta_columns( + pipeline_version="v2.1.0", processed_at="2024-01-01" + ) + + # System tags should be preserved + assert with_meta.system_tags() == original_system_tags + assert with_meta.system_tags()["tag_type"] == "user" + assert with_meta.system_tags()["version"] == "1.0" + + # Meta columns should be added + assert with_meta.get_meta_value("pipeline_version") == "v2.1.0" + assert with_meta.get_meta_value("processed_at") == "2024-01-01" + + def test_drop_meta_columns_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that drop_meta_columns operation preserves system tags.""" + # First add some meta columns + with_meta = sample_tag_with_system_tags.with_meta_columns( + pipeline_version="v2.1.0", processed_at="2024-01-01" + ) + original_system_tags = with_meta.system_tags() + + # Drop meta columns + dropped_meta = with_meta.drop_meta_columns("pipeline_version") + + # System tags should be preserved + assert dropped_meta.system_tags() == original_system_tags + assert dropped_meta.system_tags()["tag_type"] == "user" + assert dropped_meta.system_tags()["version"] == "1.0" + + # Meta column should be dropped + assert dropped_meta.get_meta_value("pipeline_version") is None + assert dropped_meta.get_meta_value("processed_at") == "2024-01-01" + + def test_with_context_key_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that with_context_key operation preserves system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Change context key (note: "test" will resolve to "default" but that's expected) + new_context = sample_tag_with_system_tags.with_context_key("std:v0.1:test") + + # System tags should be preserved + assert 
new_context.system_tags() == original_system_tags + assert new_context.system_tags()["tag_type"] == "user" + assert new_context.system_tags()["version"] == "1.0" + + # Context should be different from original (even if resolved to default) + # The important thing is that the operation worked and system tags are preserved + assert new_context.data_context_key.startswith("std:v0.1:") + # Verify that this is a different object + assert new_context is not sample_tag_with_system_tags + + def test_copy_preserves_system_tags(self, sample_tag_with_system_tags): + """Test that copy operation preserves system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Copy with cache + copied_with_cache = sample_tag_with_system_tags.copy(include_cache=True) + + # Copy without cache + copied_without_cache = sample_tag_with_system_tags.copy(include_cache=False) + + # System tags should be preserved in both cases + assert copied_with_cache.system_tags() == original_system_tags + assert copied_without_cache.system_tags() == original_system_tags + + # Verify all system tags are present + for copy_obj in [copied_with_cache, copied_without_cache]: + assert copy_obj.system_tags()["tag_type"] == "user" + assert copy_obj.system_tags()["version"] == "1.0" + assert copy_obj.system_tags()["created_by"] == "system" + assert copy_obj.system_tags()["priority"] == "high" + + def test_chained_operations_preserve_system_tags(self, sample_tag_with_system_tags): + """Test that chained operations preserve system tags.""" + original_system_tags = sample_tag_with_system_tags.system_tags() + + # Chain multiple operations + result = ( + sample_tag_with_system_tags.with_columns( + full_name="Alice Smith", department="eng" + ) + .drop("score") + .update(active=False) + .rename({"user_id": "id"}) + .with_meta_columns(processed=True) + ) + + # System tags should be preserved through all operations + assert result.system_tags() == original_system_tags + assert result.system_tags()["tag_type"] == "user" + assert result.system_tags()["version"] == "1.0" + assert result.system_tags()["created_by"] == "system" + assert result.system_tags()["priority"] == "high" + + # Verify the chained operations worked + assert result["full_name"] == "Alice Smith" + assert result["department"] == "eng" + assert "score" not in result.keys() + assert result["active"] is False + assert "id" in result.keys() + assert "user_id" not in result.keys() + assert result.get_meta_value("processed") is True + + class TestArrowPacketInitialization: """Test ArrowPacket initialization and basic properties.""" @@ -658,7 +884,7 @@ def test_tag_complex_arrow_types(self): assert tag["id"] == 123 assert tag["numbers"] == [1, 2, 3] - assert tag["struct_field"]["nested"] == "value" + assert tag["struct_field"]["nested"] == "value" # type: ignore def test_packet_complex_arrow_types(self): """Test packets with complex Arrow data types.""" @@ -678,7 +904,7 @@ def test_packet_complex_arrow_types(self): assert packet["id"] == 123 assert packet["numbers"] == [1, 2, 3] - assert packet["struct_field"]["nested"] == "value" + assert packet["struct_field"]["nested"] == "value" # type: ignore def test_tag_timestamp_handling(self): """Test tag handling of timestamp types.""" diff --git a/tests/test_data/test_datagrams/test_base_integration.py b/tests/test_data/test_datagrams/test_base_integration.py index dae53c4..d0bd19f 100644 --- a/tests/test_data/test_datagrams/test_base_integration.py +++ b/tests/test_data/test_datagrams/test_base_integration.py @@ -122,7 
+122,7 @@ def test_is_abstract(self): """Test that BaseDatagram cannot be instantiated directly.""" try: # This should raise TypeError for abstract class - BaseDatagram() + BaseDatagram() # type: ignore pytest.fail("Expected TypeError for abstract class instantiation") except TypeError as e: # Expected behavior - BaseDatagram is abstract @@ -573,10 +573,10 @@ def test_consistent_operation_errors(self): def test_consistent_validation(self): """Test that validation is consistent.""" - data = {"user_id": [123], "name": ["Alice"]} # Lists for PyArrow + data = {"user_id": 123, "name": "Alice"} # Lists for PyArrow dict_datagram = DictDatagram(data) - table = pa.Table.from_pydict(data) + table = pa.Table.from_pylist([data]) arrow_datagram = ArrowDatagram(table) # Both should handle edge cases consistently diff --git a/uv.lock b/uv.lock index ec6bcb4..a50b758 100644 --- a/uv.lock +++ b/uv.lock @@ -114,18 +114,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl", hash = "sha256:3180cf304c5c712d626b9162b195b1db7ddf976a2a25172b35bb2448b890a80d", size = 25231, upload-time = "2025-03-31T14:16:18.478Z" }, ] -[[package]] -name = "aiohttp-retry" -version = "2.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9d/61/ebda4d8e3d8cfa1fd3db0fb428db2dd7461d5742cea35178277ad180b033/aiohttp_retry-2.9.1.tar.gz", hash = "sha256:8eb75e904ed4ee5c2ec242fefe85bf04240f685391c4879d8f541d6028ff01f1", size = 13608, upload-time = "2024-11-06T10:44:54.574Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1a/99/84ba7273339d0f3dfa57901b846489d2e5c2cd731470167757f1935fffbd/aiohttp_retry-2.9.1-py3-none-any.whl", hash = "sha256:66d2759d1921838256a05a3f80ad7e724936f083e35be5abb5e16eed6be6dc54", size = 9981, upload-time = "2024-11-06T10:44:52.917Z" }, -] - [[package]] name = "aioitertools" version = "0.12.0" @@ -1780,6 +1768,7 @@ name = "orcapod" source = { editable = "." 
} dependencies = [ { name = "beartype" }, + { name = "deltalake" }, { name = "matplotlib" }, { name = "networkx" }, { name = "pandas" }, @@ -1825,12 +1814,12 @@ dev = [ { name = "ruff" }, { name = "s3fs" }, { name = "tqdm" }, - { name = "unitycatalog-client" }, ] [package.metadata] requires-dist = [ { name = "beartype", specifier = ">=0.21.0" }, + { name = "deltalake", specifier = ">=1.0.2" }, { name = "ipywidgets", marker = "extra == 'ray'", specifier = ">=8.1.7" }, { name = "matplotlib", specifier = ">=3.10.3" }, { name = "networkx" }, @@ -1868,7 +1857,6 @@ dev = [ { name = "ruff", specifier = ">=0.11.11" }, { name = "s3fs", specifier = ">=2025.7.0" }, { name = "tqdm", specifier = ">=4.67.1" }, - { name = "unitycatalog-client", specifier = ">=0.3.0" }, ] [[package]] @@ -2958,23 +2946,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] -[[package]] -name = "unitycatalog-client" -version = "0.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, - { name = "aiohttp-retry" }, - { name = "pydantic" }, - { name = "python-dateutil" }, - { name = "typing-extensions" }, - { name = "urllib3" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/2a/14/430e0fd06707b5ade9ba69a5847e1645a1adab7761b8149fd1916f814216/unitycatalog_client-0.3.0.tar.gz", hash = "sha256:6373b8c26723307beb9e14e92c9c5b75cc6dab343ba30b0a1d93c421ca944dfa", size = 63438, upload-time = "2025-06-06T15:23:06.584Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/76/dd/7e12caea6075a02feec609f60e2b7fe06a7b39f1c7026b32b21eaa6a68b0/unitycatalog_client-0.3.0-py3-none-any.whl", hash = "sha256:29d6061cafd076a098d515d3019a19d2449c14b82621b3910c1273cba16ee6e5", size = 159106, upload-time = "2025-06-06T15:23:03.839Z" }, -] - [[package]] name = "urllib3" version = "2.4.0" From 5a4227ede77b66382bf5ceefe62bcddb2d25ba71 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 19 Aug 2025 23:12:51 +0000 Subject: [PATCH 191/224] refactor: clean up the protocol organization and use content hash object --- pixi.lock | 50 +- pyproject.toml | 2 +- src/orcapod/contexts/__init__.py | 31 + src/orcapod/contexts/core.py | 8 +- src/orcapod/data/base.py | 73 +- src/orcapod/data/datagrams/arrow_datagram.py | 6 +- .../data/datagrams/arrow_tag_packet.py | 1 - src/orcapod/data/datagrams/base.py | 8 +- src/orcapod/data/datagrams/dict_datagram.py | 16 +- src/orcapod/data/kernels.py | 6 +- src/orcapod/data/pods.py | 11 +- src/orcapod/data/sources.py | 65 +- src/orcapod/data/streams.py | 184 +- src/orcapod/hashing/arrow_hashers.py | 239 +- src/orcapod/hashing/content_identifiable.py | 331 --- src/orcapod/hashing/object_hashers.py | 17 +- src/orcapod/hashing/semantic_type_hashers.py | 5 +- src/orcapod/pipeline/nodes.py | 7 +- src/orcapod/protocols/data_protocols.py | 2297 ----------------- .../protocols/data_protocols/__init__.py | 24 + src/orcapod/protocols/data_protocols/base.py | 97 + .../protocols/data_protocols/datagrams.py | 1105 ++++++++ .../protocols/data_protocols/kernel.py | 201 ++ src/orcapod/protocols/data_protocols/pods.py | 208 ++ .../protocols/data_protocols/source.py | 55 + .../protocols/data_protocols/streams.py | 424 +++ .../protocols/data_protocols/trackers.py | 213 ++ src/orcapod/protocols/hashing_protocols.py | 101 +- .../protocols/legacy_data_protocols.py | 2278 ++++++++++++++++ src/orcapod/protocols/semantic_protocols.py | 49 +- .../test_datagrams/test_arrow_datagram.py | 5 +- .../test_datagrams/test_dict_datagram.py | 4 +- 32 files changed, 4988 insertions(+), 3133 deletions(-) delete mode 100644 src/orcapod/hashing/content_identifiable.py delete mode 100644 src/orcapod/protocols/data_protocols.py create mode 100644 src/orcapod/protocols/data_protocols/__init__.py create mode 100644 src/orcapod/protocols/data_protocols/base.py create mode 100644 src/orcapod/protocols/data_protocols/datagrams.py create mode 100644 src/orcapod/protocols/data_protocols/kernel.py create mode 100644 src/orcapod/protocols/data_protocols/pods.py create mode 100644 src/orcapod/protocols/data_protocols/source.py create mode 100644 src/orcapod/protocols/data_protocols/streams.py create mode 100644 src/orcapod/protocols/data_protocols/trackers.py create mode 100644 src/orcapod/protocols/legacy_data_protocols.py diff --git a/pixi.lock b/pixi.lock index 74e1220..54acc8c 100644 --- a/pixi.lock +++ b/pixi.lock @@ -9,10 +9,8 @@ environments: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda @@ -24,10 +22,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - 
conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda @@ -141,10 +137,8 @@ environments: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda @@ -156,11 +150,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda @@ -201,7 +192,6 @@ environments: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda @@ -219,7 +209,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda @@ -403,10 +392,8 @@ environments: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda @@ -418,10 +405,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda @@ -534,10 +519,8 @@ environments: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda @@ -549,11 +532,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - conda: 
https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda @@ -740,17 +720,6 @@ packages: - pytest-cov ; extra == 'test' - pytest-xdist ; extra == 'test' requires_python: '>=3.8' -- conda: https://conda.anaconda.org/conda-forge/noarch/astunparse-1.6.3-pyhd8ed1ab_3.conda - sha256: 7304f265f146235c34e24db310a94648aa306ca0b2a4a12042bf96da1881f99c - md5: d3f195dfdbbf736e4ec178bbec2a975c - depends: - - python >=3.9 - - six >=1.6.1,<2.0 - license: BSD-3-Clause AND PSF-2.0 - purls: - - pkg:pypi/astunparse?source=hash-mapping - size: 18143 - timestamp: 1736248194225 - pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl name: attrs version: 25.3.0 @@ -2263,8 +2232,8 @@ packages: requires_python: '>=3.9' - pypi: ./ name: orcapod - version: 0.0.3a2.dev29+g197f3e1.d20250813 - sha256: 9604e103255e0296954d6e36a7b1822e342d2a03c78afb0bcae28ac9d9121b24 + version: 0.0.3a2.dev30+g144ef6a.d20250815 + sha256: ca2f39ccbaf238434a839d578fbcb67fdf7d345a5ee936cb60d5f025d4bc7d81 requires_dist: - xxhash - networkx @@ -2276,7 +2245,6 @@ packages: - polars>=1.31.0 - beartype>=0.21.0 - deltalake>=1.0.2 - - pdoc>=15.0.4 - redis>=6.2.0 ; extra == 'redis' - ray[default]==2.48.0 ; extra == 'ray' - ipywidgets>=8.1.7 ; extra == 'ray' @@ -2396,20 +2364,6 @@ packages: version: 0.12.1 sha256: a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 requires_python: '>=3.8' -- conda: https://conda.anaconda.org/conda-forge/noarch/pdoc-15.0.4-pyhd8ed1ab_0.conda - sha256: 34a0ce54796d743113ef962c0d1a61e26f4f777c80647e14fd6bea7b3350b912 - md5: 751a8b7d5f3c6f428074e6ac34a2849b - depends: - - astunparse - - jinja2 >=2.11.0 - - markupsafe - - pygments >=2.12.0 - - python >=3.9 - license: Unlicense - purls: - - pkg:pypi/pdoc?source=hash-mapping - size: 123955 - timestamp: 1755106041556 - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl name: pexpect version: 4.9.0 diff --git a/pyproject.toml b/pyproject.toml index 8acc2cc..95f8bbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,4 +82,4 @@ redis = { features = ["redis"], solve-group = "default" } [tool.pixi.tasks] [tool.pixi.dependencies] -python = "3.13.*" +python = ">=3.12" diff --git a/src/orcapod/contexts/__init__.py b/src/orcapod/contexts/__init__.py index 9adda27..a47d847 100644 --- a/src/orcapod/contexts/__init__.py +++ b/src/orcapod/contexts/__init__.py @@ -28,6 +28,7 @@ from .core import DataContext, ContextValidationError, ContextResolutionError from .registry import JSONDataContextRegistry from typing import Any +from orcapod.protocols import hashing_protocols as hp, semantic_protocols as sp # Global registry instance (lazily initialized) _registry: JSONDataContextRegistry | None = None @@ -164,6 +165,36 @@ def get_default_context() -> DataContext: return resolve_context() +def get_default_object_hasher() -> hp.ObjectHasher: + """ + Get the default object hasher. 
+ + Returns: + ObjectHasher instance for the default context + """ + return get_default_context().object_hasher + + +def get_default_arrow_hasher() -> hp.ArrowHasher: + """ + Get the default arrow hasher. + + Returns: + ArrowHasher instance for the default context + """ + return get_default_context().arrow_hasher + + +def get_default_type_converter() -> "sp.TypeConverter": + """ + Get the default type converter. + + Returns: + UniversalTypeConverter instance for the default context + """ + return get_default_context().type_converter + + # Convenience function for creating custom registries def create_registry( contexts_dir: str | None = None, diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py index 7e87319..09ca2cc 100644 --- a/src/orcapod/contexts/core.py +++ b/src/orcapod/contexts/core.py @@ -6,12 +6,8 @@ """ from dataclasses import dataclass -from typing import TYPE_CHECKING -if TYPE_CHECKING: - # TODO: consider establishing type converter protocol - from orcapod.semantic_types import UniversalTypeConverter -from orcapod.protocols import hashing_protocols as hp +from orcapod.protocols import hashing_protocols as hp, semantic_protocols as sp @dataclass @@ -35,7 +31,7 @@ class DataContext: context_key: str version: str description: str - type_converter: "UniversalTypeConverter" + type_converter: sp.TypeConverter arrow_hasher: hp.ArrowHasher object_hasher: hp.ObjectHasher diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index efdb49b..e5d4f73 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from collections.abc import Collection from pathlib import Path from typing import Any, Mapping @@ -10,33 +11,10 @@ logger = logging.getLogger(__name__) -class LabeledContentIdentifiableBase: - """ - Base class for content-identifiable objects. - This class provides a way to define objects that can be uniquely identified - based on their content rather than their identity in memory. Specifically, the identity of the - object is determined by the structure returned by the `identity_structure` method. - The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, - which defaults to the one returned by `get_default_object_hasher`. - Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. - """ - - def __init__( - self, - identity_structure_hasher: hp.ObjectHasher | None = None, - label: str | None = None, - data_context: str | contexts.DataContext | None = None, - ) -> None: - """ - Initialize the ContentHashable with an optional ObjectHasher. - - Args: - identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. - """ +class LablableBase: + def __init__(self, label: str | None = None, **kwargs): self._label = label - self._data_context = contexts.resolve_context(data_context) - self._content_hash: bytes | None = None - self._int_hash: int | None = None + super().__init__(**kwargs) @property def has_assigned_label(self) -> bool: @@ -75,6 +53,32 @@ def computed_label(self) -> str | None: """ return None + +class ContentIdentifiableBase(ABC): + """ + Base class for content-identifiable objects. + This class provides a way to define objects that can be uniquely identified + based on their content rather than their identity in memory. Specifically, the identity of the + object is determined by the structure returned by the `identity_structure` method. 
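A minimal usage sketch of the default-context accessors added in the contexts/__init__.py hunk above. It assumes an orcapod build that includes this patch and that pyarrow is installed; the table contents are arbitrary, and the printed values are whatever the configured hashers produce.

import pyarrow as pa

from orcapod.contexts import get_default_arrow_hasher, get_default_object_hasher

# Hash an Arrow table through the default context's ArrowHasher. As of this
# patch, hash_table returns a ContentHash object rather than a prefixed hex string.
table = pa.table({"user_id": [1, 2, 3], "name": ["a", "b", "c"]})
table_hash = get_default_arrow_hasher().hash_table(table)
print(table_hash.to_hex())

# Hash an arbitrary Python structure through the default ObjectHasher
# (the method is renamed from hash() to hash_object() in this patch).
obj_hash = get_default_object_hasher().hash_object({"a": 1, "b": [2, 3]})
print(str(obj_hash))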
+ The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, + which defaults to the one returned by `get_default_object_hasher`. + Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. + """ + + def __init__( + self, data_context: str | contexts.DataContext | None = None, **kwargs + ) -> None: + """ + Initialize the ContentHashable with an optional ObjectHasher. + + Args: + identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. + """ + super().__init__(**kwargs) + self._data_context = contexts.resolve_context(data_context) + self._content_hash: hp.ContentHash | None = None + self._int_hash: int | None = None + def identity_structure(self) -> Any: """ Return a structure that represents the identity of this object. @@ -86,10 +90,9 @@ def identity_structure(self) -> Any: Returns: Any: A structure representing this object's content, or None to use default hash """ - # TODO: come up with a way to signify non-determinate identity structure - return None + raise NotImplementedError("Subclasses must implement identity_structure") - def content_hash(self) -> bytes: + def content_hash(self) -> hp.ContentHash: """ Compute a hash based on the content of this object. @@ -100,7 +103,7 @@ def content_hash(self) -> bytes: if self._content_hash is None: structure = self.identity_structure() processed_structure = process_structure(structure) - self._content_hash = self._data_context.object_hasher.hash( + self._content_hash = self._data_context.object_hasher.hash_object( processed_structure ) return self._content_hash @@ -120,7 +123,9 @@ def __hash__(self) -> int: # If no identity structure is provided, use the default hash self._int_hash = super().__hash__() else: - self._int_hash = self._data_context.object_hasher.hash_to_int(structure) + self._int_hash = self._data_context.object_hasher.hash_object( + structure + ).to_int() return self._int_hash def __eq__(self, other: object) -> bool: @@ -133,12 +138,16 @@ def __eq__(self, other: object) -> bool: Returns: bool: True if both objects have the same identity structure, False otherwise. """ - if not isinstance(other, LabeledContentIdentifiableBase): + if not isinstance(other, ContentIdentifiableBase): return NotImplemented return self.identity_structure() == other.identity_structure() +class LabeledContentIdentifiableBase(ContentIdentifiableBase, LablableBase): + pass + + def process_structure( obj: Any, visited: set[int] | None = None, diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 6709db2..a7d0cb8 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -9,6 +9,7 @@ from orcapod.data.system_constants import constants from orcapod.types import TypeSpec from orcapod.types.core import DataValue +from orcapod.protocols.hashing_protocols import ContentHash from orcapod.utils import arrow_utils logger = logging.getLogger(__name__) @@ -145,7 +146,7 @@ def __init__( self._cached_python_schema: TypeSpec | None = None self._cached_python_dict: dict[str, DataValue] | None = None self._cached_meta_python_schema: TypeSpec | None = None - self._cached_content_hash: str | None = None + self._cached_content_hash: ContentHash | None = None # 1. 
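To make the data/base.py split above concrete, here is an illustrative subclass under the new scheme. The base-class names and the behaviour of content_hash() and __eq__ come from the hunks in this patch; the ParamSet class itself is invented for the example, and running it requires an importable orcapod with a resolvable default data context.

from orcapod.data.base import LabeledContentIdentifiableBase


class ParamSet(LabeledContentIdentifiableBase):
    """Identity is determined by the parameter mapping, not by object identity."""

    def __init__(self, params: dict[str, int], **kwargs):
        # Cooperative __init__ forwards label/data_context to the mixin bases.
        super().__init__(**kwargs)
        self.params = params

    def identity_structure(self):
        # Required now: the base class raises NotImplementedError instead of
        # silently returning None as it did before this refactor.
        return ("ParamSet", self.params)


a = ParamSet({"lr": 1, "epochs": 10}, label="run-a")
b = ParamSet({"lr": 1, "epochs": 10})

assert a == b                        # __eq__ compares identity_structure()
print(a.content_hash().to_hex())     # ContentHash value object instead of raw bytes
print(a.label)                       # explicitly assigned label is preserved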
Core Properties (Identity & Structure) @property @@ -322,7 +323,7 @@ def arrow_schema( return arrow_utils.join_arrow_schemas(*all_schemas) - def content_hash(self) -> str: + def content_hash(self) -> ContentHash: """ Calculate and return content hash of the datagram. Only includes data columns, not meta columns or context. @@ -333,7 +334,6 @@ def content_hash(self) -> str: if self._cached_content_hash is None: self._cached_content_hash = self._data_context.arrow_hasher.hash_table( self._data_table, - prefix_hasher_id=True, ) return self._cached_content_hash diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index 71e8104..88e848e 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -281,7 +281,6 @@ def __init__( self._cached_source_info: dict[str, str | None] | None = None self._cached_python_schema: TypeSpec | None = None - self._cached_content_hash: str | None = None def keys( self, diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index 297b100..f476cd9 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -17,10 +17,12 @@ """ import logging -from abc import ABC, abstractmethod +from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping from typing import Self, TypeAlias from orcapod import contexts +from orcapod.data.base import ContentIdentifiableBase +from orcapod.protocols.hashing_protocols import ContentHash import pyarrow as pa @@ -102,7 +104,7 @@ def contains_prefix_from(column: str, prefixes: Collection[str]) -> bool: return False -class BaseDatagram(ABC): +class BaseDatagram(ContentIdentifiableBase): """ Abstract base class for immutable datagram implementations. @@ -192,7 +194,7 @@ def arrow_schema( ... @abstractmethod - def content_hash(self) -> str: + def content_hash(self) -> ContentHash: """Calculate and return content hash of the datagram.""" ... diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 103a742..9088537 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -1,21 +1,26 @@ import logging from collections.abc import Collection, Iterator, Mapping -from typing import Self, cast - -import pyarrow as pa +from typing import Self, cast, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.semantic_types import infer_schema_from_pylist_data from orcapod.types.core import DataValue from orcapod.utils import arrow_utils +from orcapod.protocols.hashing_protocols import ContentHash logger = logging.getLogger(__name__) # FIXME: make this configurable! 
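dict_datagram.py above now defers its pyarrow import behind orcapod's LazyModule helper (imported from orcapod.utils.lazy_module), keeping the real module visible to type checkers via the TYPE_CHECKING guard. The helper's implementation is not part of this diff; the sketch below only illustrates the general pattern such a wrapper follows, and LazyModuleSketch is a made-up name for the example.

import importlib
from typing import Any


class LazyModuleSketch:
    """Defer importing a heavy module until one of its attributes is first used."""

    def __init__(self, name: str):
        self._name = name
        self._module = None

    def __getattr__(self, attr: str) -> Any:
        # Only called for attributes not found on the wrapper itself, i.e. for
        # attributes of the wrapped module; triggers the real import exactly once.
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)


pa = LazyModuleSketch("pyarrow")       # nothing imported yet
table = pa.table({"x": [1, 2, 3]})     # pyarrow is imported on first use
print(table.num_rows)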
DEBUG = False +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + class DictDatagram(BaseDatagram): """ @@ -122,7 +127,7 @@ def __init__( # Initialize caches self._cached_data_table: pa.Table | None = None self._cached_meta_table: pa.Table | None = None - self._cached_content_hash: str | None = None + self._cached_content_hash: ContentHash | None = None self._cached_data_arrow_schema: pa.Schema | None = None self._cached_meta_arrow_schema: pa.Schema | None = None @@ -306,7 +311,7 @@ def arrow_schema( return arrow_utils.join_arrow_schemas(*all_schemas) - def content_hash(self) -> str: + def content_hash(self) -> ContentHash: """ Calculate and return content hash of the datagram. Only includes data columns, not meta columns or context. @@ -317,7 +322,6 @@ def content_hash(self) -> str: if self._cached_content_hash is None: self._cached_content_hash = self._data_context.arrow_hasher.hash_table( self.as_table(include_meta_columns=False, include_context=False), - prefix_hasher_id=True, ) return self._cached_content_hash diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index d267c13..07e512e 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -1,4 +1,4 @@ -from abc import ABC, abstractmethod +from abc import abstractmethod from collections.abc import Collection from datetime import datetime, timezone from typing import Any @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -class TrackedKernelBase(ABC, LabeledContentIdentifiableBase): +class TrackedKernelBase(LabeledContentIdentifiableBase): """ Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. It is the base class for all computations and transformations that can be performed on a collection of streams @@ -50,7 +50,7 @@ def kernel_id(self) -> tuple[str, ...]: Returns a unique identifier for the kernel. This is used to identify the kernel in the computational graph. 
""" - return (f"{self.__class__.__name__}", self.content_hash().hex()) + return (f"{self.__class__.__name__}", self.content_hash().to_hex()) @property def data_context(self) -> contexts.DataContext: diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 80536d6..87c9bd9 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -314,6 +314,7 @@ def __init__( self._function_info_extractor = function_info_extractor object_hasher = self.data_context.object_hasher + # TODO: fix and replace with object_hasher protocol specific methods self._function_signature_hash = object_hasher.hash_to_hex( get_function_signature(self.function), prefix_hasher_id=True ) @@ -351,7 +352,7 @@ def get_record_id( execution_engine_hash: str, ) -> str: return combine_hashes( - packet.content_hash(), + str(packet.content_hash()), self._total_pod_id_hash, execution_engine_hash, prefix_hasher_id=True, @@ -731,7 +732,7 @@ def record_packet( data_table = data_table.add_column( 0, constants.INPUT_PACKET_HASH, - pa.array([input_packet.content_hash()], type=pa.large_string()), + pa.array([str(input_packet.content_hash())], type=pa.large_string()), ) # add execution engine information execution_engine_hash = execution_engine.name if execution_engine else "default" @@ -779,7 +780,7 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non # get all records with matching the input packet hash # TODO: add match based on match_tier if specified - constraints = {constants.INPUT_PACKET_HASH: input_packet.content_hash()} + constraints = {constants.INPUT_PACKET_HASH: str(input_packet.content_hash())} if self.match_tier is not None: constraints[f"{constants.POD_ID_PREFIX}{self.match_tier}"] = ( self.pod.tiered_pod_id[self.match_tier] @@ -794,7 +795,7 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non if result_table.num_rows > 1: logger.info( - f"Performing conflict resolution for multiple records for {input_packet.content_hash()}" + f"Performing conflict resolution for multiple records for {input_packet.content_hash().display_name()}" ) if self.retrieval_mode == "latest": result_table = result_table.sort_by( @@ -812,7 +813,7 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non break if result_table.num_rows > 1: logger.warning( - f"No matching record found for {input_packet.content_hash()} with tiered pod ID {self.tiered_pod_id}" + f"No matching record found for {input_packet.content_hash().display_name()} with tiered pod ID {self.tiered_pod_id}" ) result_table = result_table.sort_by( self.DATA_RETRIEVED_FLAG, ascending=False diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources.py index f41c941..0f25436 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources.py @@ -1,4 +1,3 @@ -from abc import abstractmethod from collections.abc import Collection, Iterator from pathlib import Path from typing import TYPE_CHECKING, Any, cast @@ -11,11 +10,11 @@ from orcapod.data.streams import ( TableStream, KernelStream, - OperatorStreamBaseMixin, + StatefulStreamBase, ) from orcapod.errors import DuplicateTagError from orcapod.protocols import data_protocols as dp -from orcapod.types import DataValue, TypeSpec, typespec_utils +from orcapod.types import DataValue, TypeSpec from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants @@ -31,7 +30,7 @@ pa = LazyModule("pyarrow") -class SourceBase(TrackedKernelBase, 
OperatorStreamBaseMixin): +class SourceBase(TrackedKernelBase, StatefulStreamBase): """ Base class for sources that act as both Kernels and LiveStreams. @@ -132,21 +131,30 @@ def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: """ return self().iter_packets() - def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + def iter_packets( + self, + execution_engine: dp.ExecutionEngine | None = None, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: """Delegate to the cached KernelStream.""" - return self().iter_packets() + return self().iter_packets(execution_engine=execution_engine) def as_table( self, include_data_context: bool = False, include_source: bool = False, + include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": """Delegate to the cached KernelStream.""" return self().as_table( include_data_context=include_data_context, include_source=include_source, + include_system_tags=include_system_tags, include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, ) def flow( @@ -204,51 +212,6 @@ def packet_keys(self) -> tuple[str, ...]: _, packet_keys = self.keys() return packet_keys - @abstractmethod - def get_all_records( - self, include_system_columns: bool = False - ) -> "pa.Table | None": - """ - Retrieve all records from the source. - - This method should be implemented by subclasses to return the full dataset. - If the source has no records, return None. - """ - ... - - def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": - records = self.get_all_records(include_system_columns=False) - if records is not None: - result = pl.LazyFrame(records) - if sort_by_tags: - result = result.sort(self.tag_keys) - return result - return None - - def as_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": - """ - Return the DataFrame representation of the pod's records. - """ - lazy_df = self.as_lazy_frame(sort_by_tags=sort_by_tags) - if lazy_df is not None: - return lazy_df.collect() - return None - - def as_polars_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": - """ - Return the DataFrame representation of the pod's records. - """ - return self.as_df(sort_by_tags=sort_by_tags) - - def as_pandas_df(self, sort_by_tags: bool = True) -> "pd.DataFrame | None": - """ - Return the pandas DataFrame representation of the pod's records. - """ - df = self.as_polars_df(sort_by_tags=sort_by_tags) - if df is not None: - return df.to_pandas() - return None - def reset_cache(self) -> None: """ Clear the cached KernelStream, forcing a fresh one on next access. 
diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 654c763..1d9cf0d 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -1,5 +1,5 @@ import logging -from abc import ABC, abstractmethod +from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping from datetime import datetime, timezone from itertools import repeat @@ -47,7 +47,7 @@ def synchronous_run(async_func, *args, **kwargs): """ try: # Check if we're already in an event loop - loop = asyncio.get_running_loop() + _ = asyncio.get_running_loop() def run_in_thread(): return asyncio.run(async_func(*args, **kwargs)) @@ -104,33 +104,22 @@ def map_packets( return MapPackets(name_map, drop_unmapped)(self) # type: ignore[return-value] -class StreamBase(ABC, OperatorStreamBaseMixin, LabeledContentIdentifiableBase): +class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase): """ - A stream is a collection of tagged-packets that are generated by an operation. - The stream is iterable and can be used to access the packets in the stream. - - A stream has property `invocation` that is an instance of Invocation that generated the stream. - This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). + A stream that has a unique identity within the pipeline. """ def __init__( self, - source: dp.Kernel | None = None, - upstreams: tuple[dp.Stream, ...] = (), data_context: str | contexts.DataContext | None = None, execution_engine: dp.ExecutionEngine | None = None, **kwargs, ) -> None: super().__init__(**kwargs) - self._source = source - self._upstreams = upstreams self._last_modified: datetime | None = None self._set_modified_time() # note that this is not necessary for Stream protocol, but is provided # for convenience to resolve semantic types and other context-specific information - if data_context is None and source is not None: - # if source is provided, use its data context - data_context = source.data_context_key self._data_context = contexts.resolve_context(data_context) self._execution_engine = execution_engine @@ -140,10 +129,10 @@ def substream_identities(self) -> tuple[str, ...]: Returns the identities of the substreams that this stream is composed of. This is used to identify the substreams in the computational graph. """ - return (self.content_hash().hex(),) + return (self.content_hash().to_hex(),) @property - def execution_engine(self): + def execution_engine(self) -> dp.ExecutionEngine | None: """ Returns the execution engine that is used to execute this stream. This is typically used to track the execution context of the stream. @@ -177,20 +166,22 @@ def data_context(self) -> contexts.DataContext: return self._data_context @property + @abstractmethod def source(self) -> dp.Kernel | None: """ The source of the stream, which is the kernel that generated the stream. This is typically used to track the origin of the stream in the computational graph. """ - return self._source + ... @property + @abstractmethod def upstreams(self) -> tuple[dp.Stream, ...]: """ The upstream streams that are used to generate this stream. This is typically used to track the origin of the stream in the computational graph. """ - return self._upstreams + ... 
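The synchronous_run helper touched at the top of the streams.py hunk bridges async stream execution into synchronous call sites: when no event loop is running it can simply asyncio.run the coroutine, and when one is already running it must delegate to a worker thread that owns its own loop. The hunk only shows the loop-detection line; the self-contained sketch below fills in a plausible thread-pool variant for illustration and is not the exact orcapod implementation.

import asyncio
from concurrent.futures import ThreadPoolExecutor


def synchronous_run_sketch(async_func, *args, **kwargs):
    """Run an async function from synchronous code, even inside a running loop."""
    try:
        asyncio.get_running_loop()          # raises RuntimeError if no loop is running
    except RuntimeError:
        return asyncio.run(async_func(*args, **kwargs))

    # Already inside an event loop: run the coroutine on a fresh loop in a worker thread.
    def run_in_thread():
        return asyncio.run(async_func(*args, **kwargs))

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(run_in_thread).result()


async def double(x):
    await asyncio.sleep(0)
    return 2 * x


print(synchronous_run_sketch(double, 21))   # 42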
def computed_label(self) -> str | None: if self.source is not None: @@ -284,6 +275,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": ... @@ -293,6 +285,7 @@ def as_df( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, execution_engine: dp.ExecutionEngine | None = None, ) -> "pl.DataFrame | None": """ @@ -304,6 +297,7 @@ def as_df( include_source=include_source, include_system_tags=include_system_tags, include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, execution_engine=execution_engine, ) ) @@ -317,6 +311,91 @@ def flow( """ return [e for e in self.iter_packets(execution_engine=execution_engine)] + # def identity_structure(self) -> Any: + # """ + # Identity structure of a stream is deferred to the identity structure + # of the associated invocation, if present. + # A bare stream without invocation has no well-defined identity structure. + # Specialized stream subclasses should override this method to provide more meaningful identity structure + # """ + # ... + + +class StreamBase(StatefulStreamBase): + """ + A stream is a collection of tagged-packets that are generated by an operation. + The stream is iterable and can be used to access the packets in the stream. + + A stream has property `invocation` that is an instance of Invocation that generated the stream. + This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). + """ + + def __init__( + self, + source: dp.Kernel | None = None, + upstreams: tuple[dp.Stream, ...] = (), + data_context: str | contexts.DataContext | None = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self._source = source + self._upstreams = upstreams + if data_context is None and source is not None: + # if source is provided, use its data context + data_context = source.data_context_key + super().__init__(data_context=data_context, **kwargs) + + @property + def source(self) -> dp.Kernel | None: + """ + The source of the stream, which is the kernel that generated the stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._source + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + """ + The upstream streams that are used to generate this stream. + This is typically used to track the origin of the stream in the computational graph. + """ + return self._upstreams + + def computed_label(self) -> str | None: + if self.source is not None: + # use the invocation operation label + return self.source.label + return None + + # @abstractmethod + # def iter_packets( + # self, + # execution_engine: dp.ExecutionEngine | None = None, + # ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... + + # @abstractmethod + # def run( + # self, + # execution_engine: dp.ExecutionEngine | None = None, + # ) -> None: ... + + # @abstractmethod + # async def run_async( + # self, + # execution_engine: dp.ExecutionEngine | None = None, + # ) -> None: ... + + # @abstractmethod + # def as_table( + # self, + # include_data_context: bool = False, + # include_source: bool = False, + # include_system_tags: bool = False, + # include_content_hash: bool | str = False, + # sort_by_tags: bool = True, + # execution_engine: dp.ExecutionEngine | None = None, + # ) -> "pa.Table": ... 
+ def identity_structure(self) -> Any: """ Identity structure of a stream is deferred to the identity structure @@ -506,6 +585,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": """ @@ -520,7 +600,7 @@ def as_table( else include_content_hash ) content_hashes = [ - packet.content_hash() for _, packet in self.iter_packets() + str(packet.content_hash()) for _, packet in self.iter_packets() ] output_table = output_table.append_column( hash_column_name, pa.array(content_hashes, type=pa.large_string()) @@ -534,7 +614,14 @@ def as_table( if include_source: table_stack += (self._source_info_table,) - return arrow_utils.hstack_tables(*table_stack) + table = arrow_utils.hstack_tables(*table_stack) + + if sort_by_tags: + return table.sort_by( + [(column, "ascending") for column in self._all_tag_columns] + ) + + return table def clear_cache(self) -> None: """ @@ -737,6 +824,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": self.refresh() @@ -748,6 +836,7 @@ def as_table( include_source=include_source, include_system_tags=include_system_tags, include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, execution_engine=execution_engine, ) @@ -880,6 +969,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": if self._cached_output_table is None: @@ -959,6 +1049,11 @@ def as_table( output_table = output_table.append_column( hash_column_name, self._cached_content_hash_column ) + + if sort_by_tags: + output_table = output_table.sort_by( + [(column, "ascending") for column in self.keys()[0]] + ) return output_table @@ -1004,16 +1099,6 @@ async def run_async( missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) existing = None else: - # missing = target_entries.join( - # existing_entries, - # keys=[constants.INPUT_PACKET_HASH], - # join_type="left anti", - # ) - # Single join that gives you both missing and existing - # More efficient - only bring the key column from existing_entries - # .select([constants.INPUT_PACKET_HASH]).append_column( - # "_exists", pa.array([True] * len(existing_entries)) - # ), all_results = target_entries.join( existing_entries.append_column( "_exists", pa.array([True] * len(existing_entries)) @@ -1172,33 +1257,6 @@ def iter_packets( if packet is not None: yield tag, packet - # def iter_packets(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - # if self._prepared_stream_iterator is not None: - # for i, (tag, packet) in enumerate(self._prepared_stream_iterator): - # if i in self._cached_output_packets: - # # Use cached result - # tag, packet = self._cached_output_packets[i] - # if packet is not None: - # yield tag, packet - # else: - # # Process packet - # processed = self.pod.call(tag, packet) - # if processed is not None: - # # Update shared cache for future iterators (optimization) - # self._cached_output_packets[i] = processed - # tag, packet = processed - # if packet is not None: - # yield tag, packet - - # # Mark completion by releasing the iterator - # self._prepared_stream_iterator = None - # else: - # # Yield from snapshot of complete cache - # 
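Several as_table implementations above now accept sort_by_tags (default True) and sort the output by the tag columns, so repeated materializations of the same stream come back in a stable row order. The sorting itself is plain pyarrow Table.sort_by with (column, order) pairs; a small standalone illustration with made-up tag columns:

import pyarrow as pa

table = pa.table(
    {
        "subject": ["s2", "s1", "s1"],
        "session": [1, 2, 1],
        "value": [0.3, 0.1, 0.2],
    }
)

tag_columns = ("subject", "session")
sorted_table = table.sort_by([(column, "ascending") for column in tag_columns])
print(sorted_table.to_pydict())
# {'subject': ['s1', 's1', 's2'], 'session': [1, 2, 1], 'value': [0.2, 0.1, 0.3]}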
for i in range(len(self._cached_output_packets)): - # tag, packet = self._cached_output_packets[i] - # if packet is not None: - # yield tag, packet - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. @@ -1223,6 +1281,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": if self._cached_output_table is None: @@ -1300,6 +1359,13 @@ def as_table( output_table = output_table.append_column( hash_column_name, self._cached_content_hash_column ) + + if sort_by_tags: + # TODO: consider having explicit tag/packet properties? + output_table = output_table.sort_by( + [(column, "ascending") for column in self.keys()[0]] + ) + return output_table @@ -1335,6 +1401,7 @@ def as_table( include_source: bool = False, include_system_tags: bool = False, include_content_hash: bool | str = False, + sort_by_tags: bool = True, execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": """ @@ -1346,6 +1413,7 @@ def as_table( include_source=include_source, include_system_tags=include_system_tags, include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, execution_engine=execution_engine, ) diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index c8c53fb..0ef8ab8 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -7,6 +7,7 @@ from collections.abc import Callable from orcapod.hashing.visitors import SemanticHashingVisitor from orcapod.utils import arrow_utils +from orcapod.protocols.hashing_protocols import ContentHash SERIALIZATION_METHOD_LUT: dict[str, Callable[[pa.Table], bytes]] = { @@ -110,6 +111,7 @@ def _process_table_columns(self, table: pa.Table) -> pa.Table: # Convert column to struct dicts for processing column_data = table.column(i).to_pylist() + # TODO: verify the functioning of the visitor pattern # Create fresh visitor for each column (stateless approach) visitor = SemanticHashingVisitor(self.semantic_registry) @@ -162,7 +164,7 @@ def serialize_arrow_table(self, table: pa.Table) -> bytes: ] return serialization_method_function(table) - def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: + def hash_table(self, table: pa.Table) -> ContentHash: """ Compute stable hash of Arrow table with semantic type processing. @@ -190,11 +192,7 @@ def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: hasher = hashlib.new(self.hash_algorithm) hasher.update(serialized_bytes) - hash_str = hasher.hexdigest() - if prefix_hasher_id: - hash_str = f"{self.hasher_id}@{hash_str}" - - return hash_str + return ContentHash(method=self.hasher_id, digest=hasher.digest()) def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: """ @@ -232,232 +230,3 @@ def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: "processed_columns": processed_columns, "column_order": [field.name for field in table.schema], } - - -# class SemanticArrowHasher2: -# """ -# Stable hasher for Arrow tables with semantic type support. - -# This hasher: -# 1. Processes columns with special semantic types using dedicated hashers -# 2. Sorts columns by name for deterministic ordering -# 3. Uses Arrow IPC format for stable serialization -# 4. 
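In the arrow_hashers.py hunk above, hash_table now returns a ContentHash carrying the hasher id and the raw digest instead of a prefixed hex string. Stripped of the semantic-type preprocessing and the large-string normalization that SemanticArrowHasher performs, the deterministic core of the approach is: sort columns by name, serialize with the Arrow IPC stream format, and hash the resulting bytes. The simplified sketch below demonstrates only that core and is not the SemanticArrowHasher itself.

import hashlib
from io import BytesIO

import pyarrow as pa
import pyarrow.ipc as ipc


def hash_table_sketch(table: pa.Table, algorithm: str = "sha256") -> bytes:
    """Hash an Arrow table so that column order does not affect the digest."""
    # Sort columns by field name for a deterministic ordering.
    sorted_table = table.select(sorted(table.schema.names))

    # Serialize with the IPC stream format for a stable binary representation.
    buffer = BytesIO()
    with ipc.new_stream(buffer, sorted_table.schema) as writer:
        writer.write_table(sorted_table)

    hasher = hashlib.new(algorithm)
    hasher.update(buffer.getvalue())
    return hasher.digest()


t1 = pa.table({"b": [1, 2], "a": ["x", "y"]})
t2 = pa.table({"a": ["x", "y"], "b": [1, 2]})
assert hash_table_sketch(t1) == hash_table_sketch(t2)   # column order is irrelevant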
Computes final hash of the processed packet -# """ - -# def __init__( -# self, -# hash_algorithm: str = "sha256", -# semantic_type_hashers: dict[str, SemanticTypeHasher] | None = None, -# chunk_size: int = 8192, -# hasher_id: str | None = None, -# handle_missing: str = "error", -# serialization_method: str = "logical", -# # TODO: consider passing options for serialization method -# ): -# """ -# Initialize SemanticArrowHasher. - -# Args: -# chunk_size: Size of chunks to read files in bytes -# handle_missing: How to handle missing files ('error', 'skip', 'null_hash') -# """ -# if hasher_id is None: -# hasher_id = f"semantic_arrow_hasher:{hash_algorithm}:{serialization_method}" -# self._hasher_id = hasher_id -# self.chunk_size = chunk_size -# self.handle_missing = handle_missing -# self.semantic_type_hashers: dict[str, SemanticTypeHasher] = ( -# semantic_type_hashers or {} -# ) -# self.hash_algorithm = hash_algorithm -# if serialization_method not in SERIALIZATION_METHOD_LUT: -# raise ValueError( -# f"Invalid serialization method '{serialization_method}'. " -# f"Supported methods: {list(SERIALIZATION_METHOD_LUT.keys())}" -# ) -# self.serialization_method = serialization_method -# self._serialize_arrow_table = SERIALIZATION_METHOD_LUT[serialization_method] - -# def set_cacher(self, semantic_type: str, cacher: StringCacher) -> None: -# """ -# Add a string cacher for caching hash values. - -# This is a no-op for SemanticArrowHasher since it hashes column contents directly. -# """ -# if semantic_type in self.semantic_type_hashers: -# self.semantic_type_hashers[semantic_type].set_cacher(cacher) -# else: -# raise KeyError(f"No hasher registered for semantic type '{semantic_type}'") - -# @property -# def hasher_id(self) -> str: -# return self._hasher_id - -# def register_semantic_hasher(self, semantic_type: str, hasher: SemanticTypeHasher): -# """Register a custom hasher for a semantic type.""" -# self.semantic_type_hashers[semantic_type] = hasher - -# def _get_semantic_type(self, field: pa.Field) -> str | None: -# """Extract semantic_type from field metadata.""" -# if field.metadata is None: -# return None - -# metadata = field.metadata -# if b"semantic_type" in metadata: -# return metadata[b"semantic_type"].decode("utf-8") -# elif "semantic_type" in metadata: -# return metadata["semantic_type"] - -# return None - -# def _create_hash_column( -# self, -# original_column: pa.Array, -# hash_bytes: bytes, -# original_field: pa.Field, -# hash_algorithm: str | None = None, -# ) -> tuple[pa.Array, pa.Field]: -# """Create a new column containing the hash bytes.""" -# # Create array of hash bytes (one hash value repeated for each row) -# hash_value = hash_bytes.hex() # Convert to hex string for readability -# hash_array = pa.array([hash_value] * len(original_column)) - -# # Create new field with modified metadata -# new_metadata = dict(original_field.metadata) if original_field.metadata else {} -# new_metadata["original_semantic_type"] = new_metadata.get( -# "semantic_type", "unknown" -# ) -# new_metadata["semantic_type"] = "hash" -# new_metadata["hash_algorithm"] = hash_algorithm or self.hasher_id - -# new_field = pa.field( -# original_field.name, -# pa.large_string(), # Hash stored as large string -# nullable=original_field.nullable, -# metadata=new_metadata, -# ) - -# return hash_array, new_field - -# def _process_table_columns(self, table: pa.Table) -> pa.Table: -# # TODO: add copy of table-level metadata to the new table -# """Process table columns, replacing semantic type columns with their hashes.""" 
-# new_columns = [] -# new_fields = [] - -# for i, field in enumerate(table.schema): -# column = table.column(i) -# semantic_type = self._get_semantic_type(field) - -# if semantic_type in self.semantic_type_hashers: -# # Hash the column using the appropriate semantic hasher -# hasher = self.semantic_type_hashers[semantic_type] -# hash_bytes = hasher.hash_column(column) - -# # Replace column with hash -# hash_column, hash_field = self._create_hash_column( -# column, hash_bytes, field -# ) -# new_columns.append(hash_column) -# new_fields.append(hash_field) -# else: -# # Keep original column -# new_columns.append(column) -# new_fields.append(field) - -# # Create new table with processed columns -# new_schema = pa.schema(new_fields) -# return pa.table(new_columns, schema=new_schema) - -# def _sort_table_columns(self, table: pa.Table) -> pa.Table: -# """Sort table columns by field name for deterministic ordering.""" -# # Get column indices sorted by field name -# sorted_indices = sorted( -# range(len(table.schema)), key=lambda i: table.schema.field(i).name -# ) - -# # Reorder columns -# sorted_columns = [table.column(i) for i in sorted_indices] -# sorted_fields = [table.schema.field(i) for i in sorted_indices] - -# sorted_schema = pa.schema(sorted_fields) -# return pa.table(sorted_columns, schema=sorted_schema) - -# # def _serialize_table_ipc(self, table: pa.Table) -> bytes: -# # # TODO: fix and use logical table hashing instead -# # """Serialize table using Arrow IPC format for stable binary representation.""" -# # buffer = BytesIO() - -# # # Use IPC stream format for deterministic serialization -# # with ipc.new_stream(buffer, table.schema) as writer: -# # writer.write_table(table) - -# # return buffer.getvalue() - -# def hash_table(self, table: pa.Table, prefix_hasher_id: bool = True) -> str: -# """ -# Compute stable hash of Arrow table. - -# Args: -# table: Arrow table to hash - -# Returns: -# Hex string of the computed hash -# """ - -# # Step 1: Process columns with semantic types -# processed_table = self._process_table_columns(table) - -# # Step 2: Sort columns by name for deterministic ordering -# sorted_table = self._sort_table_columns(processed_table) - -# # normalize all string to large strings by passing through polars -# # TODO: consider cleaner approach in the future -# import polars as pl - -# sorted_table = pl.DataFrame(sorted_table).to_arrow() - -# # Step 3: Serialize using Arrow IPC format -# serialized_bytes = self._serialize_arrow_table(sorted_table) - -# # Step 4: Compute final hash -# hasher = hashlib.new(self.hash_algorithm) -# hasher.update(serialized_bytes) - -# hash_str = hasher.hexdigest() -# if prefix_hasher_id: -# hash_str = f"{self.hasher_id}@{hash_str}" - -# return hash_str - -# def hash_table_with_metadata(self, table: pa.Table) -> dict[str, Any]: -# """ -# Compute hash with additional metadata about the process. 
- -# Returns: -# Dictionary containing hash, metadata, and processing info -# """ -# processed_columns = [] - -# # Track processing steps -# for i, field in enumerate(table.schema): -# semantic_type = self._get_semantic_type(field) -# column_info = { -# "name": field.name, -# "original_type": str(field.type), -# "semantic_type": semantic_type, -# "processed": semantic_type in self.semantic_type_hashers, -# } -# processed_columns.append(column_info) - -# # Compute hash -# table_hash = self.hash_table(table) - -# return { -# "hash": table_hash, -# "num_rows": len(table), -# "num_columns": len(table.schema), -# "processed_columns": processed_columns, -# "column_order": [field.name for field in table.schema], -# } diff --git a/src/orcapod/hashing/content_identifiable.py b/src/orcapod/hashing/content_identifiable.py deleted file mode 100644 index af1ccb0..0000000 --- a/src/orcapod/hashing/content_identifiable.py +++ /dev/null @@ -1,331 +0,0 @@ -# from collections.abc import Collection, Mapping -# from pathlib import Path -# from typing import Any -# from uuid import UUID -# from orcapod import contexts -# import logging -# from orcapod.protocols import hashing_protocols as hp - -# logger = logging.getLogger(__name__) - - -# class ContentIdentifiableBase: -# """ -# Base class for content-identifiable objects. -# This class provides a way to define objects that can be uniquely identified -# based on their content rather than their identity in memory. Specifically, the identity of the -# object is determined by the structure returned by the `identity_structure` method. -# The hash of the object is computed based on the `identity_structure` using the provided `ObjectHasher`, -# which defaults to the one returned by `get_default_object_hasher`. -# Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. -# """ - -# def __init__( -# self, -# label: str | None = None, -# data_context: Any = None, # Placeholder for ObjectHasher or similar context -# ) -> None: -# """ -# Initialize the ContentHashable with an optional ObjectHasher. -# """ -# self._data_context = contexts.resolve_context(data_context) -# self._label = label -# self._cached_hash: bytes | None = None - -# @property -# def has_assigned_label(self) -> bool: -# """ -# Check if the label is explicitly set for this object. - -# Returns: -# bool: True if the label is explicitly set, False otherwise. -# """ -# return self._label is not None - -# @property -# def label(self) -> str: -# """ -# Get the label of this object. - -# Returns: -# str | None: The label of the object, or None if not set. -# """ -# return self._label or self.computed_label() or self.__class__.__name__ - -# @label.setter -# def label(self, label: str | None) -> None: -# """ -# Set the label of this object. - -# Args: -# label (str | None): The label to set for this object. -# """ -# self._label = label - -# def computed_label(self) -> str | None: -# """ -# Compute a label for this object based on its content. If label is not explicitly set for this object -# and computed_label returns a valid value, it will be used as label of this object. -# """ -# return None - -# def identity_structure(self) -> Any: -# """ -# Return a structure that represents the identity of this object. - -# Override this method in your subclass to provide a stable representation -# of your object's content. The structure should contain all fields that -# determine the object's identity. 
- -# Returns: -# Any: A structure representing this object's content, or None to use default hash -# """ -# return None - -# def content_hash(self) -> bytes: -# """ -# Compute a hash based on the content of this object. - -# This method uses the identity structure to compute a hash value. -# If no identity structure is provided, it will return None. - -# Returns: -# int: A hash value based on the content of this object, or None if no identity structure is provided. -# """ -# if self._cached_hash is None: -# structure = self.identity_structure() - -# processed_structure = process_structure(structure) - -# self._cached_hash = self._data_context.object_hasher.hash( -# processed_structure -# ) -# return self._cached_hash - -# def __hash__(self) -> int: -# """ -# Hash implementation that uses the identity structure if provided, -# otherwise falls back to the superclass's hash method. - -# Returns: -# int: A hash value based on either content or identity -# """ -# # Get the identity structure -# structure = self.identity_structure() -# if structure is None: -# # If no identity structure is provided, use the default hash -# return super().__hash__() - -# return self._data_context.object_hasher.hash_to_int(structure) - -# def __eq__(self, other: object) -> bool: -# """ -# Equality check that compares the identity structures of two objects. - -# Args: -# other (object): The object to compare against. - -# Returns: -# bool: True if both objects have the same identity structure, False otherwise. -# """ -# if not isinstance(other, ContentIdentifiableBase): -# return NotImplemented - -# return self.identity_structure() == other.identity_structure() - - -# def process_structure( -# obj: Any, -# visited: set[int] | None = None, -# force_hash: bool = True, -# function_info_extractor: hp.FunctionInfoExtractor | None = None, -# ) -> Any: -# """ -# Recursively process a structure to prepare it for hashing. 
- -# Args: -# obj: The object or structure to process -# visited: Set of object ids already visited (to handle circular references) -# function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation - -# Returns: -# A processed version of the structure suitable for stable hashing -# """ -# # Initialize the visited set if this is the top-level call -# if visited is None: -# visited = set() -# else: -# visited = visited.copy() # Copy to avoid modifying the original set - -# # Check for circular references - use object's memory address -# # NOTE: While id() is not stable across sessions, we only use it within a session -# # to detect circular references, not as part of the final hash -# obj_id = id(obj) -# if obj_id in visited: -# logger.debug( -# f"Detected circular reference for object of type {type(obj).__name__}" -# ) -# return "CircularRef" # Don't include the actual id in hash output - -# # For objects that could contain circular references, add to visited -# if isinstance(obj, (dict, list, tuple, set)) or not isinstance( -# obj, (str, int, float, bool, type(None)) -# ): -# visited.add(obj_id) - -# # Handle None -# if obj is None: -# return None - -# # TODO: currently using runtime_checkable on ContentIdentifiable protocol -# # Re-evaluate this strategy to see if a faster / more robust check could be used -# if isinstance(obj, hp.ContentIdentifiable): -# logger.debug( -# f"Processing ContentHashableBase instance of type {type(obj).__name__}" -# ) -# return obj.content_hash() - -# # Handle basic types -# if isinstance(obj, (str, int, float, bool)): -# return obj - -# # Handle bytes and bytearray -# if isinstance(obj, (bytes, bytearray)): -# logger.debug( -# f"Converting bytes/bytearray of length {len(obj)} to hex representation" -# ) -# return obj.hex() - -# # Handle Path objects -# if isinstance(obj, Path): -# logger.debug(f"Converting Path object to string: {obj}") -# raise NotImplementedError( -# "Path objects are not supported in this hasher. Please convert to string." -# ) -# return str(obj) - -# # Handle UUID objects -# if isinstance(obj, UUID): -# logger.debug(f"Converting UUID to string: {obj}") -# raise NotImplementedError( -# "UUID objects are not supported in this hasher. Please convert to string." 
-# ) -# return str(obj) - -# # Handle named tuples (which are subclasses of tuple) -# if hasattr(obj, "_fields") and isinstance(obj, tuple): -# logger.debug(f"Processing named tuple of type {type(obj).__name__}") -# # For namedtuples, convert to dict and then process -# d = {field: getattr(obj, field) for field in obj._fields} # type: ignore -# return process_structure(d, visited) - -# # Handle mappings (dict-like objects) -# if isinstance(obj, Mapping): -# # Process both keys and values -# processed_items = [ -# ( -# process_structure(k, visited), -# process_structure(v, visited), -# ) -# for k, v in obj.items() -# ] - -# # Sort by the processed keys for deterministic order -# processed_items.sort(key=lambda x: str(x[0])) - -# # Create a new dictionary with string keys based on processed keys -# # TODO: consider checking for possibly problematic values in processed_k -# # and issue a warning -# return { -# str(processed_k): processed_v -# for processed_k, processed_v in processed_items -# } - -# # Handle sets and frozensets -# if isinstance(obj, (set, frozenset)): -# logger.debug( -# f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" -# ) -# # Process each item first, then sort the processed results -# processed_items = [process_structure(item, visited) for item in obj] -# return sorted(processed_items, key=str) - -# # Handle collections (list-like objects) -# if isinstance(obj, Collection): -# logger.debug( -# f"Processing collection of type {type(obj).__name__} with {len(obj)} items" -# ) -# return [process_structure(item, visited) for item in obj] - -# # For functions, use the function_content_hash -# if callable(obj) and hasattr(obj, "__code__"): -# logger.debug(f"Processing function: {getattr(obj, '__name__')}") -# if function_info_extractor is not None: -# # Use the extractor to get a stable representation -# function_info = function_info_extractor.extract_function_info(obj) -# logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") - -# # simply return the function info as a stable representation -# return function_info -# else: -# raise ValueError( -# f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" -# ) - -# # handle data types -# if isinstance(obj, type): -# logger.debug(f"Processing class/type: {obj.__name__}") -# return f"type:{obj.__name__}" - -# # For other objects, attempt to create deterministic representation only if force_hash=True -# class_name = obj.__class__.__name__ -# module_name = obj.__class__.__module__ -# if force_hash: -# try: -# import re - -# logger.debug( -# f"Processing generic object of type {module_name}.{class_name}" -# ) - -# # Try to get a stable dict representation if possible -# if hasattr(obj, "__dict__"): -# # Sort attributes to ensure stable order -# attrs = sorted( -# (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") -# ) -# # Limit to first 10 attributes to avoid extremely long representations -# if len(attrs) > 10: -# logger.debug( -# f"Object has {len(attrs)} attributes, limiting to first 10" -# ) -# attrs = attrs[:10] -# attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] -# obj_repr = f"{{{', '.join(attr_strs)}}}" -# else: -# # Get basic repr but remove memory addresses -# logger.debug( -# "Object has no __dict__, using repr() with memory address removal" -# ) -# obj_repr = repr(obj) -# if len(obj_repr) > 1000: -# logger.debug( -# f"Object repr is {len(obj_repr)} chars, truncating to 1000" -# ) -# obj_repr = obj_repr[:1000] + "..." 
-# # Remove memory addresses which look like '0x7f9a1c2b3d4e' -# obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) - -# return f"{module_name}.{class_name}:{obj_repr}" -# except Exception as e: -# # Last resort - use class name only -# logger.warning(f"Failed to process object representation: {e}") -# try: -# return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" -# except AttributeError: -# logger.error("Could not determine object class, using UnknownObject") -# return "UnknownObject" -# else: -# raise ValueError( -# f"Processing of {obj} of type {module_name}.{class_name} is not supported" -# ) diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 956fd11..07fc518 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -14,7 +14,7 @@ class ObjectHasherBase(ABC): @abstractmethod - def hash(self, obj: object) -> bytes: ... + def hash_object(self, obj: object) -> hp.ContentHash: ... @property @abstractmethod @@ -23,8 +23,8 @@ def hasher_id(self) -> str: ... def hash_to_hex( self, obj: Any, char_count: int | None = None, prefix_hasher_id: bool = False ) -> str: - hash_bytes = self.hash(obj) - hex_str = hash_bytes.hex() + content_hash = self.hash_object(obj) + hex_str = content_hash.to_hex() # TODO: clean up this logic, as char_count handling is messy if char_count is not None: @@ -57,7 +57,8 @@ def hash_to_uuid( namespace: uuid.UUID = uuid.NAMESPACE_OID, ) -> uuid.UUID: """Convert hash to proper UUID5.""" - return uuid.uuid5(namespace, self.hash(obj)) + # TODO: decide whether to use to_hex or digest here + return uuid.uuid5(namespace, self.hash_object(obj).to_hex()) class BasicObjectHasher(ObjectHasherBase): @@ -126,7 +127,7 @@ def process_structure( logger.debug( f"Processing ContentHashableBase instance of type {type(obj).__name__}" ) - return self._hash_object(obj.identity_structure(), visited=visited).hex() + return self._hash_object(obj.identity_structure(), visited=visited).to_hex() # Handle basic types if isinstance(obj, (str, int, float, bool)): @@ -281,7 +282,7 @@ def _hash_object( self, obj: Any, visited: set[int] | None = None, - ) -> bytes: + ) -> hp.ContentHash: # Process the object to handle nested structures and HashableMixin instances processed = self.process_structure(obj, visited=visited) @@ -294,7 +295,7 @@ def _hash_object( ) # Create the hash - return hashlib.sha256(json_str).digest() + return hp.ContentHash(self.hasher_id, hashlib.sha256(json_str).digest()) - def hash(self, obj: object) -> bytes: + def hash_object(self, obj: object) -> hp.ContentHash: return self._hash_object(obj) diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py index bcd489f..7cd279a 100644 --- a/src/orcapod/hashing/semantic_type_hashers.py +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -54,9 +54,10 @@ def _hash_file_content(self, file_path: str) -> bytes: if self.cacher: # Cache the computed hash hex self.cacher.set_cached( - f"{self.cache_key_prefix}:{file_path}", hashed_value.hex() + f"{self.cache_key_prefix}:{file_path}", hashed_value.to_hex() ) - return hashed_value + # TODO: make consistent use of bytes/string for hash + return hashed_value.digest except (IOError, OSError, PermissionError) as e: if self.handle_missing == "error": diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index fe185f4..ac61dce 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -318,14 +318,13 
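In the object_hashers.py hunk above, hash_to_uuid now derives the UUID from the hex form of the ContentHash via uuid5 with NAMESPACE_OID, so the same object keeps the same UUID across runs and processes. A standalone illustration of that derivation pattern, using a plain sha256 over bytes in place of orcapod's structured object hashing:

import hashlib
import uuid


def hash_to_uuid_sketch(payload: bytes, namespace: uuid.UUID = uuid.NAMESPACE_OID) -> uuid.UUID:
    """Derive a deterministic UUIDv5 from a content digest's hex string."""
    digest_hex = hashlib.sha256(payload).hexdigest()
    return uuid.uuid5(namespace, digest_hex)


u1 = hash_to_uuid_sketch(b'{"a": 1}')
u2 = hash_to_uuid_sketch(b'{"a": 1}')
assert u1 == u2            # stable across runs and processes
print(u1)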
@@ def add_pipeline_record( ) -> None: # combine dp.Tag with packet content hash to compute entry hash # TODO: add system tag columns + # TODO: consider using bytes instead of string representation tag_with_hash = tag.as_table(include_system_tags=True).append_column( constants.INPUT_PACKET_HASH, - pa.array([input_packet.content_hash()], type=pa.large_string()), + pa.array([str(input_packet.content_hash())], type=pa.large_string()), ) - entry_id = self.data_context.arrow_hasher.hash_table( - tag_with_hash, prefix_hasher_id=True - ) + entry_id = str(self.data_context.arrow_hasher.hash_table(tag_with_hash)) # FIXME: consider and implement more robust cache lookup logic existing_record = None if not skip_cache_lookup: diff --git a/src/orcapod/protocols/data_protocols.py b/src/orcapod/protocols/data_protocols.py deleted file mode 100644 index 1e07ece..0000000 --- a/src/orcapod/protocols/data_protocols.py +++ /dev/null @@ -1,2297 +0,0 @@ -from collections.abc import Collection, Iterator, Mapping, Callable -from datetime import datetime -from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING, runtime_checkable -from orcapod.protocols.hashing_protocols import ContentIdentifiable -from orcapod.types import DataValue, TypeSpec - - -if TYPE_CHECKING: - import pyarrow as pa - import polars as pl - import pandas as pd - - -@runtime_checkable -class ExecutionEngine(Protocol): - @property - def name(self) -> str: ... - - def submit_sync(self, function: Callable, *args, **kwargs) -> Any: - """ - Run the given function with the provided arguments. - This method should be implemented by the execution engine. - """ - ... - - async def submit_async(self, function: Callable, *args, **kwargs) -> Any: - """ - Asynchronously run the given function with the provided arguments. - This method should be implemented by the execution engine. - """ - ... - - -@runtime_checkable -class Datagram(Protocol): - """ - Protocol for immutable datagram containers in Orcapod. - - Datagrams are the fundamental units of data that flow through the system. - They provide a unified interface for data access, conversion, and manipulation, - ensuring consistent behavior across different storage backends (dict, Arrow table, etc.). - - Each datagram contains: - - **Data columns**: The primary business data (user_id, name, etc.) - - **Meta columns**: Internal system metadata with {orcapod.META_PREFIX} ('__') prefixes (__processed_at, etc.) - - **Context column**: Data context information ({orcapod.CONTEXT_KEY}) - - Future Packet subclass will also include: - - **Source info columns**: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) - - When exposing to external tools without field metadata support, semantic types - are encoded as `_{semantic_type}_` prefixes (_path_config_file, _id_user_name, etc.). - - All operations are immutable - methods return new datagram instances rather than - modifying existing ones. - - Example: - >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"}) - >>> updated = datagram.update(name="Alice Smith") - >>> filtered = datagram.select("user_id", "name") - >>> table = datagram.as_table() - """ - - # 1. Core Properties (Identity & Structure) - @property - def data_context_key(self) -> str: - """ - Return the data context key for this datagram. 
- - This key identifies the semantic type registry, arrow hasher, and other - contextual information needed to properly interpret and work with this - datagram across various operations (storage, visualization, processing, etc.). - - Returns: - str: Context key for proper datagram interpretation - """ - ... - - @property - def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names (with {orcapod.META_PREFIX} ('__') prefix).""" - ... - - # 2. Dict-like Interface (Data Access) - def __getitem__(self, key: str) -> DataValue: - """ - Get data column value by key. - - Provides dict-like access to data columns only. Meta columns - are not accessible through this method (use `get_meta_value()` instead). - - Args: - key: Data column name. - - Returns: - The value stored in the specified data column. - - Raises: - KeyError: If the column doesn't exist in data columns. - - Example: - >>> datagram["user_id"] - 123 - >>> datagram["name"] - 'Alice' - """ - ... - - def __contains__(self, key: str) -> bool: - """ - Check if data column exists. - - Args: - key: Column name to check. - - Returns: - True if column exists in data columns, False otherwise. - - Example: - >>> "user_id" in datagram - True - >>> "nonexistent" in datagram - False - """ - ... - - def __iter__(self) -> Iterator[str]: - """ - Iterate over data column names. - - Provides for-loop support over column names, enabling natural iteration - patterns without requiring conversion to dict. - - Yields: - Data column names in no particular order. - - Example: - >>> for column in datagram: - ... value = datagram[column] - ... print(f"{column}: {value}") - """ - ... - - def get(self, key: str, default: DataValue = None) -> DataValue: - """ - Get data column value with default fallback. - - Args: - key: Data column name. - default: Value to return if column doesn't exist. - - Returns: - Column value if exists, otherwise the default value. - - Example: - >>> datagram.get("user_id") - 123 - >>> datagram.get("missing", "default") - 'default' - """ - ... - - # 3. Structural Information - def keys( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> tuple[str, ...]: - """ - Return tuple of column names. - - Provides access to column names with filtering options for different - column types. Default returns only data column names. - - Args: - include_meta_columns: Controls meta column inclusion. - - False: Return only data column names (default) - - True: Include all meta column names - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - - Returns: - Tuple of column names based on inclusion criteria. - - Example: - >>> datagram.keys() # Data columns only - ('user_id', 'name', 'email') - >>> datagram.keys(include_meta_columns=True) - ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_meta_columns=["pipeline"]) - ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_context=True) - ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') - """ - ... - - def types( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> TypeSpec: - """ - Return type specification mapping field names to Python types. 
- - The TypeSpec enables type checking and validation throughout the system. - - Args: - include_meta_columns: Controls meta column type inclusion. - - False: Exclude meta column types (default) - - True: Include all meta column types - - Collection[str]: Include meta column types matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context type. - - Returns: - TypeSpec mapping field names to their Python types. - - Example: - >>> datagram.types() - {'user_id': , 'name': } - """ - ... - - def arrow_schema( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> "pa.Schema": - """ - Return PyArrow schema representation. - - The schema provides structured field and type information for efficient - serialization and deserialization with PyArrow. - - Args: - include_meta_columns: Controls meta column schema inclusion. - - False: Exclude meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - - Returns: - PyArrow Schema describing the datagram structure. - - Example: - >>> schema = datagram.arrow_schema() - >>> schema.names - ['user_id', 'name'] - """ - ... - - def content_hash(self) -> str: - """ - Return deterministic hash of datagram content. - - The hash should reflect the data content, typically excluding meta columns - and context. Used for caching, comparison, and deduplication. For exact details of - hash computation, refer to the implementation in the specific datagram class/subclass. - - Returns: - Deterministic content hash string. - - Note: - Two datagrams with identical data columns will have the same hash, - even if they differ in meta columns or context. - - Example: - >>> datagram.content_hash() - 'sha256:abc123def456...' - """ - ... - - # 4. Format Conversions (Export) - def as_dict( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, DataValue]: - """ - Convert datagram to dictionary format. - - Provides a simple key-value representation useful for debugging, - serialization, and interop with dict-based APIs. - - Args: - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context key. - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - - - Returns: - Dictionary with requested columns as key-value pairs. - - Example: - >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} - >>> full_data = datagram.as_dict( - ... include_meta_columns=True, - ... include_context=True - ... ) - """ - ... - - def as_table( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> "pa.Table": - """ - Convert datagram to PyArrow Table format. - - Provides a standardized columnar representation suitable for analysis, - processing, and interoperability with Arrow-based tools. 
- - Args: - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context column. - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - - Returns: - PyArrow Table with requested columns. - - Example: - >>> table = datagram.as_table() # Data columns only - >>> full_table = datagram.as_table( - ... include_meta_columns=True, - ... include_context=True - ... ) - >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" - """ - ... - - def as_arrow_compatible_dict( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - ) -> dict[str, Any]: - """ - Return dictionary with values optimized for Arrow table conversion. - - This method returns a dictionary where values are in a form that can be - efficiently converted to Arrow format using pa.Table.from_pylist(). - - The key insight is that this avoids the expensive as_table() → concat pattern - by providing values that are "Arrow-ready" while remaining in dict format - for efficient batching. - - Implementation note: This may involve format conversions (e.g., Path objects - to strings, datetime objects to ISO strings, etc.) to ensure compatibility - with Arrow's expected input formats. - - Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied - with arrow_schema(...) with the same argument options to ensure that the schema matches the table. - - Args: - include_all_info: Include all available information - include_meta_columns: Controls meta column inclusion - include_context: Whether to include context key - - Returns: - Dictionary with values optimized for Arrow conversion - - Example: - # Efficient batch conversion pattern - arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams] - schema = datagrams[0].arrow_schema() - table = pa.Table.from_pylist(arrow_dicts, schema=schema) - """ - ... - - # 5. Meta Column Operations - def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: - """ - Get meta column value with optional default. - - Meta columns store operational metadata and use {orcapod.META_PREFIX} ('__') prefixes. - This method handles both prefixed and unprefixed key formats. - - Args: - key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). - default: Value to return if meta column doesn't exist. - - Returns: - Meta column value if exists, otherwise the default value. - - Example: - >>> datagram.get_meta_value("pipeline_version") # Auto-prefixed - 'v2.1.0' - >>> datagram.get_meta_value("__pipeline_version") # Already prefixed - 'v2.1.0' - >>> datagram.get_meta_value("missing", "default") - 'default' - """ - ... - - def with_meta_columns(self, **updates: DataValue) -> Self: - """ - Create new datagram with updated meta columns. - - Adds or updates operational metadata while preserving all data columns. - Keys are automatically prefixed with {orcapod.META_PREFIX} ('__') if needed. - - Args: - **updates: Meta column updates as keyword arguments. - - Returns: - New datagram instance with updated meta columns. - - Example: - >>> tracked = datagram.with_meta_columns( - ... 
processed_by="pipeline_v2", - ... timestamp="2024-01-15T10:30:00Z" - ... ) - """ - ... - - def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: - """ - Create new datagram with specified meta columns removed. - - Args: - *keys: Meta column keys to remove (prefixes optional). - ignore_missing: If True, ignore missing columns without raising an error. - - - Returns: - New datagram instance without specified meta columns. - - Raises: - KeryError: If any specified meta column to drop doesn't exist and ignore_missing=False. - - Example: - >>> cleaned = datagram.drop_meta_columns("old_source", "temp_debug") - """ - ... - - # 6. Data Column Operations - def select(self, *column_names: str) -> Self: - """ - Create new datagram with only specified data columns. - - Args: - *column_names: Data column names to keep. - - - Returns: - New datagram instance with only specified data columns. All other columns including - meta columns and context are preserved. - - Raises: - KeyError: If any specified column doesn't exist. - - Example: - >>> subset = datagram.select("user_id", "name", "email") - """ - ... - - def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: - """ - Create new datagram with specified data columns removed. Note that this does not - remove meta columns or context column. Refer to `drop_meta_columns()` for dropping - specific meta columns. Context key column can never be dropped but a modified copy - can be created with a different context key using `with_data_context()`. - - Args: - *column_names: Data column names to remove. - ignore_missing: If True, ignore missing columns without raising an error. - - Returns: - New datagram instance without specified data columns. - - Raises: - KeryError: If any specified column to drop doesn't exist and ignore_missing=False. - - Example: - >>> filtered = datagram.drop("temp_field", "debug_info") - """ - ... - - def rename( - self, - column_mapping: Mapping[str, str], - ) -> Self: - """ - Create new datagram with data columns renamed. - - Args: - column_mapping: Mapping from old names to new names. - - Returns: - New datagram instance with renamed data columns. - - Example: - >>> renamed = datagram.rename( - ... {"old_id": "user_id", "old_name": "full_name"}, - ... column_types={"user_id": int} - ... ) - """ - ... - - def update(self, **updates: DataValue) -> Self: - """ - Create new datagram with existing column values updated. - - Updates values in existing data columns. Will error if any specified - column doesn't exist - use with_columns() to add new columns. - - Args: - **updates: Column names and their new values. - - Returns: - New datagram instance with updated values. - - Raises: - KeyError: If any specified column doesn't exist. - - Example: - >>> updated = datagram.update( - ... file_path="/new/absolute/path.txt", - ... status="processed" - ... ) - """ - ... - - def with_columns( - self, - column_types: Mapping[str, type] | None = None, - **updates: DataValue, - ) -> Self: - """ - Create new datagram with additional data columns. - - Adds new data columns to the datagram. Will error if any specified - column already exists - use update() to modify existing columns. - - Args: - column_types: Optional type specifications for new columns. If not provided, the column type is - inferred from the provided values. If value is None, the column type defaults to `str`. - **kwargs: New columns as keyword arguments. - - Returns: - New datagram instance with additional data columns. 
- - Raises: - ValueError: If any specified column already exists. - - Example: - >>> expanded = datagram.with_columns( - ... status="active", - ... score=95.5, - ... column_types={"score": float} - ... ) - """ - ... - - # 7. Context Operations - def with_context_key(self, new_context_key: str) -> Self: - """ - Create new datagram with different context key. - - Changes the semantic interpretation context while preserving all data. - The context key affects how columns are processed and converted. - - Args: - new_context_key: New context key string. - - Returns: - New datagram instance with updated context key. - - Note: - How the context is interpreted depends on the datagram implementation. - Semantic processing may be rebuilt for the new context. - - Example: - >>> financial_datagram = datagram.with_context_key("financial_v1") - """ - ... - - # 8. Utility Operations - def copy(self) -> Self: - """ - Create a shallow copy of the datagram. - - Returns a new datagram instance with the same data and cached values. - This is more efficient than reconstructing from scratch when you need - an identical datagram instance. - - Returns: - New datagram instance with copied data and caches. - - Example: - >>> copied = datagram.copy() - >>> copied is datagram # False - different instance - False - """ - ... - - # 9. String Representations - def __str__(self) -> str: - """ - Return user-friendly string representation. - - Shows the datagram as a simple dictionary for user-facing output, - messages, and logging. Only includes data columns for clean output. - - Returns: - Dictionary-style string representation of data columns only. - """ - ... - - def __repr__(self) -> str: - """ - Return detailed string representation for debugging. - - Shows the datagram type and comprehensive information for debugging. - - Returns: - Detailed representation with type and metadata information. - """ - ... - - -@runtime_checkable -class Tag(Datagram, Protocol): - """ - Metadata associated with each data item in a stream. - - Tags carry contextual information about data packets as they flow through - the computational graph. They are immutable and provide metadata that - helps with: - - Data lineage tracking - - Grouping and aggregation operations - - Temporal information (timestamps) - - Source identification - - Processing context - - Common examples include: - - Timestamps indicating when data was created/processed - - Source identifiers showing data origin - - Processing metadata like batch IDs or session information - - Grouping keys for aggregation operations - - Quality indicators or confidence scores - """ - - def keys( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> tuple[str, ...]: - """ - Return tuple of column names. - - Provides access to column names with filtering options for different - column types. Default returns only data column names. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Return only data column names (default) - - True: Include all meta column names - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. 
- - - Returns: - Tuple of column names based on inclusion criteria. - - Example: - >>> datagram.keys() # Data columns only - ('user_id', 'name', 'email') - >>> datagram.keys(include_meta_columns=True) - ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_meta_columns=["pipeline"]) - ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_context=True) - ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') - """ - ... - - def types( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> TypeSpec: - """ - Return type specification mapping field names to Python types. - - The TypeSpec enables type checking and validation throughout the system. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column type inclusion. - - False: Exclude meta column types (default) - - True: Include all meta column types - - Collection[str]: Include meta column types matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context type. - include_source: Whether to include source info fields. - - Returns: - TypeSpec mapping field names to their Python types. - - Example: - >>> datagram.types() - {'user_id': , 'name': } - """ - ... - - def arrow_schema( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> "pa.Schema": - """ - Return PyArrow schema representation. - - The schema provides structured field and type information for efficient - serialization and deserialization with PyArrow. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column schema inclusion. - - False: Exclude meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. - - - Returns: - PyArrow Schema describing the datagram structure. - - Example: - >>> schema = datagram.arrow_schema() - >>> schema.names - ['user_id', 'name'] - """ - ... - - def as_dict( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> dict[str, DataValue]: - """ - Convert datagram to dictionary format. - - Provides a simple key-value representation useful for debugging, - serialization, and interop with dict-based APIs. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context key. - include_source: Whether to include source info fields. 
- - - Returns: - Dictionary with requested columns as key-value pairs. - - Example: - >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} - >>> full_data = datagram.as_dict( - ... include_meta_columns=True, - ... include_context=True - ... ) - """ - ... - - def as_table( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_system_tags: bool = False, - ) -> "pa.Table": - """ - Convert datagram to PyArrow Table format. - - Provides a standardized columnar representation suitable for analysis, - processing, and interoperability with Arrow-based tools. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context column. - include_source: Whether to include source info columns in the schema. - - Returns: - PyArrow Table with requested columns. - - Example: - >>> table = datagram.as_table() # Data columns only - >>> full_table = datagram.as_table( - ... include_meta_columns=True, - ... include_context=True - ... ) - >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" - """ - ... - - # TODO: add this back - # def as_arrow_compatible_dict( - # self, - # include_all_info: bool = False, - # include_meta_columns: bool | Collection[str] = False, - # include_context: bool = False, - # include_source: bool = False, - # ) -> dict[str, Any]: - # """Extended version with source info support.""" - # ... - - def as_datagram( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_system_tags: bool = False, - ) -> Datagram: - """ - Convert the packet to a Datagram. - - Args: - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - - Returns: - Datagram: Datagram representation of packet data - """ - ... - - def system_tags(self) -> dict[str, DataValue]: - """ - Return metadata about the packet's source/origin. - - Provides debugging and lineage information about where the packet - originated. May include information like: - - File paths for file-based sources - - Database connection strings - - API endpoints - - Processing pipeline information - - Returns: - dict[str, str | None]: Source information for each data column as key-value pairs. - """ - ... - - -@runtime_checkable -class Packet(Datagram, Protocol): - """ - The actual data payload in a stream. - - Packets represent the core data being processed through the computational - graph. Unlike Tags (which are metadata), Packets contain the actual - information that computations operate on. - - Packets extend Datagram with additional capabilities for: - - Source tracking and lineage - - Content-based hashing for caching - - Metadata inclusion for debugging - - The distinction between Tag and Packet is crucial for understanding - data flow: Tags provide context, Packets provide content. 
- """ - - def keys( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> tuple[str, ...]: - """ - Return tuple of column names. - - Provides access to column names with filtering options for different - column types. Default returns only data column names. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Return only data column names (default) - - True: Include all meta column names - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. - - - Returns: - Tuple of column names based on inclusion criteria. - - Example: - >>> datagram.keys() # Data columns only - ('user_id', 'name', 'email') - >>> datagram.keys(include_meta_columns=True) - ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_meta_columns=["pipeline"]) - ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') - >>> datagram.keys(include_context=True) - ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') - """ - ... - - def types( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> TypeSpec: - """ - Return type specification mapping field names to Python types. - - The TypeSpec enables type checking and validation throughout the system. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column type inclusion. - - False: Exclude meta column types (default) - - True: Include all meta column types - - Collection[str]: Include meta column types matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context type. - include_source: Whether to include source info fields. - - Returns: - TypeSpec mapping field names to their Python types. - - Example: - >>> datagram.types() - {'user_id': , 'name': } - """ - ... - - def arrow_schema( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> "pa.Schema": - """ - Return PyArrow schema representation. - - The schema provides structured field and type information for efficient - serialization and deserialization with PyArrow. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column schema inclusion. - - False: Exclude meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include context column. - include_source: Whether to include source info fields. - - - Returns: - PyArrow Schema describing the datagram structure. - - Example: - >>> schema = datagram.arrow_schema() - >>> schema.names - ['user_id', 'name'] - """ - ... 
- - def as_dict( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> dict[str, DataValue]: - """ - Convert datagram to dictionary format. - - Provides a simple key-value representation useful for debugging, - serialization, and interop with dict-based APIs. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context key. - include_source: Whether to include source info fields. - - - Returns: - Dictionary with requested columns as key-value pairs. - - Example: - >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} - >>> full_data = datagram.as_dict( - ... include_meta_columns=True, - ... include_context=True - ... ) - """ - ... - - def as_table( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_context: bool = False, - include_source: bool = False, - ) -> "pa.Table": - """ - Convert datagram to PyArrow Table format. - - Provides a standardized columnar representation suitable for analysis, - processing, and interoperability with Arrow-based tools. - - Args: - include_all_info: If True, include all available information. This option supersedes all other inclusion options. - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - include_context: Whether to include the context column. - include_source: Whether to include source info columns in the schema. - - Returns: - PyArrow Table with requested columns. - - Example: - >>> table = datagram.as_table() # Data columns only - >>> full_table = datagram.as_table( - ... include_meta_columns=True, - ... include_context=True - ... ) - >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" - """ - ... - - # TODO: add this back - # def as_arrow_compatible_dict( - # self, - # include_all_info: bool = False, - # include_meta_columns: bool | Collection[str] = False, - # include_context: bool = False, - # include_source: bool = False, - # ) -> dict[str, Any]: - # """Extended version with source info support.""" - # ... - - def as_datagram( - self, - include_all_info: bool = False, - include_meta_columns: bool | Collection[str] = False, - include_source: bool = False, - ) -> Datagram: - """ - Convert the packet to a Datagram. - - Args: - include_meta_columns: Controls meta column inclusion. - - False: Exclude all meta columns (default) - - True: Include all meta columns - - Collection[str]: Include meta columns matching these prefixes. If absent, - {orcapod.META_PREFIX} ('__') prefix is prepended to each key. - - Returns: - Datagram: Datagram representation of packet data - """ - ... - - def source_info(self) -> dict[str, str | None]: - """ - Return metadata about the packet's source/origin. - - Provides debugging and lineage information about where the packet - originated. 
May include information like: - - File paths for file-based sources - - Database connection strings - - API endpoints - - Processing pipeline information - - Returns: - dict[str, str | None]: Source information for each data column as key-value pairs. - """ - ... - - def with_source_info( - self, - **source_info: str | None, - ) -> Self: - """ - Create new packet with updated source information. - - Adds or updates source metadata for the packet. This is useful for - tracking data provenance and lineage through the computational graph. - - Args: - **source_info: Source metadata as keyword arguments. - - Returns: - New packet instance with updated source information. - - Example: - >>> updated_packet = packet.with_source_info( - ... file_path="/new/path/to/file.txt", - ... source_id="source_123" - ... ) - """ - ... - - -@runtime_checkable -class PodFunction(Protocol): - """ - A function suitable for use in a FunctionPod. - - PodFunctions define the computational logic that operates on individual - packets within a Pod. They represent pure functions that transform - data values without side effects. - - These functions are designed to be: - - Stateless: No dependency on external state - - Deterministic: Same inputs always produce same outputs - - Serializable: Can be cached and distributed - - Type-safe: Clear input/output contracts - - PodFunctions accept named arguments corresponding to packet fields - and return transformed data values. - """ - - def __call__(self, **kwargs: DataValue) -> None | DataValue: - """ - Execute the pod function with the given arguments. - - The function receives packet data as named arguments and returns - either transformed data or None (for filtering operations). - - Args: - **kwargs: Named arguments mapping packet fields to data values - - Returns: - None: Filter out this packet (don't include in output) - DataValue: Single transformed value - - Raises: - TypeError: If required arguments are missing - ValueError: If argument values are invalid - """ - ... - - -@runtime_checkable -class Labelable(Protocol): - """ - Protocol for objects that can have a human-readable label. - - Labels provide meaningful names for objects in the computational graph, - making debugging, visualization, and monitoring much easier. They serve - as human-friendly identifiers that complement the technical identifiers - used internally. - - Labels are optional but highly recommended for: - - Debugging complex computational graphs - - Visualization and monitoring tools - - Error messages and logging - - User interfaces and dashboards - """ - - @property - def label(self) -> str | None: - """ - Return the human-readable label for this object. - - Labels should be descriptive and help users understand the purpose - or role of the object in the computational graph. - - Returns: - str: Human-readable label for this object - None: No label is set (will use default naming) - """ - ... - - -@runtime_checkable -class Stream(ContentIdentifiable, Labelable, Protocol): - """ - Base protocol for all streams in Orcapod. - - Streams represent sequences of (Tag, Packet) pairs flowing through the - computational graph. They are the fundamental data structure connecting - kernels and carrying both data and metadata. 
- - Streams can be either: - - Static: Immutable snapshots created at a specific point in time - - Live: Dynamic streams that stay current with upstream dependencies - - All streams provide: - - Iteration over (tag, packet) pairs - - Type information and schema access - - Lineage information (source kernel and upstream streams) - - Basic caching and freshness tracking - - Conversion to common formats (tables, dictionaries) - """ - - @property - def substream_identities(self) -> tuple[str, ...]: - """ - Unique identifiers for sub-streams within this stream. - - This property provides a way to identify and differentiate - sub-streams that may be part of a larger stream. It is useful - for tracking and managing complex data flows. - - Returns: - tuple[str, ...]: Unique identifiers for each sub-stream - """ - ... - - @property - def execution_engine(self) -> ExecutionEngine | None: - """ - The execution engine attached to this stream. By default, the stream - will use this execution engine whenever it needs to perform computation. - None means the stream is not attached to any execution engine and will default - to running natively. - """ - - @execution_engine.setter - def execution_engine(self, engine: ExecutionEngine | None) -> None: - """ - Set the execution engine for this stream. - - This allows the stream to use a specific execution engine for - computation, enabling optimized execution strategies and resource - management. - - Args: - engine: The execution engine to attach to this stream - """ - ... - - def get_substream(self, substream_id: str) -> "Stream": - """ - Retrieve a specific sub-stream by its identifier. - - This method allows access to individual sub-streams within the - main stream, enabling focused operations on specific data segments. - - Args: - substream_id: Unique identifier for the desired sub-stream. - - Returns: - Stream: The requested sub-stream if it exists - """ - ... - - @property - def source(self) -> "Kernel | None": - """ - The kernel that produced this stream. - - This provides lineage information for tracking data flow through - the computational graph. Root streams (like file sources) may - have no source kernel. - - Returns: - Kernel: The source kernel that created this stream - None: This is a root stream with no source kernel - """ - ... - - @property - def upstreams(self) -> tuple["Stream", ...]: - """ - Input streams used to produce this stream. - - These are the streams that were provided as input to the source - kernel when this stream was created. Used for dependency tracking - and cache invalidation. - - Returns: - tuple[Stream, ...]: Upstream dependency streams (empty for sources) - """ - ... - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Available keys/fields in the stream content. - - Returns the field names present in both tags and packets. - This provides schema information without requiring type details, - useful for: - - Schema inspection and exploration - - Query planning and optimization - - Field validation and mapping - - Returns: - tuple[tuple[str, ...], tuple[str, ...]]: (tag_keys, packet_keys) - """ - ... - - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: - """ - Type specifications for the stream content. - - Returns the type schema for both tags and packets in this stream. 
- This information is used for: - - Type checking and validation - - Schema inference and planning - - Compatibility checking between kernels - - Returns: - tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) - """ - ... - - @property - def last_modified(self) -> datetime | None: - """ - When the stream's content was last modified. - - This property is crucial for caching decisions and dependency tracking: - - datetime: Content was last modified at this time (cacheable) - - None: Content is never stable, always recompute (some dynamic streams) - - Both static and live streams typically return datetime values, but - live streams update this timestamp whenever their content changes. - - Returns: - datetime: Timestamp of last modification for most streams - None: Stream content is never stable (some special dynamic streams) - """ - ... - - @property - def is_current(self) -> bool: - """ - Whether the stream is up-to-date with its dependencies. - - A stream is current if its content reflects the latest state of its - source kernel and upstream streams. This is used for cache validation - and determining when refresh is needed. - - For live streams, this should always return True since they stay - current automatically. For static streams, this indicates whether - the cached content is still valid. - - Returns: - bool: True if stream is up-to-date, False if refresh needed - """ - ... - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - """ - Iterate over (tag, packet) pairs in the stream. - - This is the primary way to access stream data. The behavior depends - on the stream type: - - Static streams: Return cached/precomputed data - - Live streams: May trigger computation and always reflect current state - - Yields: - tuple[Tag, Packet]: Sequential (tag, packet) pairs - """ - ... - - def iter_packets( - self, execution_engine: ExecutionEngine | None = None - ) -> Iterator[tuple[Tag, Packet]]: - """ - Alias for __iter__ for explicit packet iteration. - - Provides a more explicit method name when the intent is to iterate - over packets specifically, improving code readability. - - This method must return an immutable iterator -- that is, the returned iterator - should not change and must consistently return identical tag,packet pairs across - multiple iterations of the iterator. - - Note that this is NOT to mean that multiple invocation of `iter_packets` must always - return an identical iterator. The iterator returned by `iter_packets` may change - between invocations, but the iterator itself must not change. Consequently, it should be understood - that the returned iterators may be a burden on memory if the stream is large or infinite. - - Yields: - tuple[Tag, Packet]: Sequential (tag, packet) pairs - """ - ... - - def run(self, execution_engine: ExecutionEngine | None = None) -> None: - """ - Execute the stream using the provided execution engine. - - This method triggers computation of the stream content based on its - source kernel and upstream streams. It returns a new stream instance - containing the computed (tag, packet) pairs. - - Args: - execution_engine: The execution engine to use for computation - - """ - ... - - async def run_async(self, execution_engine: ExecutionEngine | None = None) -> None: - """ - Asynchronously execute the stream using the provided execution engine. - - This method triggers computation of the stream content based on its - source kernel and upstream streams. It returns a new stream instance - containing the computed (tag, packet) pairs. 
- - Args: - execution_engine: The execution engine to use for computation - - """ - ... - - def as_df( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - execution_engine: ExecutionEngine | None = None, - ) -> "pl.DataFrame | None": - """ - Convert the entire stream to a Polars DataFrame. - """ - ... - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - execution_engine: ExecutionEngine | None = None, - ) -> "pa.Table": - """ - Convert the entire stream to a PyArrow Table. - - Materializes all (tag, packet) pairs into a single table for - analysis and processing. This operation may be expensive for - large streams or live streams that need computation. - - If include_content_hash is True, an additional column called "_content_hash" - containing the content hash of each packet is included. If include_content_hash - is a string, it is used as the name of the content hash column. - - Returns: - pa.Table: Complete stream data as a PyArrow Table - """ - ... - - def flow( - self, execution_engine: ExecutionEngine | None = None - ) -> Collection[tuple[Tag, Packet]]: - """ - Return the entire stream as a collection of (tag, packet) pairs. - - This method materializes the stream content into a list or similar - collection type. It is useful for small streams or when you need - to process all data at once. - - Args: - execution_engine: Optional execution engine to use for computation. - If None, the stream will use its default execution engine. - """ - ... - - def join(self, other_stream: "Stream") -> "Stream": - """ - Join this stream with another stream. - - Combines two streams into a single stream by merging their content. - The resulting stream contains all (tag, packet) pairs from both - streams, preserving their order. - - Args: - other_stream: The other stream to join with this one. - - Returns: - Self: New stream containing combined content from both streams. - """ - ... - - def semi_join(self, other_stream: "Stream") -> "Stream": - """ - Perform a semi-join with another stream. - - This operation filters this stream to only include packets that have - corresponding tags in the other stream. The resulting stream contains - all (tag, packet) pairs from this stream that match tags in the other. - - Args: - other_stream: The other stream to semi-join with this one. - - Returns: - Self: New stream containing filtered content based on the semi-join. - """ - ... - - def map_tags( - self, name_map: Mapping[str, str], drop_unmapped: bool = True - ) -> "Stream": - """ - Map tag names in this stream to new names based on the provided mapping. - """ - ... - - def map_packets( - self, name_map: Mapping[str, str], drop_unmapped: bool = True - ) -> "Stream": - """ - Map packet names in this stream to new names based on the provided mapping. - """ - ... - - -@runtime_checkable -class LiveStream(Stream, Protocol): - """ - A stream that automatically stays up-to-date with its upstream dependencies. - - LiveStream extends the base Stream protocol with capabilities for "up-to-date" - data flow and reactive computation. Unlike static streams which represent - snapshots, LiveStreams provide the guarantee that their content always - reflects the current state of their dependencies. 
- - Key characteristics: - - Automatically refresh the stream if changes in the upstreams are detected - - Track last_modified timestamp when content changes - - Support manual refresh triggering and invalidation - - By design, LiveStream would return True for is_current except when auto-update fails. - - LiveStreams are always returned by Kernel.__call__() methods, ensuring - that normal kernel usage produces live, up-to-date results. - - Caching behavior: - - last_modified updates whenever content changes - - Can be cached based on dependency timestamps - - Invalidation happens automatically when upstreams change - - Use cases: - - Real-time data processing pipelines - - Reactive user interfaces - - Monitoring and alerting systems - - Dynamic dashboard updates - - Any scenario requiring current data - """ - - def refresh(self, force: bool = False) -> bool: - """ - Manually trigger a refresh of this stream's content. - - Forces the stream to check its upstream dependencies and update - its content if necessary. This is useful when: - - You want to ensure the latest data before a critical operation - - You need to force computation at a specific time - - You're debugging data flow issues - - You want to pre-compute results for performance - Args: - force: If True, always refresh even if the stream is current. - If False, only refresh if the stream is not current. - - Returns: - bool: True if the stream was refreshed, False if it was already current. - Note: LiveStream refreshes automatically on access, so this - method may be a no-op for some implementations. However, it's - always safe to call if you need to control when the cache is refreshed. - """ - ... - - def invalidate(self) -> None: - """ - Mark this stream as invalid, forcing a refresh on next access. - - This method is typically called when: - - Upstream dependencies have changed - - The source kernel has been modified - - External data sources have been updated - - Manual cache invalidation is needed - - The stream will automatically refresh its content the next time - it's accessed (via iteration, as_table(), etc.). - - This is more efficient than immediate refresh when you know the - data will be accessed later. - """ - ... - - -@runtime_checkable -class Kernel(ContentIdentifiable, Labelable, Protocol): - """ - The fundamental unit of computation in Orcapod. - - Kernels are the building blocks of computational graphs, transforming - zero, one, or more input streams into a single output stream. They - encapsulate computation logic while providing consistent interfaces - for validation, type checking, and execution. - - Key design principles: - - Immutable: Kernels don't change after creation - - Deterministic: Same inputs always produce same outputs - - Composable: Kernels can be chained and combined - - Trackable: All invocations are recorded for lineage - - Type-safe: Strong typing and validation throughout - - Execution modes: - - __call__(): Full-featured execution with tracking, returns LiveStream - - forward(): Pure computation without side effects, returns Stream - - The distinction between these modes enables both production use (with - full tracking) and testing/debugging (without side effects). - """ - - @property - def kernel_id(self) -> tuple[str, ...]: - """ - Return a unique identifier for this Pod. - - The pod_id is used for caching and tracking purposes. It should - uniquely identify the Pod's computational logic, parameters, and - any relevant metadata that affects its behavior. 
- - Returns: - tuple[str, ...]: Unique identifier for this Pod - """ - ... - - @property - def data_context_key(self) -> str: - """ - Return the context key for this kernel's data processing. - - The context key is used to interpret how data columns should be - processed and converted. It provides semantic meaning to the data - being processed by this kernel. - - Returns: - str: Context key for this kernel's data processing - """ - ... - - @property - def last_modified(self) -> datetime | None: - """ - When the kernel was last modified. For most kernels, this is the timestamp - of the kernel creation. - """ - ... - - def __call__( - self, *streams: Stream, label: str | None = None, **kwargs - ) -> LiveStream: - """ - Main interface for kernel invocation with full tracking and guarantees. - - This is the primary way to invoke kernels in production. It provides - a complete execution pipeline: - 1. Validates input streams against kernel requirements - 2. Registers the invocation with the computational graph - 3. Calls forward() to perform the actual computation - 4. Ensures the result is a LiveStream that stays current - - The returned LiveStream automatically stays up-to-date with its - upstream dependencies, making it suitable for real-time processing - and reactive applications. - - Args: - *streams: Input streams to process (can be empty for source kernels) - label: Optional label for this invocation (overrides kernel.label) - **kwargs: Additional arguments for kernel configuration - - Returns: - LiveStream: Live stream that stays up-to-date with upstreams - - Raises: - ValidationError: If input streams are invalid for this kernel - TypeMismatchError: If stream types are incompatible - ValueError: If required arguments are missing - """ - ... - - def forward(self, *streams: Stream) -> Stream: - """ - Perform the actual computation without side effects. - - This method contains the core computation logic and should be - overridden by subclasses. It performs pure computation without: - - Registering with the computational graph - - Performing validation (caller's responsibility) - - Guaranteeing result type (may return static or live streams) - - The returned stream must be accurate at the time of invocation but - need not stay up-to-date with upstream changes. This makes forward() - suitable for: - - Testing and debugging - - Batch processing where currency isn't required - - Internal implementation details - - Args: - *streams: Input streams to process - - Returns: - Stream: Result of the computation (may be static or live) - """ - ... - - def output_types( - self, *streams: Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: - """ - Determine output types without triggering computation. - - This method performs type inference based on input stream types, - enabling efficient type checking and stream property queries. - It should be fast and not trigger any expensive computation. - - Used for: - - Pre-execution type validation - - Query planning and optimization - - Schema inference in complex pipelines - - IDE support and developer tooling - - Args: - *streams: Input streams to analyze - - Returns: - tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output - - Raises: - ValidationError: If input types are incompatible - TypeError: If stream types cannot be processed - """ - ... - - def validate_inputs(self, *streams: Stream) -> None: - """ - Validate input streams, raising exceptions if incompatible. 
- - This method is called automatically by __call__ before computation - to provide fail-fast behavior. It should check: - - Number of input streams - - Stream types and schemas - - Any kernel-specific requirements - - Business logic constraints - - The goal is to catch errors early, before expensive computation - begins, and provide clear error messages for debugging. - - Args: - *streams: Input streams to validate - - Raises: - ValidationError: If streams are invalid for this kernel - TypeError: If stream types are incompatible - ValueError: If stream content violates business rules - """ - ... - - def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: - """ - Generate a unique identity structure for this kernel and/or kernel invocation. - When invoked without streams, it should return a structure - that uniquely identifies the kernel itself (e.g., class name, parameters). - When invoked with streams, it should include the identity of the streams - to distinguish different invocations of the same kernel. - - This structure is used for: - - Caching and memoization - - Debugging and error reporting - - Tracking kernel invocations in computational graphs - - Args: - streams: Optional input streams for this invocation. If None, identity_structure is - based solely on the kernel. If streams are provided, they are included in the identity - to differentiate between different invocations of the same kernel. - - Returns: - Any: Unique identity structure (e.g., tuple of class name and stream identities) - """ - ... - - -@runtime_checkable -class Pod(Kernel, Protocol): - """ - Specialized kernel for packet-level processing with advanced caching. - - Pods represent a different computational model from regular kernels: - - Process data one packet at a time (enabling fine-grained parallelism) - - Support just-in-time evaluation (computation deferred until needed) - - Provide stricter type contracts (clear input/output schemas) - - Enable advanced caching strategies (packet-level caching) - - The Pod abstraction is ideal for: - - Expensive computations that benefit from caching - - Operations that can be parallelized at the packet level - - Transformations with strict type contracts - - Processing that needs to be deferred until access time - - Functions that operate on individual data items - - Pods use a different execution model where computation is deferred - until results are actually needed, enabling efficient resource usage - and fine-grained caching. - """ - - @property - def version(self) -> str: ... - - def get_record_id(self, packet: Packet, execution_engine_hash: str) -> str: ... - - @property - def tiered_pod_id(self) -> dict[str, str]: - """ - Return a dictionary representation of the tiered pod's unique identifier. - The key is supposed to be ordered from least to most specific, allowing - for hierarchical identification of the pod. - - This is primarily used for tiered memoization/caching strategies. - - Returns: - dict[str, str]: Dictionary representation of the pod's ID - """ - ... - - def input_packet_types(self) -> TypeSpec: - """ - TypeSpec for input packets that this Pod can process. - - Defines the exact schema that input packets must conform to. - Pods are typically much stricter about input types than regular - kernels, requiring precise type matching for their packet-level - processing functions. 
- - This specification is used for: - - Runtime type validation - - Compile-time type checking - - Schema inference and documentation - - Input validation and error reporting - - Returns: - TypeSpec: Dictionary mapping field names to required packet types - """ - ... - - def output_packet_types(self) -> TypeSpec: - """ - TypeSpec for output packets that this Pod produces. - - Defines the schema of packets that will be produced by this Pod. - This is typically determined by the Pod's computational function - and is used for: - - Type checking downstream kernels - - Schema inference in complex pipelines - - Query planning and optimization - - Documentation and developer tooling - - Returns: - TypeSpec: Dictionary mapping field names to output packet types - """ - ... - - async def async_call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - ) -> tuple[Tag, Packet | None]: ... - - def call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - ) -> tuple[Tag, Packet | None]: - """ - Process a single packet with its associated tag. - - This is the core method that defines the Pod's computational behavior. - It processes one (tag, packet) pair at a time, enabling: - - Fine-grained caching at the packet level - - Parallelization opportunities - - Just-in-time evaluation - - Filtering operations (by returning None) - - The method signature supports: - - Tag transformation (modify metadata) - - Packet transformation (modify content) - - Filtering (return None to exclude packet) - - Pass-through (return inputs unchanged) - - Args: - tag: Metadata associated with the packet - packet: The data payload to process - - Returns: - tuple[Tag, Packet | None]: - - Tag: Output tag (may be modified from input) - - Packet: Processed packet, or None to filter it out - - Raises: - TypeError: If packet doesn't match input_packet_types - ValueError: If packet data is invalid for processing - """ - ... - - -@runtime_checkable -class CachedPod(Pod, Protocol): - async def async_call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - skip_cache_lookup: bool = False, - skip_cache_insert: bool = False, - ) -> tuple[Tag, Packet | None]: ... - - def call( - self, - tag: Tag, - packet: Packet, - record_id: str | None = None, - execution_engine: ExecutionEngine | None = None, - skip_cache_lookup: bool = False, - skip_cache_insert: bool = False, - ) -> tuple[Tag, Packet | None]: - """ - Process a single packet with its associated tag. - - This is the core method that defines the Pod's computational behavior. - It processes one (tag, packet) pair at a time, enabling: - - Fine-grained caching at the packet level - - Parallelization opportunities - - Just-in-time evaluation - - Filtering operations (by returning None) - - The method signature supports: - - Tag transformation (modify metadata) - - Packet transformation (modify content) - - Filtering (return None to exclude packet) - - Pass-through (return inputs unchanged) - - Args: - tag: Metadata associated with the packet - packet: The data payload to process - - Returns: - tuple[Tag, Packet | None]: - - Tag: Output tag (may be modified from input) - - Packet: Processed packet, or None to filter it out - - Raises: - TypeError: If packet doesn't match input_packet_types - ValueError: If packet data is invalid for processing - """ - ... 
- - def get_all_records( - self, include_system_columns: bool = False - ) -> "pa.Table | None": - """ - Retrieve all records processed by this Pod. - - This method returns a table containing all packets processed by the Pod, - including metadata and system columns if requested. It is useful for: - - Debugging and analysis - - Auditing and data lineage tracking - - Performance monitoring - - Args: - include_system_columns: Whether to include system columns in the output - - Returns: - pa.Table | None: A table containing all processed records, or None if no records are available - """ - ... - - -@runtime_checkable -class Source(Kernel, Stream, Protocol): - """ - Entry point for data into the computational graph. - - Sources are special objects that serve dual roles: - - As Kernels: Can be invoked to produce streams - - As Streams: Directly provide data without upstream dependencies - - Sources represent the roots of computational graphs and typically - interface with external data sources. They bridge the gap between - the outside world and the Orcapod computational model. - - Common source types: - - File readers (CSV, JSON, Parquet, etc.) - - Database connections and queries - - API endpoints and web services - - Generated data sources (synthetic data) - - Manual data input and user interfaces - - Message queues and event streams - - Sources have unique properties: - - No upstream dependencies (upstreams is empty) - - Can be both invoked and iterated - - Serve as the starting point for data lineage - - May have their own refresh/update mechanisms - """ - - @property - def tag_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. - """ - ... - - @property - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - ... - - def get_all_records( - self, include_system_columns: bool = False - ) -> "pa.Table | None": - """ - Retrieve all records from the source. - - Args: - include_system_columns: Whether to include system columns in the output - - Returns: - pa.Table | None: A table containing all records, or None if no records are available - """ - ... - - def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": ... - - def as_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": ... - - def as_polars_df(self, sort_by_tags: bool = False) -> "pl.DataFrame | None": ... - - def as_pandas_df(self, sort_by_tags: bool = False) -> "pd.DataFrame | None": ... - - -@runtime_checkable -class Tracker(Protocol): - """ - Records kernel invocations and stream creation for computational graph tracking. - - Trackers are responsible for maintaining the computational graph by recording - relationships between kernels, streams, and invocations. They enable: - - Lineage tracking and data provenance - - Caching and memoization strategies - - Debugging and error analysis - - Performance monitoring and optimization - - Reproducibility and auditing - - Multiple trackers can be active simultaneously, each serving different - purposes (e.g., one for caching, another for debugging, another for - monitoring). This allows for flexible and composable tracking strategies. - - Trackers can be selectively activated/deactivated to control overhead - and focus on specific aspects of the computational graph. 
- """ - - def set_active(self, active: bool = True) -> None: - """ - Set the active state of the tracker. - - When active, the tracker will record all kernel invocations and - stream creations. When inactive, no recording occurs, reducing - overhead for performance-critical sections. - - Args: - active: True to activate recording, False to deactivate - """ - ... - - def is_active(self) -> bool: - """ - Check if the tracker is currently recording invocations. - - Returns: - bool: True if tracker is active and recording, False otherwise - """ - ... - - def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None - ) -> None: - """ - Record a kernel invocation in the computational graph. - - This method is called whenever a kernel is invoked. The tracker - should record: - - The kernel and its properties - - The input streams that were used as input - - Timing and performance information - - Any relevant metadata - - Args: - kernel: The kernel that was invoked - upstreams: The input streams used for this invocation - """ - ... - - def record_source_invocation( - self, source: Source, label: str | None = None - ) -> None: - """ - Record a source invocation in the computational graph. - - This method is called whenever a source is invoked. The tracker - should record: - - The source and its properties - - Timing and performance information - - Any relevant metadata - - Args: - source: The source that was invoked - """ - ... - - def record_pod_invocation( - self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None - ) -> None: - """ - Record a pod invocation in the computational graph. - - This method is called whenever a pod is invoked. The tracker - should record: - - The pod and its properties - - The upstream streams that were used as input - - Timing and performance information - - Any relevant metadata - - Args: - pod: The pod that was invoked - upstreams: The input streams used for this invocation - """ - ... - - -@runtime_checkable -class TrackerManager(Protocol): - """ - Manages multiple trackers and coordinates their activity. - - The TrackerManager provides a centralized way to: - - Register and manage multiple trackers - - Coordinate recording across all active trackers - - Provide a single interface for graph recording - - Enable dynamic tracker registration/deregistration - - This design allows for: - - Multiple concurrent tracking strategies - - Pluggable tracking implementations - - Easy testing and debugging (mock trackers) - - Performance optimization (selective tracking) - """ - - def get_active_trackers(self) -> list[Tracker]: - """ - Get all currently active trackers. - - Returns only trackers that are both registered and active, - providing the list of trackers that will receive recording events. - - Returns: - list[Tracker]: List of trackers that are currently recording - """ - ... - - def register_tracker(self, tracker: Tracker) -> None: - """ - Register a new tracker in the system. - - The tracker will be included in future recording operations - if it is active. Registration is separate from activation - to allow for dynamic control of tracking overhead. - - Args: - tracker: The tracker to register - """ - ... - - def deregister_tracker(self, tracker: Tracker) -> None: - """ - Remove a tracker from the system. - - The tracker will no longer receive recording notifications - even if it is still active. 
This is useful for: - - Cleaning up temporary trackers - - Removing failed or problematic trackers - - Dynamic tracker management - - Args: - tracker: The tracker to remove - """ - ... - - def record_kernel_invocation( - self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None - ) -> None: - """ - Record a stream in all active trackers. - - This method broadcasts the stream recording to all currently - active and registered trackers. It provides a single point - of entry for recording events, simplifying kernel implementations. - - Args: - stream: The stream to record in all active trackers - """ - ... - - def record_source_invocation( - self, source: Source, label: str | None = None - ) -> None: - """ - Record a source invocation in the computational graph. - - This method is called whenever a source is invoked. The tracker - should record: - - The source and its properties - - Timing and performance information - - Any relevant metadata - - Args: - source: The source that was invoked - """ - ... - - def record_pod_invocation( - self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None - ) -> None: - """ - Record a stream in all active trackers. - - This method broadcasts the stream recording to all currently` - active and registered trackers. It provides a single point - of entry for recording events, simplifying kernel implementations. - - Args: - stream: The stream to record in all active trackers - """ - ... - - def no_tracking(self) -> ContextManager[None]: ... diff --git a/src/orcapod/protocols/data_protocols/__init__.py b/src/orcapod/protocols/data_protocols/__init__.py new file mode 100644 index 0000000..f9c711d --- /dev/null +++ b/src/orcapod/protocols/data_protocols/__init__.py @@ -0,0 +1,24 @@ +from .base import ExecutionEngine, PodFunction +from .datagrams import Datagram, Tag, Packet +from .streams import Stream, LiveStream +from .kernel import Kernel +from .pods import Pod, CachedPod +from .source import Source +from .trackers import Tracker, TrackerManager + + +__all__ = [ + "ExecutionEngine", + "PodFunction", + "Datagram", + "Tag", + "Packet", + "Stream", + "LiveStream", + "Kernel", + "Pod", + "CachedPod", + "Source", + "Tracker", + "TrackerManager", +] diff --git a/src/orcapod/protocols/data_protocols/base.py b/src/orcapod/protocols/data_protocols/base.py new file mode 100644 index 0000000..080e2f3 --- /dev/null +++ b/src/orcapod/protocols/data_protocols/base.py @@ -0,0 +1,97 @@ +from collections.abc import Callable +from typing import Any, Protocol, runtime_checkable +from orcapod.types import DataValue + + +@runtime_checkable +class ExecutionEngine(Protocol): + @property + def name(self) -> str: ... + + def submit_sync(self, function: Callable, *args, **kwargs) -> Any: + """ + Run the given function with the provided arguments. + This method should be implemented by the execution engine. + """ + ... + + async def submit_async(self, function: Callable, *args, **kwargs) -> Any: + """ + Asynchronously run the given function with the provided arguments. + This method should be implemented by the execution engine. + """ + ... + + # TODO: consider adding batch submission + + +@runtime_checkable +class PodFunction(Protocol): + """ + A function suitable for use in a FunctionPod. + + PodFunctions define the computational logic that operates on individual + packets within a Pod. They represent pure functions that transform + data values without side effects. 
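For concreteness, here is a minimal sketch of a callable satisfying the PodFunction protocol described above; the function name and its field are invented for illustration and are not part of orcapod.

from orcapod.types import DataValue  # same import this module uses


def word_count(text: DataValue = None) -> DataValue | None:
    """Toy PodFunction: count whitespace-separated tokens, or filter empty packets."""
    if text is None:
        return None  # returning None filters the packet out of the output
    return len(str(text).split())


# Note: because PodFunction is @runtime_checkable and only declares __call__,
# isinstance(word_count, PodFunction) succeeds for any callable; it is a
# structural check, not a guarantee about argument names or semantics.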
+ + These functions are designed to be: + - Stateless: No dependency on external state + - Deterministic: Same inputs always produce same outputs + - Serializable: Can be cached and distributed + - Type-safe: Clear input/output contracts + + PodFunctions accept named arguments corresponding to packet fields + and return transformed data values. + """ + + def __call__(self, **kwargs: DataValue) -> None | DataValue: + """ + Execute the pod function with the given arguments. + + The function receives packet data as named arguments and returns + either transformed data or None (for filtering operations). + + Args: + **kwargs: Named arguments mapping packet fields to data values + + Returns: + None: Filter out this packet (don't include in output) + DataValue: Single transformed value + + Raises: + TypeError: If required arguments are missing + ValueError: If argument values are invalid + """ + ... + + +@runtime_checkable +class Labelable(Protocol): + """ + Protocol for objects that can have a human-readable label. + + Labels provide meaningful names for objects in the computational graph, + making debugging, visualization, and monitoring much easier. They serve + as human-friendly identifiers that complement the technical identifiers + used internally. + + Labels are optional but highly recommended for: + - Debugging complex computational graphs + - Visualization and monitoring tools + - Error messages and logging + - User interfaces and dashboards + """ + + @property + def label(self) -> str | None: + """ + Return the human-readable label for this object. + + Labels should be descriptive and help users understand the purpose + or role of the object in the computational graph. + + Returns: + str: Human-readable label for this object + None: No label is set (will use default naming) + """ + ... diff --git a/src/orcapod/protocols/data_protocols/datagrams.py b/src/orcapod/protocols/data_protocols/datagrams.py new file mode 100644 index 0000000..50ded9e --- /dev/null +++ b/src/orcapod/protocols/data_protocols/datagrams.py @@ -0,0 +1,1105 @@ +from collections.abc import Collection, Iterator, Mapping +from typing import Any, Protocol, Self, TYPE_CHECKING, runtime_checkable +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.types import DataValue, TypeSpec + + +if TYPE_CHECKING: + import pyarrow as pa + + +@runtime_checkable +class Datagram(ContentIdentifiable, Protocol): + """ + Protocol for immutable datagram containers in Orcapod. + + Datagrams are the fundamental units of data that flow through the system. + They provide a unified interface for data access, conversion, and manipulation, + ensuring consistent behavior across different storage backends (dict, Arrow table, etc.). + + Each datagram contains: + - **Data columns**: The primary business data (user_id, name, etc.) + - **Meta columns**: Internal system metadata with {orcapod.META_PREFIX} (typically '__') prefixes (e.g. __processed_at, etc.) + - **Context column**: Data context information ({orcapod.CONTEXT_KEY}) + + Derivative of datagram (such as Packet or Tag) will also include some specific columns pertinent to the function of the specialized datagram: + - **Source info columns**: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) used in Packet + - **System tags**: Internal tags for system use, typically prefixed with {orcapod.SYSTEM_TAG_PREFIX} ('_system_') (_system_created_at, etc.) 
used in Tag + + All operations are by design immutable - methods return new datagram instances rather than + modifying existing ones. + + Example: + >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"}) + >>> updated = datagram.update(name="Alice Smith") + >>> filtered = datagram.select("user_id", "name") + >>> table = datagram.as_table() + """ + + # 1. Core Properties (Identity & Structure) + @property + def data_context_key(self) -> str: + """ + Return the data context key for this datagram. + + This key identifies a collection of system components that collectively controls + how information is serialized, hashed and represented, including the semantic type registry, + arrow data hasher, and other contextual information. Same piece of information (that is two datagrams + with an identical *logical* content) may bear distinct internal representation if they are + represented under two distinct data context, as signified by distinct data context keys. + + Returns: + str: Context key for proper datagram interpretation + """ + ... + + @property + def meta_columns(self) -> tuple[str, ...]: + """Return tuple of meta column names (with {orcapod.META_PREFIX} ('__') prefix).""" + ... + + # 2. Dict-like Interface (Data Access) + def __getitem__(self, key: str) -> DataValue: + """ + Get data column value by key. + + Provides dict-like access to data columns only. Meta columns + are not accessible through this method (use `get_meta_value()` instead). + + Args: + key: Data column name. + + Returns: + The value stored in the specified data column. + + Raises: + KeyError: If the column doesn't exist in data columns. + + Example: + >>> datagram["user_id"] + 123 + >>> datagram["name"] + 'Alice' + """ + ... + + def __contains__(self, key: str) -> bool: + """ + Check if data column exists. + + Args: + key: Column name to check. + + Returns: + True if column exists in data columns, False otherwise. + + Example: + >>> "user_id" in datagram + True + >>> "nonexistent" in datagram + False + """ + ... + + def __iter__(self) -> Iterator[str]: + """ + Iterate over data column names. + + Provides for-loop support over column names, enabling natural iteration + patterns without requiring conversion to dict. + + Yields: + Data column names in no particular order. + + Example: + >>> for column in datagram: + ... value = datagram[column] + ... print(f"{column}: {value}") + """ + ... + + def get(self, key: str, default: DataValue = None) -> DataValue: + """ + Get data column value with default fallback. + + Args: + key: Data column name. + default: Value to return if column doesn't exist. + + Returns: + Column value if exists, otherwise the default value. + + Example: + >>> datagram.get("user_id") + 123 + >>> datagram.get("missing", "default") + 'default' + """ + ... + + # 3. Structural Information + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> tuple[str, ...]: + """ + Return tuple of column names. + + Provides access to column names with filtering options for different + column types. Default returns only data column names. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. 
+ + Returns: + Tuple of column names based on inclusion criteria. + + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') + """ + ... + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> TypeSpec: + """ + Return type specification mapping field names to Python types. + + The TypeSpec enables type checking and validation throughout the system. + + Args: + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. + + Returns: + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } + """ + ... + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> "pa.Schema": + """ + Return PyArrow schema representation. + + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. + + Args: + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + + Returns: + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] + """ + ... + + # 4. Format Conversions (Export) + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, DataValue]: + """ + Convert datagram to dictionary format. + + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context key. + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + + + Returns: + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) + """ + ... + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> "pa.Table": + """ + Convert datagram to PyArrow Table format. 
+ + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + + Returns: + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" + """ + ... + + def as_arrow_compatible_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + ) -> dict[str, Any]: + """ + Return dictionary with values optimized for Arrow table conversion. + + This method returns a dictionary where values are in a form that can be + efficiently converted to Arrow format using pa.Table.from_pylist(). + + The key insight is that this avoids the expensive as_table() → concat pattern + by providing values that are "Arrow-ready" while remaining in dict format + for efficient batching. + + Implementation note: This may involve format conversions (e.g., Path objects + to strings, datetime objects to ISO strings, etc.) to ensure compatibility + with Arrow's expected input formats. + + Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied + with arrow_schema(...) with the same argument options to ensure that the schema matches the table. + + Args: + include_all_info: Include all available information + include_meta_columns: Controls meta column inclusion + include_context: Whether to include context key + + Returns: + Dictionary with values optimized for Arrow conversion + + Example: + # Efficient batch conversion pattern + arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams] + schema = datagrams[0].arrow_schema() + table = pa.Table.from_pylist(arrow_dicts, schema=schema) + """ + ... + + # 5. Meta Column Operations + def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: + """ + Get meta column value with optional default. + + Meta columns store operational metadata and use {orcapod.META_PREFIX} ('__') prefixes. + This method handles both prefixed and unprefixed key formats. + + Args: + key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). + default: Value to return if meta column doesn't exist. + + Returns: + Meta column value if exists, otherwise the default value. + + Example: + >>> datagram.get_meta_value("pipeline_version") # Auto-prefixed + 'v2.1.0' + >>> datagram.get_meta_value("__pipeline_version") # Already prefixed + 'v2.1.0' + >>> datagram.get_meta_value("missing", "default") + 'default' + """ + ... + + def with_meta_columns(self, **updates: DataValue) -> Self: + """ + Create new datagram with updated meta columns. + + Adds or updates operational metadata while preserving all data columns. + Keys are automatically prefixed with {orcapod.META_PREFIX} ('__') if needed. + + Args: + **updates: Meta column updates as keyword arguments. 
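As a usage note, the batching pattern documented for as_arrow_compatible_dict() above can be wrapped in a small helper. This is an illustrative sketch (the helper name is not part of orcapod); it assumes pyarrow is available and that all datagrams in the batch share one schema and data context.

from collections.abc import Sequence

import pyarrow as pa


def datagrams_to_table(datagrams: Sequence["Datagram"]) -> "pa.Table | None":
    """Convert a homogeneous batch of datagrams into a single Arrow table."""
    if not datagrams:
        return None
    # Arrow-ready dicts avoid the costlier per-datagram as_table() + concat pattern.
    rows = [d.as_arrow_compatible_dict() for d in datagrams]
    # The schema must be requested with the same inclusion options as the rows.
    schema = datagrams[0].arrow_schema()
    return pa.Table.from_pylist(rows, schema=schema)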
+ + Returns: + New datagram instance with updated meta columns. + + Example: + >>> tracked = datagram.with_meta_columns( + ... processed_by="pipeline_v2", + ... timestamp="2024-01-15T10:30:00Z" + ... ) + """ + ... + + def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: + """ + Create new datagram with specified meta columns removed. + + Args: + *keys: Meta column keys to remove (prefixes optional). + ignore_missing: If True, ignore missing columns without raising an error. + + + Returns: + New datagram instance without specified meta columns. + + Raises: + KeryError: If any specified meta column to drop doesn't exist and ignore_missing=False. + + Example: + >>> cleaned = datagram.drop_meta_columns("old_source", "temp_debug") + """ + ... + + # 6. Data Column Operations + def select(self, *column_names: str) -> Self: + """ + Create new datagram with only specified data columns. + + Args: + *column_names: Data column names to keep. + + + Returns: + New datagram instance with only specified data columns. All other columns including + meta columns and context are preserved. + + Raises: + KeyError: If any specified column doesn't exist. + + Example: + >>> subset = datagram.select("user_id", "name", "email") + """ + ... + + def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: + """ + Create new datagram with specified data columns removed. Note that this does not + remove meta columns or context column. Refer to `drop_meta_columns()` for dropping + specific meta columns. Context key column can never be dropped but a modified copy + can be created with a different context key using `with_data_context()`. + + Args: + *column_names: Data column names to remove. + ignore_missing: If True, ignore missing columns without raising an error. + + Returns: + New datagram instance without specified data columns. + + Raises: + KeryError: If any specified column to drop doesn't exist and ignore_missing=False. + + Example: + >>> filtered = datagram.drop("temp_field", "debug_info") + """ + ... + + def rename( + self, + column_mapping: Mapping[str, str], + ) -> Self: + """ + Create new datagram with data columns renamed. + + Args: + column_mapping: Mapping from old names to new names. + + Returns: + New datagram instance with renamed data columns. + + Example: + >>> renamed = datagram.rename( + ... {"old_id": "user_id", "old_name": "full_name"}, + ... column_types={"user_id": int} + ... ) + """ + ... + + def update(self, **updates: DataValue) -> Self: + """ + Create new datagram with existing column values updated. + + Updates values in existing data columns. Will error if any specified + column doesn't exist - use with_columns() to add new columns. + + Args: + **updates: Column names and their new values. + + Returns: + New datagram instance with updated values. + + Raises: + KeyError: If any specified column doesn't exist. + + Example: + >>> updated = datagram.update( + ... file_path="/new/absolute/path.txt", + ... status="processed" + ... ) + """ + ... + + def with_columns( + self, + column_types: Mapping[str, type] | None = None, + **updates: DataValue, + ) -> Self: + """ + Create new datagram with additional data columns. + + Adds new data columns to the datagram. Will error if any specified + column already exists - use update() to modify existing columns. + + Args: + column_types: Optional type specifications for new columns. If not provided, the column type is + inferred from the provided values. If value is None, the column type defaults to `str`. 
+ **kwargs: New columns as keyword arguments. + + Returns: + New datagram instance with additional data columns. + + Raises: + ValueError: If any specified column already exists. + + Example: + >>> expanded = datagram.with_columns( + ... status="active", + ... score=95.5, + ... column_types={"score": float} + ... ) + """ + ... + + # 7. Context Operations + def with_context_key(self, new_context_key: str) -> Self: + """ + Create new datagram with different context key. + + Changes the semantic interpretation context while preserving all data. + The context key affects how columns are processed and converted. + + Args: + new_context_key: New context key string. + + Returns: + New datagram instance with updated context key. + + Note: + How the context is interpreted depends on the datagram implementation. + Semantic processing may be rebuilt for the new context. + + Example: + >>> financial_datagram = datagram.with_context_key("financial_v1") + """ + ... + + # 8. Utility Operations + def copy(self) -> Self: + """ + Create a shallow copy of the datagram. + + Returns a new datagram instance with the same data and cached values. + This is more efficient than reconstructing from scratch when you need + an identical datagram instance. + + Returns: + New datagram instance with copied data and caches. + + Example: + >>> copied = datagram.copy() + >>> copied is datagram # False - different instance + False + """ + ... + + # 9. String Representations + def __str__(self) -> str: + """ + Return user-friendly string representation. + + Shows the datagram as a simple dictionary for user-facing output, + messages, and logging. Only includes data columns for clean output. + + Returns: + Dictionary-style string representation of data columns only. + """ + ... + + def __repr__(self) -> str: + """ + Return detailed string representation for debugging. + + Shows the datagram type and comprehensive information for debugging. + + Returns: + Detailed representation with type and metadata information. + """ + ... + + +@runtime_checkable +class Tag(Datagram, Protocol): + """ + Metadata associated with each data item in a stream. + + Tags carry contextual information about data packets as they flow through + the computational graph. They are immutable and provide metadata that + helps with: + - Data lineage tracking + - Grouping and aggregation operations + - Temporal information (timestamps) + - Source identification + - Processing context + + Common examples include: + - Timestamps indicating when data was created/processed + - Source identifiers showing data origin + - Processing metadata like batch IDs or session information + - Grouping keys for aggregation operations + - Quality indicators or confidence scores + """ + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> tuple[str, ...]: + """ + Return tuple of column names. + + Provides access to column names with filtering options for different + column types. Default returns only data column names. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
+ include_context: Whether to include context column. + include_source: Whether to include source info fields. + + + Returns: + Tuple of column names based on inclusion criteria. + + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') + """ + ... + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> TypeSpec: + """ + Return type specification mapping field names to Python types. + + The TypeSpec enables type checking and validation throughout the system. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. + include_source: Whether to include source info fields. + + Returns: + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } + """ + ... + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> "pa.Schema": + """ + Return PyArrow schema representation. + + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + include_source: Whether to include source info fields. + + + Returns: + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] + """ + ... + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> dict[str, DataValue]: + """ + Convert datagram to dictionary format. + + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
+ include_context: Whether to include the context key. + include_source: Whether to include source info fields. + + + Returns: + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) + """ + ... + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_system_tags: bool = False, + ) -> "pa.Table": + """ + Convert datagram to PyArrow Table format. + + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_source: Whether to include source info columns in the schema. + + Returns: + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" + """ + ... + + # TODO: add this back + # def as_arrow_compatible_dict( + # self, + # include_all_info: bool = False, + # include_meta_columns: bool | Collection[str] = False, + # include_context: bool = False, + # include_source: bool = False, + # ) -> dict[str, Any]: + # """Extended version with source info support.""" + # ... + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_system_tags: bool = False, + ) -> Datagram: + """ + Convert the packet to a Datagram. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + + Returns: + Datagram: Datagram representation of packet data + """ + ... + + def system_tags(self) -> dict[str, DataValue]: + """ + Return metadata about the packet's source/origin. + + Provides debugging and lineage information about where the packet + originated. May include information like: + - File paths for file-based sources + - Database connection strings + - API endpoints + - Processing pipeline information + + Returns: + dict[str, str | None]: Source information for each data column as key-value pairs. + """ + ... + + +@runtime_checkable +class Packet(Datagram, Protocol): + """ + The actual data payload in a stream. + + Packets represent the core data being processed through the computational + graph. Unlike Tags (which are metadata), Packets contain the actual + information that computations operate on. 
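To make the Tag/Packet split concrete, a small debugging helper along these lines could summarize one stream element. The helper and its output layout are illustrative only, but every method it calls is declared by the protocols above, and the import path is the new package introduced in this patch.

from orcapod.protocols.data_protocols import Packet, Tag


def describe_item(tag: Tag, packet: Packet) -> dict[str, object]:
    """Summarize one (tag, packet) stream element for logging or debugging."""
    return {
        "tag": tag.as_dict(),                 # context: grouping keys and metadata
        "system_tags": tag.system_tags(),     # internal '_system_'-prefixed tags
        "data": packet.as_dict(),             # content: the actual payload columns
        "source_info": packet.source_info(),  # per-column provenance
    }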
+ + Packets extend Datagram with additional capabilities for: + - Source tracking and lineage + - Content-based hashing for caching + - Metadata inclusion for debugging + + The distinction between Tag and Packet is crucial for understanding + data flow: Tags provide context, Packets provide content. + """ + + def keys( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> tuple[str, ...]: + """ + Return tuple of column names. + + Provides access to column names with filtering options for different + column types. Default returns only data column names. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Return only data column names (default) + - True: Include all meta column names + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context column. + include_source: Whether to include source info fields. + + + Returns: + Tuple of column names based on inclusion criteria. + + Example: + >>> datagram.keys() # Data columns only + ('user_id', 'name', 'email') + >>> datagram.keys(include_meta_columns=True) + ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_meta_columns=["pipeline"]) + ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') + >>> datagram.keys(include_context=True) + ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') + """ + ... + + def types( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> TypeSpec: + """ + Return type specification mapping field names to Python types. + + The TypeSpec enables type checking and validation throughout the system. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column type inclusion. + - False: Exclude meta column types (default) + - True: Include all meta column types + - Collection[str]: Include meta column types matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include context type. + include_source: Whether to include source info fields. + + Returns: + TypeSpec mapping field names to their Python types. + + Example: + >>> datagram.types() + {'user_id': , 'name': } + """ + ... + + def arrow_schema( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> "pa.Schema": + """ + Return PyArrow schema representation. + + The schema provides structured field and type information for efficient + serialization and deserialization with PyArrow. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column schema inclusion. + - False: Exclude meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
+ include_context: Whether to include context column. + include_source: Whether to include source info fields. + + + Returns: + PyArrow Schema describing the datagram structure. + + Example: + >>> schema = datagram.arrow_schema() + >>> schema.names + ['user_id', 'name'] + """ + ... + + def as_dict( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> dict[str, DataValue]: + """ + Convert datagram to dictionary format. + + Provides a simple key-value representation useful for debugging, + serialization, and interop with dict-based APIs. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context key. + include_source: Whether to include source info fields. + + + Returns: + Dictionary with requested columns as key-value pairs. + + Example: + >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} + >>> full_data = datagram.as_dict( + ... include_meta_columns=True, + ... include_context=True + ... ) + """ + ... + + def as_table( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_context: bool = False, + include_source: bool = False, + ) -> "pa.Table": + """ + Convert datagram to PyArrow Table format. + + Provides a standardized columnar representation suitable for analysis, + processing, and interoperability with Arrow-based tools. + + Args: + include_all_info: If True, include all available information. This option supersedes all other inclusion options. + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + include_context: Whether to include the context column. + include_source: Whether to include source info columns in the schema. + + Returns: + PyArrow Table with requested columns. + + Example: + >>> table = datagram.as_table() # Data columns only + >>> full_table = datagram.as_table( + ... include_meta_columns=True, + ... include_context=True + ... ) + >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" + """ + ... + + # TODO: add this back + # def as_arrow_compatible_dict( + # self, + # include_all_info: bool = False, + # include_meta_columns: bool | Collection[str] = False, + # include_context: bool = False, + # include_source: bool = False, + # ) -> dict[str, Any]: + # """Extended version with source info support.""" + # ... + + def as_datagram( + self, + include_all_info: bool = False, + include_meta_columns: bool | Collection[str] = False, + include_source: bool = False, + ) -> Datagram: + """ + Convert the packet to a Datagram. + + Args: + include_meta_columns: Controls meta column inclusion. + - False: Exclude all meta columns (default) + - True: Include all meta columns + - Collection[str]: Include meta columns matching these prefixes. If absent, + {orcapod.META_PREFIX} ('__') prefix is prepended to each key. 
+ + Returns: + Datagram: Datagram representation of packet data + """ + ... + + def source_info(self) -> dict[str, str | None]: + """ + Return metadata about the packet's source/origin. + + Provides debugging and lineage information about where the packet + originated. May include information like: + - File paths for file-based sources + - Database connection strings + - API endpoints + - Processing pipeline information + + Returns: + dict[str, str | None]: Source information for each data column as key-value pairs. + """ + ... + + def with_source_info( + self, + **source_info: str | None, + ) -> Self: + """ + Create new packet with updated source information. + + Adds or updates source metadata for the packet. This is useful for + tracking data provenance and lineage through the computational graph. + + Args: + **source_info: Source metadata as keyword arguments. + + Returns: + New packet instance with updated source information. + + Example: + >>> updated_packet = packet.with_source_info( + ... file_path="/new/path/to/file.txt", + ... source_id="source_123" + ... ) + """ + ... diff --git a/src/orcapod/protocols/data_protocols/kernel.py b/src/orcapod/protocols/data_protocols/kernel.py new file mode 100644 index 0000000..a9b2a76 --- /dev/null +++ b/src/orcapod/protocols/data_protocols/kernel.py @@ -0,0 +1,201 @@ +from collections.abc import Collection +from datetime import datetime +from typing import Any, Protocol, runtime_checkable +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.types import TypeSpec +from orcapod.protocols.data_protocols.base import Labelable +from orcapod.protocols.data_protocols.streams import Stream, LiveStream + + +@runtime_checkable +class Kernel(ContentIdentifiable, Labelable, Protocol): + """ + The fundamental unit of computation in Orcapod. + + Kernels are the building blocks of computational graphs, transforming + zero, one, or more input streams into a single output stream. They + encapsulate computation logic while providing consistent interfaces + for validation, type checking, and execution. + + Key design principles: + - Immutable: Kernels don't change after creation + - Deterministic: Same inputs always produce same outputs + - Composable: Kernels can be chained and combined + - Trackable: All invocations are recorded for lineage + - Type-safe: Strong typing and validation throughout + + Execution modes: + - __call__(): Full-featured execution with tracking, returns LiveStream + - forward(): Pure computation without side effects, returns Stream + + The distinction between these modes enables both production use (with + full tracking) and testing/debugging (without side effects). + """ + + @property + def kernel_id(self) -> tuple[str, ...]: + """ + Return a unique identifier for this Pod. + + The pod_id is used for caching and tracking purposes. It should + uniquely identify the Pod's computational logic, parameters, and + any relevant metadata that affects its behavior. + + Returns: + tuple[str, ...]: Unique identifier for this Pod + """ + ... + + @property + def data_context_key(self) -> str: + """ + Return the context key for this kernel's data processing. + + The context key is used to interpret how data columns should be + processed and converted. It provides semantic meaning to the data + being processed by this kernel. + + Returns: + str: Context key for this kernel's data processing + """ + ... + + @property + def last_modified(self) -> datetime | None: + """ + When the kernel was last modified. 
For most kernels, this is the timestamp + of the kernel creation. + """ + ... + + def __call__( + self, *streams: Stream, label: str | None = None, **kwargs + ) -> LiveStream: + """ + Main interface for kernel invocation with full tracking and guarantees. + + This is the primary way to invoke kernels in production. It provides + a complete execution pipeline: + 1. Validates input streams against kernel requirements + 2. Registers the invocation with the computational graph + 3. Calls forward() to perform the actual computation + 4. Ensures the result is a LiveStream that stays current + + The returned LiveStream automatically stays up-to-date with its + upstream dependencies, making it suitable for real-time processing + and reactive applications. + + Args: + *streams: Input streams to process (can be empty for source kernels) + label: Optional label for this invocation (overrides kernel.label) + **kwargs: Additional arguments for kernel configuration + + Returns: + LiveStream: Live stream that stays up-to-date with upstreams + + Raises: + ValidationError: If input streams are invalid for this kernel + TypeMismatchError: If stream types are incompatible + ValueError: If required arguments are missing + """ + ... + + def forward(self, *streams: Stream) -> Stream: + """ + Perform the actual computation without side effects. + + This method contains the core computation logic and should be + overridden by subclasses. It performs pure computation without: + - Registering with the computational graph + - Performing validation (caller's responsibility) + - Guaranteeing result type (may return static or live streams) + + The returned stream must be accurate at the time of invocation but + need not stay up-to-date with upstream changes. This makes forward() + suitable for: + - Testing and debugging + - Batch processing where currency isn't required + - Internal implementation details + + Args: + *streams: Input streams to process + + Returns: + Stream: Result of the computation (may be static or live) + """ + ... + + def output_types( + self, *streams: Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: + """ + Determine output types without triggering computation. + + This method performs type inference based on input stream types, + enabling efficient type checking and stream property queries. + It should be fast and not trigger any expensive computation. + + Used for: + - Pre-execution type validation + - Query planning and optimization + - Schema inference in complex pipelines + - IDE support and developer tooling + + Args: + *streams: Input streams to analyze + + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output + + Raises: + ValidationError: If input types are incompatible + TypeError: If stream types cannot be processed + """ + ... + + def validate_inputs(self, *streams: Stream) -> None: + """ + Validate input streams, raising exceptions if incompatible. + + This method is called automatically by __call__ before computation + to provide fail-fast behavior. It should check: + - Number of input streams + - Stream types and schemas + - Any kernel-specific requirements + - Business logic constraints + + The goal is to catch errors early, before expensive computation + begins, and provide clear error messages for debugging. 
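The __call__/forward split documented above can be read as a template. The following toy kernel sketches only that call path: it is not orcapod's base class, it omits tracker registration and LiveStream wrapping, and a real Kernel must also provide kernel_id, output_types, identity_structure, and the other members of this protocol.

class PassthroughKernel:
    """Toy kernel: returns its single input stream unchanged."""

    label = "passthrough"

    def validate_inputs(self, *streams) -> None:
        # Fail fast with a clear message before any computation happens.
        if len(streams) != 1:
            raise ValueError(
                f"PassthroughKernel expects exactly 1 stream, got {len(streams)}"
            )

    def forward(self, *streams):
        # Pure computation: no graph registration, no liveness guarantee.
        return streams[0]

    def __call__(self, *streams, label=None, **kwargs):
        self.validate_inputs(*streams)
        # A real kernel would record the invocation with active trackers here
        # and wrap the result so it stays live with respect to its upstreams.
        return self.forward(*streams)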
+ + Args: + *streams: Input streams to validate + + Raises: + ValidationError: If streams are invalid for this kernel + TypeError: If stream types are incompatible + ValueError: If stream content violates business rules + """ + ... + + def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: + """ + Generate a unique identity structure for this kernel and/or kernel invocation. + When invoked without streams, it should return a structure + that uniquely identifies the kernel itself (e.g., class name, parameters). + When invoked with streams, it should include the identity of the streams + to distinguish different invocations of the same kernel. + + This structure is used for: + - Caching and memoization + - Debugging and error reporting + - Tracking kernel invocations in computational graphs + + Args: + streams: Optional input streams for this invocation. If None, identity_structure is + based solely on the kernel. If streams are provided, they are included in the identity + to differentiate between different invocations of the same kernel. + + Returns: + Any: Unique identity structure (e.g., tuple of class name and stream identities) + """ + ... diff --git a/src/orcapod/protocols/data_protocols/pods.py b/src/orcapod/protocols/data_protocols/pods.py new file mode 100644 index 0000000..80ce1d1 --- /dev/null +++ b/src/orcapod/protocols/data_protocols/pods.py @@ -0,0 +1,208 @@ +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +from orcapod.protocols.data_protocols.base import ExecutionEngine +from orcapod.protocols.data_protocols.datagrams import Packet, Tag +from orcapod.protocols.data_protocols.kernel import Kernel +from orcapod.types import TypeSpec + +if TYPE_CHECKING: + import pyarrow as pa + + +@runtime_checkable +class Pod(Kernel, Protocol): + """ + Specialized kernel for packet-level processing with advanced caching. + + Pods represent a different computational model from regular kernels: + - Process data one packet at a time (enabling fine-grained parallelism) + - Support just-in-time evaluation (computation deferred until needed) + - Provide stricter type contracts (clear input/output schemas) + - Enable advanced caching strategies (packet-level caching) + + The Pod abstraction is ideal for: + - Expensive computations that benefit from caching + - Operations that can be parallelized at the packet level + - Transformations with strict type contracts + - Processing that needs to be deferred until access time + - Functions that operate on individual data items + + Pods use a different execution model where computation is deferred + until results are actually needed, enabling efficient resource usage + and fine-grained caching. + """ + + @property + def version(self) -> str: ... + + def get_record_id(self, packet: Packet, execution_engine_hash: str) -> str: ... + + @property + def tiered_pod_id(self) -> dict[str, str]: + """ + Return a dictionary representation of the tiered pod's unique identifier. + The key is supposed to be ordered from least to most specific, allowing + for hierarchical identification of the pod. + + This is primarily used for tiered memoization/caching strategies. + + Returns: + dict[str, str]: Dictionary representation of the pod's ID + """ + ... + + def input_packet_types(self) -> TypeSpec: + """ + TypeSpec for input packets that this Pod can process. + + Defines the exact schema that input packets must conform to. 
+ Pods are typically much stricter about input types than regular + kernels, requiring precise type matching for their packet-level + processing functions. + + This specification is used for: + - Runtime type validation + - Compile-time type checking + - Schema inference and documentation + - Input validation and error reporting + + Returns: + TypeSpec: Dictionary mapping field names to required packet types + """ + ... + + def output_packet_types(self) -> TypeSpec: + """ + TypeSpec for output packets that this Pod produces. + + Defines the schema of packets that will be produced by this Pod. + This is typically determined by the Pod's computational function + and is used for: + - Type checking downstream kernels + - Schema inference in complex pipelines + - Query planning and optimization + - Documentation and developer tooling + + Returns: + TypeSpec: Dictionary mapping field names to output packet types + """ + ... + + async def async_call( + self, + tag: Tag, + packet: Packet, + record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, + ) -> tuple[Tag, Packet | None]: ... + + def call( + self, + tag: Tag, + packet: Packet, + record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, + ) -> tuple[Tag, Packet | None]: + """ + Process a single packet with its associated tag. + + This is the core method that defines the Pod's computational behavior. + It processes one (tag, packet) pair at a time, enabling: + - Fine-grained caching at the packet level + - Parallelization opportunities + - Just-in-time evaluation + - Filtering operations (by returning None) + + The method signature supports: + - Tag transformation (modify metadata) + - Packet transformation (modify content) + - Filtering (return None to exclude packet) + - Pass-through (return inputs unchanged) + + Args: + tag: Metadata associated with the packet + packet: The data payload to process + + Returns: + tuple[Tag, Packet | None]: + - Tag: Output tag (may be modified from input) + - Packet: Processed packet, or None to filter it out + + Raises: + TypeError: If packet doesn't match input_packet_types + ValueError: If packet data is invalid for processing + """ + ... + + +@runtime_checkable +class CachedPod(Pod, Protocol): + async def async_call( + self, + tag: Tag, + packet: Packet, + record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[Tag, Packet | None]: ... + + def call( + self, + tag: Tag, + packet: Packet, + record_id: str | None = None, + execution_engine: ExecutionEngine | None = None, + skip_cache_lookup: bool = False, + skip_cache_insert: bool = False, + ) -> tuple[Tag, Packet | None]: + """ + Process a single packet with its associated tag. + + This is the core method that defines the Pod's computational behavior. 
+ It processes one (tag, packet) pair at a time, enabling: + - Fine-grained caching at the packet level + - Parallelization opportunities + - Just-in-time evaluation + - Filtering operations (by returning None) + + The method signature supports: + - Tag transformation (modify metadata) + - Packet transformation (modify content) + - Filtering (return None to exclude packet) + - Pass-through (return inputs unchanged) + + Args: + tag: Metadata associated with the packet + packet: The data payload to process + + Returns: + tuple[Tag, Packet | None]: + - Tag: Output tag (may be modified from input) + - Packet: Processed packet, or None to filter it out + + Raises: + TypeError: If packet doesn't match input_packet_types + ValueError: If packet data is invalid for processing + """ + ... + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all records processed by this Pod. + + This method returns a table containing all packets processed by the Pod, + including metadata and system columns if requested. It is useful for: + - Debugging and analysis + - Auditing and data lineage tracking + - Performance monitoring + + Args: + include_system_columns: Whether to include system columns in the output + + Returns: + pa.Table | None: A table containing all processed records, or None if no records are available + """ + ... diff --git a/src/orcapod/protocols/data_protocols/source.py b/src/orcapod/protocols/data_protocols/source.py new file mode 100644 index 0000000..c0befd6 --- /dev/null +++ b/src/orcapod/protocols/data_protocols/source.py @@ -0,0 +1,55 @@ +from typing import Protocol, runtime_checkable + +from orcapod.protocols.data_protocols.kernel import Kernel +from orcapod.protocols.data_protocols.streams import Stream + + +@runtime_checkable +class Source(Kernel, Stream, Protocol): + """ + Entry point for data into the computational graph. + + Sources are special objects that serve dual roles: + - As Kernels: Can be invoked to produce streams + - As Streams: Directly provide data without upstream dependencies + + Sources represent the roots of computational graphs and typically + interface with external data sources. They bridge the gap between + the outside world and the Orcapod computational model. + + Common source types: + - File readers (CSV, JSON, Parquet, etc.) + - Database connections and queries + - API endpoints and web services + - Generated data sources (synthetic data) + - Manual data input and user interfaces + - Message queues and event streams + + Sources have unique properties: + - No upstream dependencies (upstreams is empty) + - Can be both invoked and iterated + - Serve as the starting point for data lineage + - May have their own refresh/update mechanisms + """ + + @property + def tag_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + ... + + @property + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + ... + + # def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": ... + + # def as_polars_df(self, sort_by_tags: bool = False) -> "pl.DataFrame | None": ... + + # def as_pandas_df(self, sort_by_tags: bool = False) -> "pd.DataFrame | None": ... 
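To make the packet-level contract above concrete, here is a minimal, hypothetical sketch: plain dicts stand in for Tag and Packet, and only the packet-processing surface of the Pod protocol is shown (a real Pod must also provide the Kernel members such as forward, output_types, and validate_inputs). The WordCountPod name and its fields are illustrative only and are not part of this patch.

```python
# Minimal sketch of the Pod.call() contract: transform the packet, optionally
# rewrite the tag, or return None to filter the packet out of the stream.
from typing import Any

Tag = dict[str, Any]     # hypothetical stand-in for orcapod's Tag datagram
Packet = dict[str, Any]  # hypothetical stand-in for orcapod's Packet datagram


class WordCountPod:
    """Toy pod that adds a word count to each packet and drops empty text."""

    def input_packet_types(self) -> dict[str, type]:
        return {"text": str}

    def output_packet_types(self) -> dict[str, type]:
        return {"text": str, "word_count": int}

    def call(
        self,
        tag: Tag,
        packet: Packet,
        record_id: str | None = None,
        execution_engine: Any | None = None,
    ) -> tuple[Tag, Packet | None]:
        text = packet["text"]
        if not text.strip():
            return tag, None  # filtering: this packet is excluded downstream
        return tag, {**packet, "word_count": len(text.split())}


if __name__ == "__main__":
    pod = WordCountPod()
    print(pod.call({"id": 1}, {"text": "hello world"}))  # word_count == 2
    print(pod.call({"id": 2}, {"text": "   "}))          # packet filtered: None
```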
diff --git a/src/orcapod/protocols/data_protocols/streams.py b/src/orcapod/protocols/data_protocols/streams.py new file mode 100644 index 0000000..5c4c495 --- /dev/null +++ b/src/orcapod/protocols/data_protocols/streams.py @@ -0,0 +1,424 @@ +from collections.abc import Collection, Iterator, Mapping +from datetime import datetime +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +from orcapod.protocols.data_protocols.base import ExecutionEngine, Labelable +from orcapod.protocols.data_protocols.datagrams import Packet, Tag +from orcapod.protocols.hashing_protocols import ContentIdentifiable +from orcapod.types import TypeSpec + +if TYPE_CHECKING: + import polars as pl + import pyarrow as pa + from orcapod.protocols.data_protocols.kernel import Kernel + + +@runtime_checkable +class Stream(ContentIdentifiable, Labelable, Protocol): + """ + Base protocol for all streams in Orcapod. + + Streams represent sequences of (Tag, Packet) pairs flowing through the + computational graph. They are the fundamental data structure connecting + kernels and carrying both data and metadata. + + Streams can be either: + - Static: Immutable snapshots created at a specific point in time + - Live: Dynamic streams that stay current with upstream dependencies + + All streams provide: + - Iteration over (tag, packet) pairs + - Type information and schema access + - Lineage information (source kernel and upstream streams) + - Basic caching and freshness tracking + - Conversion to common formats (tables, dictionaries) + """ + + @property + def substream_identities(self) -> tuple[str, ...]: + """ + Unique identifiers for sub-streams within this stream. + + This property provides a way to identify and differentiate + sub-streams that may be part of a larger stream. It is useful + for tracking and managing complex data flows. + + Returns: + tuple[str, ...]: Unique identifiers for each sub-stream + """ + ... + + @property + def execution_engine(self) -> ExecutionEngine | None: + """ + The execution engine attached to this stream. By default, the stream + will use this execution engine whenever it needs to perform computation. + None means the stream is not attached to any execution engine and will default + to running natively. + """ + + @execution_engine.setter + def execution_engine(self, engine: ExecutionEngine | None) -> None: + """ + Set the execution engine for this stream. + + This allows the stream to use a specific execution engine for + computation, enabling optimized execution strategies and resource + management. + + Args: + engine: The execution engine to attach to this stream + """ + ... + + def get_substream(self, substream_id: str) -> "Stream": + """ + Retrieve a specific sub-stream by its identifier. + + This method allows access to individual sub-streams within the + main stream, enabling focused operations on specific data segments. + + Args: + substream_id: Unique identifier for the desired sub-stream. + + Returns: + Stream: The requested sub-stream if it exists + """ + ... + + @property + def source(self) -> "Kernel | None": + """ + The kernel that produced this stream. + + This provides lineage information for tracking data flow through + the computational graph. Root streams (like file sources) may + have no source kernel. + + Returns: + Kernel: The source kernel that created this stream + None: This is a root stream with no source kernel + """ + ... + + @property + def upstreams(self) -> tuple["Stream", ...]: + """ + Input streams used to produce this stream. 
+ + These are the streams that were provided as input to the source + kernel when this stream was created. Used for dependency tracking + and cache invalidation. + + Returns: + tuple[Stream, ...]: Upstream dependency streams (empty for sources) + """ + ... + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Available keys/fields in the stream content. + + Returns the field names present in both tags and packets. + This provides schema information without requiring type details, + useful for: + - Schema inspection and exploration + - Query planning and optimization + - Field validation and mapping + + Returns: + tuple[tuple[str, ...], tuple[str, ...]]: (tag_keys, packet_keys) + """ + ... + + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + """ + Type specifications for the stream content. + + Returns the type schema for both tags and packets in this stream. + This information is used for: + - Type checking and validation + - Schema inference and planning + - Compatibility checking between kernels + + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) + """ + ... + + @property + def last_modified(self) -> datetime | None: + """ + When the stream's content was last modified. + + This property is crucial for caching decisions and dependency tracking: + - datetime: Content was last modified at this time (cacheable) + - None: Content is never stable, always recompute (some dynamic streams) + + Both static and live streams typically return datetime values, but + live streams update this timestamp whenever their content changes. + + Returns: + datetime: Timestamp of last modification for most streams + None: Stream content is never stable (some special dynamic streams) + """ + ... + + @property + def is_current(self) -> bool: + """ + Whether the stream is up-to-date with its dependencies. + + A stream is current if its content reflects the latest state of its + source kernel and upstream streams. This is used for cache validation + and determining when refresh is needed. + + For live streams, this should always return True since they stay + current automatically. For static streams, this indicates whether + the cached content is still valid. + + Returns: + bool: True if stream is up-to-date, False if refresh needed + """ + ... + + def __iter__(self) -> Iterator[tuple[Tag, Packet]]: + """ + Iterate over (tag, packet) pairs in the stream. + + This is the primary way to access stream data. The behavior depends + on the stream type: + - Static streams: Return cached/precomputed data + - Live streams: May trigger computation and always reflect current state + + Yields: + tuple[Tag, Packet]: Sequential (tag, packet) pairs + """ + ... + + def iter_packets( + self, execution_engine: ExecutionEngine | None = None + ) -> Iterator[tuple[Tag, Packet]]: + """ + Alias for __iter__ for explicit packet iteration. + + Provides a more explicit method name when the intent is to iterate + over packets specifically, improving code readability. + + This method must return an immutable iterator -- that is, the returned iterator + should not change and must consistently return identical tag,packet pairs across + multiple iterations of the iterator. + + Note that this is NOT to mean that multiple invocation of `iter_packets` must always + return an identical iterator. The iterator returned by `iter_packets` may change + between invocations, but the iterator itself must not change. 
Consequently, the returned iterators
+        may place a significant burden on memory if the stream is large or infinite.
+
+        Yields:
+            tuple[Tag, Packet]: Sequential (tag, packet) pairs
+        """
+        ...
+
+    def run(self, execution_engine: ExecutionEngine | None = None) -> None:
+        """
+        Execute the stream using the provided execution engine.
+
+        This method triggers computation of the stream content based on its
+        source kernel and upstream streams. It returns None; use iter_packets(),
+        as_table(), or flow() to access the computed (tag, packet) pairs.
+
+        Args:
+            execution_engine: The execution engine to use for computation
+        """
+        ...
+
+    async def run_async(self, execution_engine: ExecutionEngine | None = None) -> None:
+        """
+        Asynchronously execute the stream using the provided execution engine.
+
+        This method triggers computation of the stream content based on its
+        source kernel and upstream streams. It returns None; use iter_packets(),
+        as_table(), or flow() to access the computed (tag, packet) pairs.
+
+        Args:
+            execution_engine: The execution engine to use for computation
+        """
+        ...
+
+    def as_df(
+        self,
+        include_data_context: bool = False,
+        include_source: bool = False,
+        include_system_tags: bool = False,
+        include_content_hash: bool | str = False,
+        sort_by_tags: bool = True,
+        execution_engine: ExecutionEngine | None = None,
+    ) -> "pl.DataFrame | None":
+        """
+        Convert the entire stream to a Polars DataFrame.
+        """
+        ...
+
+    def as_table(
+        self,
+        include_data_context: bool = False,
+        include_source: bool = False,
+        include_system_tags: bool = False,
+        include_content_hash: bool | str = False,
+        sort_by_tags: bool = True,
+        execution_engine: ExecutionEngine | None = None,
+    ) -> "pa.Table":
+        """
+        Convert the entire stream to a PyArrow Table.
+
+        Materializes all (tag, packet) pairs into a single table for
+        analysis and processing. This operation may be expensive for
+        large streams or live streams that need computation.
+
+        If include_content_hash is True, an additional column called "_content_hash"
+        containing the content hash of each packet is included. If include_content_hash
+        is a string, it is used as the name of the content hash column.
+
+        Returns:
+            pa.Table: Complete stream data as a PyArrow Table
+        """
+        ...
+
+    def flow(
+        self, execution_engine: ExecutionEngine | None = None
+    ) -> Collection[tuple[Tag, Packet]]:
+        """
+        Return the entire stream as a collection of (tag, packet) pairs.
+
+        This method materializes the stream content into a list or similar
+        collection type. It is useful for small streams or when you need
+        to process all data at once.
+
+        Args:
+            execution_engine: Optional execution engine to use for computation.
+                If None, the stream will use its default execution engine.
+
+        Returns:
+            Collection[tuple[Tag, Packet]]: All (tag, packet) pairs in the stream
+        """
+        ...
+
+    def join(self, other_stream: "Stream") -> "Stream":
+        """
+        Join this stream with another stream.
+
+        Combines two streams into a single stream by merging their content.
+        The resulting stream contains all (tag, packet) pairs from both
+        streams, preserving their order.
+
+        Args:
+            other_stream: The other stream to join with this one.
+
+        Returns:
+            Stream: New stream containing combined content from both streams.
+        """
+        ...
+
+    def semi_join(self, other_stream: "Stream") -> "Stream":
+        """
+        Perform a semi-join with another stream.
+
+        This operation filters this stream to only include packets that have
+        corresponding tags in the other stream. The resulting stream contains
+        all (tag, packet) pairs from this stream that match tags in the other.
+ + Args: + other_stream: The other stream to semi-join with this one. + + Returns: + Self: New stream containing filtered content based on the semi-join. + """ + ... + + def map_tags( + self, name_map: Mapping[str, str], drop_unmapped: bool = True + ) -> "Stream": + """ + Map tag names in this stream to new names based on the provided mapping. + """ + ... + + def map_packets( + self, name_map: Mapping[str, str], drop_unmapped: bool = True + ) -> "Stream": + """ + Map packet names in this stream to new names based on the provided mapping. + """ + ... + + +@runtime_checkable +class LiveStream(Stream, Protocol): + """ + A stream that automatically stays up-to-date with its upstream dependencies. + + LiveStream extends the base Stream protocol with capabilities for "up-to-date" + data flow and reactive computation. Unlike static streams which represent + snapshots, LiveStreams provide the guarantee that their content always + reflects the current state of their dependencies. + + Key characteristics: + - Automatically refresh the stream if changes in the upstreams are detected + - Track last_modified timestamp when content changes + - Support manual refresh triggering and invalidation + - By design, LiveStream would return True for is_current except when auto-update fails. + + LiveStreams are always returned by Kernel.__call__() methods, ensuring + that normal kernel usage produces live, up-to-date results. + + Caching behavior: + - last_modified updates whenever content changes + - Can be cached based on dependency timestamps + - Invalidation happens automatically when upstreams change + + Use cases: + - Real-time data processing pipelines + - Reactive user interfaces + - Monitoring and alerting systems + - Dynamic dashboard updates + - Any scenario requiring current data + """ + + def refresh(self, force: bool = False) -> bool: + """ + Manually trigger a refresh of this stream's content. + + Forces the stream to check its upstream dependencies and update + its content if necessary. This is useful when: + - You want to ensure the latest data before a critical operation + - You need to force computation at a specific time + - You're debugging data flow issues + - You want to pre-compute results for performance + Args: + force: If True, always refresh even if the stream is current. + If False, only refresh if the stream is not current. + + Returns: + bool: True if the stream was refreshed, False if it was already current. + Note: LiveStream refreshes automatically on access, so this + method may be a no-op for some implementations. However, it's + always safe to call if you need to control when the cache is refreshed. + """ + ... + + def invalidate(self) -> None: + """ + Mark this stream as invalid, forcing a refresh on next access. + + This method is typically called when: + - Upstream dependencies have changed + - The source kernel has been modified + - External data sources have been updated + - Manual cache invalidation is needed + + The stream will automatically refresh its content the next time + it's accessed (via iteration, as_table(), etc.). + + This is more efficient than immediate refresh when you know the + data will be accessed later. + """ + ... 
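As a rough illustration of how the refresh()/invalidate() contract above can be honored, the sketch below tracks is_current and last_modified on a toy object. The ToyLiveStream name and its _recompute hook are hypothetical and not part of this patch; a real LiveStream would also implement the full Stream surface.

```python
# Minimal sketch of the refresh()/invalidate() bookkeeping described above.
from datetime import datetime, timezone


class ToyLiveStream:
    def __init__(self) -> None:
        self._current = False
        self._last_modified: datetime | None = None

    @property
    def is_current(self) -> bool:
        return self._current

    @property
    def last_modified(self) -> datetime | None:
        return self._last_modified

    def _recompute(self) -> None:
        # Placeholder for pulling fresh data from upstream dependencies.
        self._last_modified = datetime.now(timezone.utc)
        self._current = True

    def refresh(self, force: bool = False) -> bool:
        """Recompute only when stale, unless force=True."""
        if self._current and not force:
            return False
        self._recompute()
        return True

    def invalidate(self) -> None:
        """Mark stale; the next refresh (or access) recomputes."""
        self._current = False


stream = ToyLiveStream()
print(stream.refresh())  # True: first computation
print(stream.refresh())  # False: already current
stream.invalidate()
print(stream.refresh())  # True: recomputed after invalidation
```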
diff --git a/src/orcapod/protocols/data_protocols/trackers.py b/src/orcapod/protocols/data_protocols/trackers.py new file mode 100644 index 0000000..0e983db --- /dev/null +++ b/src/orcapod/protocols/data_protocols/trackers.py @@ -0,0 +1,213 @@ +from typing import Protocol, runtime_checkable +from contextlib import AbstractContextManager +from orcapod.protocols.data_protocols.kernel import Kernel +from orcapod.protocols.data_protocols.pods import Pod +from orcapod.protocols.data_protocols.source import Source +from orcapod.protocols.data_protocols.streams import Stream + + +@runtime_checkable +class Tracker(Protocol): + """ + Records kernel invocations and stream creation for computational graph tracking. + + Trackers are responsible for maintaining the computational graph by recording + relationships between kernels, streams, and invocations. They enable: + - Lineage tracking and data provenance + - Caching and memoization strategies + - Debugging and error analysis + - Performance monitoring and optimization + - Reproducibility and auditing + + Multiple trackers can be active simultaneously, each serving different + purposes (e.g., one for caching, another for debugging, another for + monitoring). This allows for flexible and composable tracking strategies. + + Trackers can be selectively activated/deactivated to control overhead + and focus on specific aspects of the computational graph. + """ + + def set_active(self, active: bool = True) -> None: + """ + Set the active state of the tracker. + + When active, the tracker will record all kernel invocations and + stream creations. When inactive, no recording occurs, reducing + overhead for performance-critical sections. + + Args: + active: True to activate recording, False to deactivate + """ + ... + + def is_active(self) -> bool: + """ + Check if the tracker is currently recording invocations. + + Returns: + bool: True if tracker is active and recording, False otherwise + """ + ... + + def record_kernel_invocation( + self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None + ) -> None: + """ + Record a kernel invocation in the computational graph. + + This method is called whenever a kernel is invoked. The tracker + should record: + - The kernel and its properties + - The input streams that were used as input + - Timing and performance information + - Any relevant metadata + + Args: + kernel: The kernel that was invoked + upstreams: The input streams used for this invocation + """ + ... + + def record_source_invocation( + self, source: Source, label: str | None = None + ) -> None: + """ + Record a source invocation in the computational graph. + + This method is called whenever a source is invoked. The tracker + should record: + - The source and its properties + - Timing and performance information + - Any relevant metadata + + Args: + source: The source that was invoked + """ + ... + + def record_pod_invocation( + self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None + ) -> None: + """ + Record a pod invocation in the computational graph. + + This method is called whenever a pod is invoked. The tracker + should record: + - The pod and its properties + - The upstream streams that were used as input + - Timing and performance information + - Any relevant metadata + + Args: + pod: The pod that was invoked + upstreams: The input streams used for this invocation + """ + ... + + +@runtime_checkable +class TrackerManager(Protocol): + """ + Manages multiple trackers and coordinates their activity. 
+
+    The TrackerManager provides a centralized way to:
+    - Register and manage multiple trackers
+    - Coordinate recording across all active trackers
+    - Provide a single interface for graph recording
+    - Enable dynamic tracker registration/deregistration
+
+    This design allows for:
+    - Multiple concurrent tracking strategies
+    - Pluggable tracking implementations
+    - Easy testing and debugging (mock trackers)
+    - Performance optimization (selective tracking)
+    """
+
+    def get_active_trackers(self) -> list[Tracker]:
+        """
+        Get all currently active trackers.
+
+        Returns only trackers that are both registered and active,
+        providing the list of trackers that will receive recording events.
+
+        Returns:
+            list[Tracker]: List of trackers that are currently recording
+        """
+        ...
+
+    def register_tracker(self, tracker: Tracker) -> None:
+        """
+        Register a new tracker in the system.
+
+        The tracker will be included in future recording operations
+        if it is active. Registration is separate from activation
+        to allow for dynamic control of tracking overhead.
+
+        Args:
+            tracker: The tracker to register
+        """
+        ...
+
+    def deregister_tracker(self, tracker: Tracker) -> None:
+        """
+        Remove a tracker from the system.
+
+        The tracker will no longer receive recording notifications
+        even if it is still active. This is useful for:
+        - Cleaning up temporary trackers
+        - Removing failed or problematic trackers
+        - Dynamic tracker management
+
+        Args:
+            tracker: The tracker to remove
+        """
+        ...
+
+    def record_kernel_invocation(
+        self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None
+    ) -> None:
+        """
+        Record a kernel invocation in all active trackers.
+
+        This method broadcasts the kernel invocation to all currently
+        active and registered trackers. It provides a single point
+        of entry for recording events, simplifying kernel implementations.
+
+        Args:
+            kernel: The kernel that was invoked
+            upstreams: The input streams used for this invocation
+            label: Optional label for this invocation
+        """
+        ...
+
+    def record_source_invocation(
+        self, source: Source, label: str | None = None
+    ) -> None:
+        """
+        Record a source invocation in all active trackers.
+
+        This method broadcasts the source invocation to all currently
+        active and registered trackers. It provides a single point
+        of entry for recording events, simplifying source implementations.
+
+        Args:
+            source: The source that was invoked
+            label: Optional label for this invocation
+        """
+        ...
+
+    def record_pod_invocation(
+        self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None
+    ) -> None:
+        """
+        Record a pod invocation in all active trackers.
+
+        This method broadcasts the pod invocation to all currently
+        active and registered trackers. It provides a single point
+        of entry for recording events, simplifying pod implementations.
+
+        Args:
+            pod: The pod that was invoked
+            upstreams: The input streams used for this invocation
+            label: Optional label for this invocation
+        """
+        ...
+
+    def no_tracking(self) -> AbstractContextManager[None]: ...
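The sketch below shows one way the Tracker and TrackerManager protocols above can fit together: a tracker that appends invocations to a list, and a manager that broadcasts to every active tracker and suspends recording inside no_tracking(). InMemoryTracker and SimpleTrackerManager are hypothetical names used only for illustration and are not part of this patch.

```python
# Minimal sketch of the Tracker / TrackerManager interplay described above.
from contextlib import contextmanager
from collections.abc import Iterator
from typing import Any


class InMemoryTracker:
    def __init__(self) -> None:
        self._active = True
        self.records: list[tuple[str, Any]] = []

    def set_active(self, active: bool = True) -> None:
        self._active = active

    def is_active(self) -> bool:
        return self._active

    def record_kernel_invocation(self, kernel, upstreams, label=None) -> None:
        self.records.append(("kernel", (kernel, upstreams, label)))


class SimpleTrackerManager:
    def __init__(self) -> None:
        self._trackers: list[InMemoryTracker] = []
        self._suspended = False

    def register_tracker(self, tracker: InMemoryTracker) -> None:
        self._trackers.append(tracker)

    def get_active_trackers(self) -> list[InMemoryTracker]:
        if self._suspended:
            return []
        return [t for t in self._trackers if t.is_active()]

    def record_kernel_invocation(self, kernel, upstreams, label=None) -> None:
        # Broadcast to every registered tracker that is currently active.
        for tracker in self.get_active_trackers():
            tracker.record_kernel_invocation(kernel, upstreams, label)

    @contextmanager
    def no_tracking(self) -> Iterator[None]:
        self._suspended = True
        try:
            yield
        finally:
            self._suspended = False


manager = SimpleTrackerManager()
tracker = InMemoryTracker()
manager.register_tracker(tracker)
manager.record_kernel_invocation("my_kernel", (), label="demo")      # recorded
with manager.no_tracking():
    manager.record_kernel_invocation("my_kernel", (), label="skipped")
print(len(tracker.records))  # 1
```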
diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index ca19512..75fffbf 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -1,15 +1,72 @@ """Hash strategy protocols for dependency injection.""" -from collections.abc import Callable -from typing import Any, Protocol, runtime_checkable, TYPE_CHECKING import uuid +from collections.abc import Callable +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable -from orcapod.types import TypeSpec, PathLike +from orcapod.types import PathLike, TypeSpec if TYPE_CHECKING: import pyarrow as pa +@dataclass(frozen=True, slots=True) +class ContentHash: + method: str + digest: bytes + + def to_hex(self, char_count: int | None = None) -> str: + """Convert digest to hex string, optionally truncated.""" + hex_str = self.digest.hex() + return hex_str[:char_count] if char_count else hex_str + + def to_int(self, hexdigits: int = 16) -> int: + """ + Convert digest to integer representation. + + Args: + hexdigits: Number of hex digits to use (truncates if needed) + + Returns: + Integer representation of the hash + """ + hex_str = self.to_hex()[:hexdigits] + return int(hex_str, 16) + + def to_uuid(self, namespace: uuid.UUID = uuid.NAMESPACE_OID) -> uuid.UUID: + """ + Convert digest to UUID format. + + Args: + namespace: UUID namespace for uuid5 generation + + Returns: + UUID derived from this hash + """ + # Using uuid5 with the hex string ensures deterministic UUIDs + return uuid.uuid5(namespace, self.to_hex()) + + def to_base64(self) -> str: + """Convert digest to base64 string.""" + import base64 + + return base64.b64encode(self.digest).decode("ascii") + + def __str__(self) -> str: + return f"{self.method}:{self.to_hex()}" + + @classmethod + def from_string(cls, hash_string: str) -> "ContentHash": + """Parse 'method:hex_digest' format.""" + method, hex_digest = hash_string.split(":", 1) + return cls(method, bytes.fromhex(hex_digest)) + + def display_name(self, length: int = 8) -> str: + """Return human-friendly display like 'arrow_v2.1:1a2b3c4d'.""" + return f"{self.method}:{self.to_hex(length)}" + + @runtime_checkable class ContentIdentifiable(Protocol): """Protocol for objects that can provide an identity structure.""" @@ -25,7 +82,7 @@ def identity_structure(self) -> Any: """ ... - def content_hash(self) -> bytes: + def content_hash(self) -> ContentHash: """ Compute a hash based on the content of this object. @@ -62,7 +119,7 @@ class ObjectHasher(Protocol): """Protocol for general object hashing.""" # TODO: consider more explicitly stating types of objects accepted - def hash(self, obj: Any) -> bytes: + def hash_object(self, obj: Any) -> ContentHash: """ Hash an object to a byte representation. Object hasher must be able to handle ContentIdentifiable objects to hash them based on their @@ -85,37 +142,11 @@ def hasher_id(self) -> str: """ ... - def hash_to_hex( - self, - obj: Any, - char_count: int | None = None, - prefix_hasher_id: bool = True, - ) -> str: ... - - def hash_to_int(self, obj: Any, hexdigits: int = 16) -> int: - """ - Hash an object to an integer. - - Args: - obj (Any): The object to hash. - hexdigits (int): Number of hexadecimal digits to use for the hash. - - Returns: - int: The integer representation of the hash. - """ - ... - - def hash_to_uuid( - self, - obj: Any, - namespace: uuid.UUID = uuid.NAMESPACE_OID, - ) -> uuid.UUID: ... 
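For reference, a short usage sketch of the ContentHash dataclass introduced in this hunk. It assumes the package is importable with the layout shown in this patch; the payload bytes are arbitrary.

```python
# Usage sketch for the ContentHash dataclass added above.
import hashlib

from orcapod.protocols.hashing_protocols import ContentHash

digest = hashlib.sha256(b"example payload").digest()
h = ContentHash(method="sha256", digest=digest)

print(h)                  # 'sha256:<full hex digest>'
print(h.display_name(8))  # 'sha256:<first 8 hex chars>'
print(h.to_int(16))       # integer built from the first 16 hex digits
print(h.to_uuid())        # deterministic uuid5 derived from the hex digest

# 'method:hex' strings round-trip through from_string()
assert ContentHash.from_string(str(h)) == h
```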
- class FileContentHasher(Protocol): """Protocol for file-related hashing.""" - def hash_file(self, file_path: PathLike) -> bytes: ... + def hash_file(self, file_path: PathLike) -> ContentHash: ... class ArrowHasher(Protocol): @@ -123,7 +154,9 @@ class ArrowHasher(Protocol): def get_hasher_id(self) -> str: ... - def hash_table(self, table: "pa.Table", prefix_hasher_id: bool = True) -> str: ... + def hash_table( + self, table: "pa.Table", prefix_hasher_id: bool = True + ) -> ContentHash: ... class StringCacher(Protocol): @@ -160,7 +193,7 @@ def hash_column( self, column: "pa.Array", ) -> "pa.Array": - """Hash a column with this semantic type and return the hash bytes.""" + """Hash a column with this semantic type and return the hash bytes an an array""" ... def set_cacher(self, cacher: StringCacher) -> None: diff --git a/src/orcapod/protocols/legacy_data_protocols.py b/src/orcapod/protocols/legacy_data_protocols.py new file mode 100644 index 0000000..53a8657 --- /dev/null +++ b/src/orcapod/protocols/legacy_data_protocols.py @@ -0,0 +1,2278 @@ +# from collections.abc import Collection, Iterator, Mapping, Callable +# from datetime import datetime +# from typing import Any, ContextManager, Protocol, Self, TYPE_CHECKING, runtime_checkable +# from orcapod.protocols.hashing_protocols import ContentIdentifiable, ContentHash +# from orcapod.types import DataValue, TypeSpec + + +# if TYPE_CHECKING: +# import pyarrow as pa +# import polars as pl +# import pandas as pd + + +# @runtime_checkable +# class ExecutionEngine(Protocol): +# @property +# def name(self) -> str: ... + +# def submit_sync(self, function: Callable, *args, **kwargs) -> Any: +# """ +# Run the given function with the provided arguments. +# This method should be implemented by the execution engine. +# """ +# ... + +# async def submit_async(self, function: Callable, *args, **kwargs) -> Any: +# """ +# Asynchronously run the given function with the provided arguments. +# This method should be implemented by the execution engine. +# """ +# ... + +# # TODO: consider adding batch submission + + +# @runtime_checkable +# class Datagram(ContentIdentifiable, Protocol): +# """ +# Protocol for immutable datagram containers in Orcapod. + +# Datagrams are the fundamental units of data that flow through the system. +# They provide a unified interface for data access, conversion, and manipulation, +# ensuring consistent behavior across different storage backends (dict, Arrow table, etc.). + +# Each datagram contains: +# - **Data columns**: The primary business data (user_id, name, etc.) +# - **Meta columns**: Internal system metadata with {orcapod.META_PREFIX} (typically '__') prefixes (e.g. __processed_at, etc.) +# - **Context column**: Data context information ({orcapod.CONTEXT_KEY}) + +# Derivative of datagram (such as Packet or Tag) will also include some specific columns pertinent to the function of the specialized datagram: +# - **Source info columns**: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) used in Packet +# - **System tags**: Internal tags for system use, typically prefixed with {orcapod.SYSTEM_TAG_PREFIX} ('_system_') (_system_created_at, etc.) used in Tag + +# All operations are by design immutable - methods return new datagram instances rather than +# modifying existing ones. 
+ +# Example: +# >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"}) +# >>> updated = datagram.update(name="Alice Smith") +# >>> filtered = datagram.select("user_id", "name") +# >>> table = datagram.as_table() +# """ + +# # 1. Core Properties (Identity & Structure) +# @property +# def data_context_key(self) -> str: +# """ +# Return the data context key for this datagram. + +# This key identifies a collection of system components that collectively controls +# how information is serialized, hashed and represented, including the semantic type registry, +# arrow data hasher, and other contextual information. Same piece of information (that is two datagrams +# with an identical *logical* content) may bear distinct internal representation if they are +# represented under two distinct data context, as signified by distinct data context keys. + +# Returns: +# str: Context key for proper datagram interpretation +# """ +# ... + +# @property +# def meta_columns(self) -> tuple[str, ...]: +# """Return tuple of meta column names (with {orcapod.META_PREFIX} ('__') prefix).""" +# ... + +# # 2. Dict-like Interface (Data Access) +# def __getitem__(self, key: str) -> DataValue: +# """ +# Get data column value by key. + +# Provides dict-like access to data columns only. Meta columns +# are not accessible through this method (use `get_meta_value()` instead). + +# Args: +# key: Data column name. + +# Returns: +# The value stored in the specified data column. + +# Raises: +# KeyError: If the column doesn't exist in data columns. + +# Example: +# >>> datagram["user_id"] +# 123 +# >>> datagram["name"] +# 'Alice' +# """ +# ... + +# def __contains__(self, key: str) -> bool: +# """ +# Check if data column exists. + +# Args: +# key: Column name to check. + +# Returns: +# True if column exists in data columns, False otherwise. + +# Example: +# >>> "user_id" in datagram +# True +# >>> "nonexistent" in datagram +# False +# """ +# ... + +# def __iter__(self) -> Iterator[str]: +# """ +# Iterate over data column names. + +# Provides for-loop support over column names, enabling natural iteration +# patterns without requiring conversion to dict. + +# Yields: +# Data column names in no particular order. + +# Example: +# >>> for column in datagram: +# ... value = datagram[column] +# ... print(f"{column}: {value}") +# """ +# ... + +# def get(self, key: str, default: DataValue = None) -> DataValue: +# """ +# Get data column value with default fallback. + +# Args: +# key: Data column name. +# default: Value to return if column doesn't exist. + +# Returns: +# Column value if exists, otherwise the default value. + +# Example: +# >>> datagram.get("user_id") +# 123 +# >>> datagram.get("missing", "default") +# 'default' +# """ +# ... + +# # 3. Structural Information +# def keys( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# ) -> tuple[str, ...]: +# """ +# Return tuple of column names. + +# Provides access to column names with filtering options for different +# column types. Default returns only data column names. + +# Args: +# include_meta_columns: Controls meta column inclusion. +# - False: Return only data column names (default) +# - True: Include all meta column names +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context column. + +# Returns: +# Tuple of column names based on inclusion criteria. 
+ +# Example: +# >>> datagram.keys() # Data columns only +# ('user_id', 'name', 'email') +# >>> datagram.keys(include_meta_columns=True) +# ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') +# >>> datagram.keys(include_meta_columns=["pipeline"]) +# ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') +# >>> datagram.keys(include_context=True) +# ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') +# """ +# ... + +# def types( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# ) -> TypeSpec: +# """ +# Return type specification mapping field names to Python types. + +# The TypeSpec enables type checking and validation throughout the system. + +# Args: +# include_meta_columns: Controls meta column type inclusion. +# - False: Exclude meta column types (default) +# - True: Include all meta column types +# - Collection[str]: Include meta column types matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context type. + +# Returns: +# TypeSpec mapping field names to their Python types. + +# Example: +# >>> datagram.types() +# {'user_id': , 'name': } +# """ +# ... + +# def arrow_schema( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# ) -> "pa.Schema": +# """ +# Return PyArrow schema representation. + +# The schema provides structured field and type information for efficient +# serialization and deserialization with PyArrow. + +# Args: +# include_meta_columns: Controls meta column schema inclusion. +# - False: Exclude meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context column. + +# Returns: +# PyArrow Schema describing the datagram structure. + +# Example: +# >>> schema = datagram.arrow_schema() +# >>> schema.names +# ['user_id', 'name'] +# """ +# ... + +# # 4. Format Conversions (Export) +# def as_dict( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# ) -> dict[str, DataValue]: +# """ +# Convert datagram to dictionary format. + +# Provides a simple key-value representation useful for debugging, +# serialization, and interop with dict-based APIs. + +# Args: +# include_meta_columns: Controls meta column inclusion. +# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include the context key. +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. + + +# Returns: +# Dictionary with requested columns as key-value pairs. + +# Example: +# >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} +# >>> full_data = datagram.as_dict( +# ... include_meta_columns=True, +# ... include_context=True +# ... ) +# """ +# ... 
+ +# def as_table( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# ) -> "pa.Table": +# """ +# Convert datagram to PyArrow Table format. + +# Provides a standardized columnar representation suitable for analysis, +# processing, and interoperability with Arrow-based tools. + +# Args: +# include_meta_columns: Controls meta column inclusion. +# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include the context column. +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. + +# Returns: +# PyArrow Table with requested columns. + +# Example: +# >>> table = datagram.as_table() # Data columns only +# >>> full_table = datagram.as_table( +# ... include_meta_columns=True, +# ... include_context=True +# ... ) +# >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" +# """ +# ... + +# def as_arrow_compatible_dict( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# ) -> dict[str, Any]: +# """ +# Return dictionary with values optimized for Arrow table conversion. + +# This method returns a dictionary where values are in a form that can be +# efficiently converted to Arrow format using pa.Table.from_pylist(). + +# The key insight is that this avoids the expensive as_table() → concat pattern +# by providing values that are "Arrow-ready" while remaining in dict format +# for efficient batching. + +# Implementation note: This may involve format conversions (e.g., Path objects +# to strings, datetime objects to ISO strings, etc.) to ensure compatibility +# with Arrow's expected input formats. + +# Arrow table that results from pa.Table.from_pylist on the output of this should be accompanied +# with arrow_schema(...) with the same argument options to ensure that the schema matches the table. + +# Args: +# include_all_info: Include all available information +# include_meta_columns: Controls meta column inclusion +# include_context: Whether to include context key + +# Returns: +# Dictionary with values optimized for Arrow conversion + +# Example: +# # Efficient batch conversion pattern +# arrow_dicts = [datagram.as_arrow_compatible_dict() for datagram in datagrams] +# schema = datagrams[0].arrow_schema() +# table = pa.Table.from_pylist(arrow_dicts, schema=schema) +# """ +# ... + +# # 5. Meta Column Operations +# def get_meta_value(self, key: str, default: DataValue = None) -> DataValue: +# """ +# Get meta column value with optional default. + +# Meta columns store operational metadata and use {orcapod.META_PREFIX} ('__') prefixes. +# This method handles both prefixed and unprefixed key formats. + +# Args: +# key: Meta column key (with or without {orcapod.META_PREFIX} ('__') prefix). +# default: Value to return if meta column doesn't exist. + +# Returns: +# Meta column value if exists, otherwise the default value. + +# Example: +# >>> datagram.get_meta_value("pipeline_version") # Auto-prefixed +# 'v2.1.0' +# >>> datagram.get_meta_value("__pipeline_version") # Already prefixed +# 'v2.1.0' +# >>> datagram.get_meta_value("missing", "default") +# 'default' +# """ +# ... 
+ +# def with_meta_columns(self, **updates: DataValue) -> Self: +# """ +# Create new datagram with updated meta columns. + +# Adds or updates operational metadata while preserving all data columns. +# Keys are automatically prefixed with {orcapod.META_PREFIX} ('__') if needed. + +# Args: +# **updates: Meta column updates as keyword arguments. + +# Returns: +# New datagram instance with updated meta columns. + +# Example: +# >>> tracked = datagram.with_meta_columns( +# ... processed_by="pipeline_v2", +# ... timestamp="2024-01-15T10:30:00Z" +# ... ) +# """ +# ... + +# def drop_meta_columns(self, *keys: str, ignore_missing: bool = False) -> Self: +# """ +# Create new datagram with specified meta columns removed. + +# Args: +# *keys: Meta column keys to remove (prefixes optional). +# ignore_missing: If True, ignore missing columns without raising an error. + + +# Returns: +# New datagram instance without specified meta columns. + +# Raises: +# KeryError: If any specified meta column to drop doesn't exist and ignore_missing=False. + +# Example: +# >>> cleaned = datagram.drop_meta_columns("old_source", "temp_debug") +# """ +# ... + +# # 6. Data Column Operations +# def select(self, *column_names: str) -> Self: +# """ +# Create new datagram with only specified data columns. + +# Args: +# *column_names: Data column names to keep. + + +# Returns: +# New datagram instance with only specified data columns. All other columns including +# meta columns and context are preserved. + +# Raises: +# KeyError: If any specified column doesn't exist. + +# Example: +# >>> subset = datagram.select("user_id", "name", "email") +# """ +# ... + +# def drop(self, *column_names: str, ignore_missing: bool = False) -> Self: +# """ +# Create new datagram with specified data columns removed. Note that this does not +# remove meta columns or context column. Refer to `drop_meta_columns()` for dropping +# specific meta columns. Context key column can never be dropped but a modified copy +# can be created with a different context key using `with_data_context()`. + +# Args: +# *column_names: Data column names to remove. +# ignore_missing: If True, ignore missing columns without raising an error. + +# Returns: +# New datagram instance without specified data columns. + +# Raises: +# KeryError: If any specified column to drop doesn't exist and ignore_missing=False. + +# Example: +# >>> filtered = datagram.drop("temp_field", "debug_info") +# """ +# ... + +# def rename( +# self, +# column_mapping: Mapping[str, str], +# ) -> Self: +# """ +# Create new datagram with data columns renamed. + +# Args: +# column_mapping: Mapping from old names to new names. + +# Returns: +# New datagram instance with renamed data columns. + +# Example: +# >>> renamed = datagram.rename( +# ... {"old_id": "user_id", "old_name": "full_name"}, +# ... column_types={"user_id": int} +# ... ) +# """ +# ... + +# def update(self, **updates: DataValue) -> Self: +# """ +# Create new datagram with existing column values updated. + +# Updates values in existing data columns. Will error if any specified +# column doesn't exist - use with_columns() to add new columns. + +# Args: +# **updates: Column names and their new values. + +# Returns: +# New datagram instance with updated values. + +# Raises: +# KeyError: If any specified column doesn't exist. + +# Example: +# >>> updated = datagram.update( +# ... file_path="/new/absolute/path.txt", +# ... status="processed" +# ... ) +# """ +# ... 
+ +# def with_columns( +# self, +# column_types: Mapping[str, type] | None = None, +# **updates: DataValue, +# ) -> Self: +# """ +# Create new datagram with additional data columns. + +# Adds new data columns to the datagram. Will error if any specified +# column already exists - use update() to modify existing columns. + +# Args: +# column_types: Optional type specifications for new columns. If not provided, the column type is +# inferred from the provided values. If value is None, the column type defaults to `str`. +# **kwargs: New columns as keyword arguments. + +# Returns: +# New datagram instance with additional data columns. + +# Raises: +# ValueError: If any specified column already exists. + +# Example: +# >>> expanded = datagram.with_columns( +# ... status="active", +# ... score=95.5, +# ... column_types={"score": float} +# ... ) +# """ +# ... + +# # 7. Context Operations +# def with_context_key(self, new_context_key: str) -> Self: +# """ +# Create new datagram with different context key. + +# Changes the semantic interpretation context while preserving all data. +# The context key affects how columns are processed and converted. + +# Args: +# new_context_key: New context key string. + +# Returns: +# New datagram instance with updated context key. + +# Note: +# How the context is interpreted depends on the datagram implementation. +# Semantic processing may be rebuilt for the new context. + +# Example: +# >>> financial_datagram = datagram.with_context_key("financial_v1") +# """ +# ... + +# # 8. Utility Operations +# def copy(self) -> Self: +# """ +# Create a shallow copy of the datagram. + +# Returns a new datagram instance with the same data and cached values. +# This is more efficient than reconstructing from scratch when you need +# an identical datagram instance. + +# Returns: +# New datagram instance with copied data and caches. + +# Example: +# >>> copied = datagram.copy() +# >>> copied is datagram # False - different instance +# False +# """ +# ... + +# # 9. String Representations +# def __str__(self) -> str: +# """ +# Return user-friendly string representation. + +# Shows the datagram as a simple dictionary for user-facing output, +# messages, and logging. Only includes data columns for clean output. + +# Returns: +# Dictionary-style string representation of data columns only. +# """ +# ... + +# def __repr__(self) -> str: +# """ +# Return detailed string representation for debugging. + +# Shows the datagram type and comprehensive information for debugging. + +# Returns: +# Detailed representation with type and metadata information. +# """ +# ... + + +# @runtime_checkable +# class Tag(Datagram, Protocol): +# """ +# Metadata associated with each data item in a stream. + +# Tags carry contextual information about data packets as they flow through +# the computational graph. 
They are immutable and provide metadata that +# helps with: +# - Data lineage tracking +# - Grouping and aggregation operations +# - Temporal information (timestamps) +# - Source identification +# - Processing context + +# Common examples include: +# - Timestamps indicating when data was created/processed +# - Source identifiers showing data origin +# - Processing metadata like batch IDs or session information +# - Grouping keys for aggregation operations +# - Quality indicators or confidence scores +# """ + +# def keys( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_system_tags: bool = False, +# ) -> tuple[str, ...]: +# """ +# Return tuple of column names. + +# Provides access to column names with filtering options for different +# column types. Default returns only data column names. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column inclusion. +# - False: Return only data column names (default) +# - True: Include all meta column names +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context column. +# include_source: Whether to include source info fields. + + +# Returns: +# Tuple of column names based on inclusion criteria. + +# Example: +# >>> datagram.keys() # Data columns only +# ('user_id', 'name', 'email') +# >>> datagram.keys(include_meta_columns=True) +# ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') +# >>> datagram.keys(include_meta_columns=["pipeline"]) +# ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') +# >>> datagram.keys(include_context=True) +# ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') +# """ +# ... + +# def types( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_system_tags: bool = False, +# ) -> TypeSpec: +# """ +# Return type specification mapping field names to Python types. + +# The TypeSpec enables type checking and validation throughout the system. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column type inclusion. +# - False: Exclude meta column types (default) +# - True: Include all meta column types +# - Collection[str]: Include meta column types matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context type. +# include_source: Whether to include source info fields. + +# Returns: +# TypeSpec mapping field names to their Python types. + +# Example: +# >>> datagram.types() +# {'user_id': , 'name': } +# """ +# ... + +# def arrow_schema( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_system_tags: bool = False, +# ) -> "pa.Schema": +# """ +# Return PyArrow schema representation. + +# The schema provides structured field and type information for efficient +# serialization and deserialization with PyArrow. + +# Args: +# include_all_info: If True, include all available information. 
This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column schema inclusion. +# - False: Exclude meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context column. +# include_source: Whether to include source info fields. + + +# Returns: +# PyArrow Schema describing the datagram structure. + +# Example: +# >>> schema = datagram.arrow_schema() +# >>> schema.names +# ['user_id', 'name'] +# """ +# ... + +# def as_dict( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_system_tags: bool = False, +# ) -> dict[str, DataValue]: +# """ +# Convert datagram to dictionary format. + +# Provides a simple key-value representation useful for debugging, +# serialization, and interop with dict-based APIs. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column inclusion. +# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include the context key. +# include_source: Whether to include source info fields. + + +# Returns: +# Dictionary with requested columns as key-value pairs. + +# Example: +# >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} +# >>> full_data = datagram.as_dict( +# ... include_meta_columns=True, +# ... include_context=True +# ... ) +# """ +# ... + +# def as_table( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_system_tags: bool = False, +# ) -> "pa.Table": +# """ +# Convert datagram to PyArrow Table format. + +# Provides a standardized columnar representation suitable for analysis, +# processing, and interoperability with Arrow-based tools. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column inclusion. +# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include the context column. +# include_source: Whether to include source info columns in the schema. + +# Returns: +# PyArrow Table with requested columns. + +# Example: +# >>> table = datagram.as_table() # Data columns only +# >>> full_table = datagram.as_table( +# ... include_meta_columns=True, +# ... include_context=True +# ... ) +# >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" +# """ +# ... + +# # TODO: add this back +# # def as_arrow_compatible_dict( +# # self, +# # include_all_info: bool = False, +# # include_meta_columns: bool | Collection[str] = False, +# # include_context: bool = False, +# # include_source: bool = False, +# # ) -> dict[str, Any]: +# # """Extended version with source info support.""" +# # ... 
+ +# def as_datagram( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_system_tags: bool = False, +# ) -> Datagram: +# """ +# Convert the packet to a Datagram. + +# Args: +# include_meta_columns: Controls meta column inclusion. +# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + +# Returns: +# Datagram: Datagram representation of packet data +# """ +# ... + +# def system_tags(self) -> dict[str, DataValue]: +# """ +# Return metadata about the packet's source/origin. + +# Provides debugging and lineage information about where the packet +# originated. May include information like: +# - File paths for file-based sources +# - Database connection strings +# - API endpoints +# - Processing pipeline information + +# Returns: +# dict[str, str | None]: Source information for each data column as key-value pairs. +# """ +# ... + + +# @runtime_checkable +# class Packet(Datagram, Protocol): +# """ +# The actual data payload in a stream. + +# Packets represent the core data being processed through the computational +# graph. Unlike Tags (which are metadata), Packets contain the actual +# information that computations operate on. + +# Packets extend Datagram with additional capabilities for: +# - Source tracking and lineage +# - Content-based hashing for caching +# - Metadata inclusion for debugging + +# The distinction between Tag and Packet is crucial for understanding +# data flow: Tags provide context, Packets provide content. +# """ + +# def keys( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_source: bool = False, +# ) -> tuple[str, ...]: +# """ +# Return tuple of column names. + +# Provides access to column names with filtering options for different +# column types. Default returns only data column names. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column inclusion. +# - False: Return only data column names (default) +# - True: Include all meta column names +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context column. +# include_source: Whether to include source info fields. + + +# Returns: +# Tuple of column names based on inclusion criteria. + +# Example: +# >>> datagram.keys() # Data columns only +# ('user_id', 'name', 'email') +# >>> datagram.keys(include_meta_columns=True) +# ('user_id', 'name', 'email', f'{orcapod.META_PREFIX}processed_at', f'{orcapod.META_PREFIX}pipeline_version') +# >>> datagram.keys(include_meta_columns=["pipeline"]) +# ('user_id', 'name', 'email',f'{orcapod.META_PREFIX}pipeline_version') +# >>> datagram.keys(include_context=True) +# ('user_id', 'name', 'email', f'{orcapod.CONTEXT_KEY}') +# """ +# ... + +# def types( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_source: bool = False, +# ) -> TypeSpec: +# """ +# Return type specification mapping field names to Python types. + +# The TypeSpec enables type checking and validation throughout the system. 
+ +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column type inclusion. +# - False: Exclude meta column types (default) +# - True: Include all meta column types +# - Collection[str]: Include meta column types matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context type. +# include_source: Whether to include source info fields. + +# Returns: +# TypeSpec mapping field names to their Python types. + +# Example: +# >>> datagram.types() +# {'user_id': , 'name': } +# """ +# ... + +# def arrow_schema( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_source: bool = False, +# ) -> "pa.Schema": +# """ +# Return PyArrow schema representation. + +# The schema provides structured field and type information for efficient +# serialization and deserialization with PyArrow. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column schema inclusion. +# - False: Exclude meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include context column. +# include_source: Whether to include source info fields. + + +# Returns: +# PyArrow Schema describing the datagram structure. + +# Example: +# >>> schema = datagram.arrow_schema() +# >>> schema.names +# ['user_id', 'name'] +# """ +# ... + +# def as_dict( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_source: bool = False, +# ) -> dict[str, DataValue]: +# """ +# Convert datagram to dictionary format. + +# Provides a simple key-value representation useful for debugging, +# serialization, and interop with dict-based APIs. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column inclusion. +# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include the context key. +# include_source: Whether to include source info fields. + + +# Returns: +# Dictionary with requested columns as key-value pairs. + +# Example: +# >>> data = datagram.as_dict() # {'user_id': 123, 'name': 'Alice'} +# >>> full_data = datagram.as_dict( +# ... include_meta_columns=True, +# ... include_context=True +# ... ) +# """ +# ... + +# def as_table( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_context: bool = False, +# include_source: bool = False, +# ) -> "pa.Table": +# """ +# Convert datagram to PyArrow Table format. + +# Provides a standardized columnar representation suitable for analysis, +# processing, and interoperability with Arrow-based tools. + +# Args: +# include_all_info: If True, include all available information. This option supersedes all other inclusion options. +# include_meta_columns: Controls meta column inclusion. 
+# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. +# include_context: Whether to include the context column. +# include_source: Whether to include source info columns in the schema. + +# Returns: +# PyArrow Table with requested columns. + +# Example: +# >>> table = datagram.as_table() # Data columns only +# >>> full_table = datagram.as_table( +# ... include_meta_columns=True, +# ... include_context=True +# ... ) +# >>> filtered = datagram.as_table(include_meta_columns=["pipeline"]) # same as passing f"{orcapod.META_PREFIX}pipeline" +# """ +# ... + +# # TODO: add this back +# # def as_arrow_compatible_dict( +# # self, +# # include_all_info: bool = False, +# # include_meta_columns: bool | Collection[str] = False, +# # include_context: bool = False, +# # include_source: bool = False, +# # ) -> dict[str, Any]: +# # """Extended version with source info support.""" +# # ... + +# def as_datagram( +# self, +# include_all_info: bool = False, +# include_meta_columns: bool | Collection[str] = False, +# include_source: bool = False, +# ) -> Datagram: +# """ +# Convert the packet to a Datagram. + +# Args: +# include_meta_columns: Controls meta column inclusion. +# - False: Exclude all meta columns (default) +# - True: Include all meta columns +# - Collection[str]: Include meta columns matching these prefixes. If absent, +# {orcapod.META_PREFIX} ('__') prefix is prepended to each key. + +# Returns: +# Datagram: Datagram representation of packet data +# """ +# ... + +# def source_info(self) -> dict[str, str | None]: +# """ +# Return metadata about the packet's source/origin. + +# Provides debugging and lineage information about where the packet +# originated. May include information like: +# - File paths for file-based sources +# - Database connection strings +# - API endpoints +# - Processing pipeline information + +# Returns: +# dict[str, str | None]: Source information for each data column as key-value pairs. +# """ +# ... + +# def with_source_info( +# self, +# **source_info: str | None, +# ) -> Self: +# """ +# Create new packet with updated source information. + +# Adds or updates source metadata for the packet. This is useful for +# tracking data provenance and lineage through the computational graph. + +# Args: +# **source_info: Source metadata as keyword arguments. + +# Returns: +# New packet instance with updated source information. + +# Example: +# >>> updated_packet = packet.with_source_info( +# ... file_path="/new/path/to/file.txt", +# ... source_id="source_123" +# ... ) +# """ +# ... + + +# @runtime_checkable +# class PodFunction(Protocol): +# """ +# A function suitable for use in a FunctionPod. + +# PodFunctions define the computational logic that operates on individual +# packets within a Pod. They represent pure functions that transform +# data values without side effects. + +# These functions are designed to be: +# - Stateless: No dependency on external state +# - Deterministic: Same inputs always produce same outputs +# - Serializable: Can be cached and distributed +# - Type-safe: Clear input/output contracts + +# PodFunctions accept named arguments corresponding to packet fields +# and return transformed data values. +# """ + +# def __call__(self, **kwargs: DataValue) -> None | DataValue: +# """ +# Execute the pod function with the given arguments. 
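As a concrete illustration of the PodFunction contract described above, here is a hypothetical function that would satisfy it: stateless, deterministic, taking packet fields as named arguments and returning either a transformed value or None to filter the packet out. The function and its field names are invented for illustration, assuming a plain float counts as a DataValue.

def normalize_score(score: float, max_score: float = 100.0) -> float | None:
    # Returning None filters this packet out of the output stream.
    if score < 0 or max_score <= 0:
        return None
    # Deterministic, side-effect-free transformation of the input values.
    return score / max_score

# A FunctionPod would call this once per packet, e.g.
# normalize_score(score=87.5, max_score=100.0) -> 0.875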
+ +# The function receives packet data as named arguments and returns +# either transformed data or None (for filtering operations). + +# Args: +# **kwargs: Named arguments mapping packet fields to data values + +# Returns: +# None: Filter out this packet (don't include in output) +# DataValue: Single transformed value + +# Raises: +# TypeError: If required arguments are missing +# ValueError: If argument values are invalid +# """ +# ... + + +# @runtime_checkable +# class Labelable(Protocol): +# """ +# Protocol for objects that can have a human-readable label. + +# Labels provide meaningful names for objects in the computational graph, +# making debugging, visualization, and monitoring much easier. They serve +# as human-friendly identifiers that complement the technical identifiers +# used internally. + +# Labels are optional but highly recommended for: +# - Debugging complex computational graphs +# - Visualization and monitoring tools +# - Error messages and logging +# - User interfaces and dashboards +# """ + +# @property +# def label(self) -> str | None: +# """ +# Return the human-readable label for this object. + +# Labels should be descriptive and help users understand the purpose +# or role of the object in the computational graph. + +# Returns: +# str: Human-readable label for this object +# None: No label is set (will use default naming) +# """ +# ... + + +# @runtime_checkable +# class Stream(ContentIdentifiable, Labelable, Protocol): +# """ +# Base protocol for all streams in Orcapod. + +# Streams represent sequences of (Tag, Packet) pairs flowing through the +# computational graph. They are the fundamental data structure connecting +# kernels and carrying both data and metadata. + +# Streams can be either: +# - Static: Immutable snapshots created at a specific point in time +# - Live: Dynamic streams that stay current with upstream dependencies + +# All streams provide: +# - Iteration over (tag, packet) pairs +# - Type information and schema access +# - Lineage information (source kernel and upstream streams) +# - Basic caching and freshness tracking +# - Conversion to common formats (tables, dictionaries) +# """ + +# @property +# def substream_identities(self) -> tuple[str, ...]: +# """ +# Unique identifiers for sub-streams within this stream. + +# This property provides a way to identify and differentiate +# sub-streams that may be part of a larger stream. It is useful +# for tracking and managing complex data flows. + +# Returns: +# tuple[str, ...]: Unique identifiers for each sub-stream +# """ +# ... + +# @property +# def execution_engine(self) -> ExecutionEngine | None: +# """ +# The execution engine attached to this stream. By default, the stream +# will use this execution engine whenever it needs to perform computation. +# None means the stream is not attached to any execution engine and will default +# to running natively. +# """ + +# @execution_engine.setter +# def execution_engine(self, engine: ExecutionEngine | None) -> None: +# """ +# Set the execution engine for this stream. + +# This allows the stream to use a specific execution engine for +# computation, enabling optimized execution strategies and resource +# management. + +# Args: +# engine: The execution engine to attach to this stream +# """ +# ... + +# def get_substream(self, substream_id: str) -> "Stream": +# """ +# Retrieve a specific sub-stream by its identifier. + +# This method allows access to individual sub-streams within the +# main stream, enabling focused operations on specific data segments. 
+ +# Args: +# substream_id: Unique identifier for the desired sub-stream. + +# Returns: +# Stream: The requested sub-stream if it exists +# """ +# ... + +# @property +# def source(self) -> "Kernel | None": +# """ +# The kernel that produced this stream. + +# This provides lineage information for tracking data flow through +# the computational graph. Root streams (like file sources) may +# have no source kernel. + +# Returns: +# Kernel: The source kernel that created this stream +# None: This is a root stream with no source kernel +# """ +# ... + +# @property +# def upstreams(self) -> tuple["Stream", ...]: +# """ +# Input streams used to produce this stream. + +# These are the streams that were provided as input to the source +# kernel when this stream was created. Used for dependency tracking +# and cache invalidation. + +# Returns: +# tuple[Stream, ...]: Upstream dependency streams (empty for sources) +# """ +# ... + +# def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: +# """ +# Available keys/fields in the stream content. + +# Returns the field names present in both tags and packets. +# This provides schema information without requiring type details, +# useful for: +# - Schema inspection and exploration +# - Query planning and optimization +# - Field validation and mapping + +# Returns: +# tuple[tuple[str, ...], tuple[str, ...]]: (tag_keys, packet_keys) +# """ +# ... + +# def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: +# """ +# Type specifications for the stream content. + +# Returns the type schema for both tags and packets in this stream. +# This information is used for: +# - Type checking and validation +# - Schema inference and planning +# - Compatibility checking between kernels + +# Returns: +# tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) +# """ +# ... + +# @property +# def last_modified(self) -> datetime | None: +# """ +# When the stream's content was last modified. + +# This property is crucial for caching decisions and dependency tracking: +# - datetime: Content was last modified at this time (cacheable) +# - None: Content is never stable, always recompute (some dynamic streams) + +# Both static and live streams typically return datetime values, but +# live streams update this timestamp whenever their content changes. + +# Returns: +# datetime: Timestamp of last modification for most streams +# None: Stream content is never stable (some special dynamic streams) +# """ +# ... + +# @property +# def is_current(self) -> bool: +# """ +# Whether the stream is up-to-date with its dependencies. + +# A stream is current if its content reflects the latest state of its +# source kernel and upstream streams. This is used for cache validation +# and determining when refresh is needed. + +# For live streams, this should always return True since they stay +# current automatically. For static streams, this indicates whether +# the cached content is still valid. + +# Returns: +# bool: True if stream is up-to-date, False if refresh needed +# """ +# ... + +# def __iter__(self) -> Iterator[tuple[Tag, Packet]]: +# """ +# Iterate over (tag, packet) pairs in the stream. + +# This is the primary way to access stream data. The behavior depends +# on the stream type: +# - Static streams: Return cached/precomputed data +# - Live streams: May trigger computation and always reflect current state + +# Yields: +# tuple[Tag, Packet]: Sequential (tag, packet) pairs +# """ +# ... 
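A minimal sketch of a static stream that satisfies the iteration and keys() parts of the protocol above. ListStream and its fields are illustrative only and deliberately gloss over lineage, typing, last_modified tracking, and caching.

from collections.abc import Iterator


class ListStream:
    """Static stream backed by an in-memory list of (tag, packet) dicts."""

    def __init__(self, pairs: list[tuple[dict, dict]]) -> None:
        self._pairs = list(pairs)  # immutable snapshot taken at construction time

    def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]:
        if not self._pairs:
            return (), ()
        tag, packet = self._pairs[0]
        return tuple(tag), tuple(packet)

    def __iter__(self) -> Iterator[tuple[dict, dict]]:
        # Static stream: every iteration yields the same cached pairs.
        yield from self._pairs


stream = ListStream([({"run": 1}, {"path": "a.txt"}), ({"run": 2}, {"path": "b.txt"})])
print(stream.keys())  # (('run',), ('path',))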
+ +# def iter_packets( +# self, execution_engine: ExecutionEngine | None = None +# ) -> Iterator[tuple[Tag, Packet]]: +# """ +# Alias for __iter__ for explicit packet iteration. + +# Provides a more explicit method name when the intent is to iterate +# over packets specifically, improving code readability. + +# This method must return an immutable iterator -- that is, the returned iterator +# should not change and must consistently return identical tag,packet pairs across +# multiple iterations of the iterator. + +# Note that this is NOT to mean that multiple invocation of `iter_packets` must always +# return an identical iterator. The iterator returned by `iter_packets` may change +# between invocations, but the iterator itself must not change. Consequently, it should be understood +# that the returned iterators may be a burden on memory if the stream is large or infinite. + +# Yields: +# tuple[Tag, Packet]: Sequential (tag, packet) pairs +# """ +# ... + +# def run(self, execution_engine: ExecutionEngine | None = None) -> None: +# """ +# Execute the stream using the provided execution engine. + +# This method triggers computation of the stream content based on its +# source kernel and upstream streams. It returns a new stream instance +# containing the computed (tag, packet) pairs. + +# Args: +# execution_engine: The execution engine to use for computation + +# """ +# ... + +# async def run_async(self, execution_engine: ExecutionEngine | None = None) -> None: +# """ +# Asynchronously execute the stream using the provided execution engine. + +# This method triggers computation of the stream content based on its +# source kernel and upstream streams. It returns a new stream instance +# containing the computed (tag, packet) pairs. + +# Args: +# execution_engine: The execution engine to use for computation + +# """ +# ... + +# def as_df( +# self, +# include_data_context: bool = False, +# include_source: bool = False, +# include_system_tags: bool = False, +# include_content_hash: bool | str = False, +# execution_engine: ExecutionEngine | None = None, +# ) -> "pl.DataFrame | None": +# """ +# Convert the entire stream to a Polars DataFrame. +# """ +# ... + +# def as_table( +# self, +# include_data_context: bool = False, +# include_source: bool = False, +# include_system_tags: bool = False, +# include_content_hash: bool | str = False, +# execution_engine: ExecutionEngine | None = None, +# ) -> "pa.Table": +# """ +# Convert the entire stream to a PyArrow Table. + +# Materializes all (tag, packet) pairs into a single table for +# analysis and processing. This operation may be expensive for +# large streams or live streams that need computation. + +# If include_content_hash is True, an additional column called "_content_hash" +# containing the content hash of each packet is included. If include_content_hash +# is a string, it is used as the name of the content hash column. + +# Returns: +# pa.Table: Complete stream data as a PyArrow Table +# """ +# ... + +# def flow( +# self, execution_engine: ExecutionEngine | None = None +# ) -> Collection[tuple[Tag, Packet]]: +# """ +# Return the entire stream as a collection of (tag, packet) pairs. + +# This method materializes the stream content into a list or similar +# collection type. It is useful for small streams or when you need +# to process all data at once. + +# Args: +# execution_engine: Optional execution engine to use for computation. +# If None, the stream will use its default execution engine. +# """ +# ... 
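One way to honor the immutable-iterator contract spelled out for iter_packets is to materialize the pairs once and hand out iterators over that snapshot. This is only a sketch of the idea with a simplified signature (no execution engine argument), and, as the docstring warns, it trades memory for stability on large or infinite streams.

from collections.abc import Iterator


class SnapshotIterMixin:
    """Illustrative mixin: freeze (tag, packet) pairs the first time they are requested."""

    def _compute_pairs(self) -> list[tuple[dict, dict]]:
        raise NotImplementedError

    def iter_packets(self) -> Iterator[tuple[dict, dict]]:
        # Materialize once; every iterator returned afterwards sees identical pairs.
        if not hasattr(self, "_snapshot"):
            self._snapshot = list(self._compute_pairs())
        return iter(self._snapshot)


class Demo(SnapshotIterMixin):
    def _compute_pairs(self):
        return [({"i": n}, {"value": n * n}) for n in range(3)]


d = Demo()
assert list(d.iter_packets()) == list(d.iter_packets())  # identical across calls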
+ +# def join(self, other_stream: "Stream") -> "Stream": +# """ +# Join this stream with another stream. + +# Combines two streams into a single stream by merging their content. +# The resulting stream contains all (tag, packet) pairs from both +# streams, preserving their order. + +# Args: +# other_stream: The other stream to join with this one. + +# Returns: +# Self: New stream containing combined content from both streams. +# """ +# ... + +# def semi_join(self, other_stream: "Stream") -> "Stream": +# """ +# Perform a semi-join with another stream. + +# This operation filters this stream to only include packets that have +# corresponding tags in the other stream. The resulting stream contains +# all (tag, packet) pairs from this stream that match tags in the other. + +# Args: +# other_stream: The other stream to semi-join with this one. + +# Returns: +# Self: New stream containing filtered content based on the semi-join. +# """ +# ... + +# def map_tags( +# self, name_map: Mapping[str, str], drop_unmapped: bool = True +# ) -> "Stream": +# """ +# Map tag names in this stream to new names based on the provided mapping. +# """ +# ... + +# def map_packets( +# self, name_map: Mapping[str, str], drop_unmapped: bool = True +# ) -> "Stream": +# """ +# Map packet names in this stream to new names based on the provided mapping. +# """ +# ... + + +# @runtime_checkable +# class LiveStream(Stream, Protocol): +# """ +# A stream that automatically stays up-to-date with its upstream dependencies. + +# LiveStream extends the base Stream protocol with capabilities for "up-to-date" +# data flow and reactive computation. Unlike static streams which represent +# snapshots, LiveStreams provide the guarantee that their content always +# reflects the current state of their dependencies. + +# Key characteristics: +# - Automatically refresh the stream if changes in the upstreams are detected +# - Track last_modified timestamp when content changes +# - Support manual refresh triggering and invalidation +# - By design, LiveStream would return True for is_current except when auto-update fails. + +# LiveStreams are always returned by Kernel.__call__() methods, ensuring +# that normal kernel usage produces live, up-to-date results. + +# Caching behavior: +# - last_modified updates whenever content changes +# - Can be cached based on dependency timestamps +# - Invalidation happens automatically when upstreams change + +# Use cases: +# - Real-time data processing pipelines +# - Reactive user interfaces +# - Monitoring and alerting systems +# - Dynamic dashboard updates +# - Any scenario requiring current data +# """ + +# def refresh(self, force: bool = False) -> bool: +# """ +# Manually trigger a refresh of this stream's content. + +# Forces the stream to check its upstream dependencies and update +# its content if necessary. This is useful when: +# - You want to ensure the latest data before a critical operation +# - You need to force computation at a specific time +# - You're debugging data flow issues +# - You want to pre-compute results for performance +# Args: +# force: If True, always refresh even if the stream is current. +# If False, only refresh if the stream is not current. + +# Returns: +# bool: True if the stream was refreshed, False if it was already current. +# Note: LiveStream refreshes automatically on access, so this +# method may be a no-op for some implementations. However, it's +# always safe to call if you need to control when the cache is refreshed. +# """ +# ... 
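A sketch of how a LiveStream implementation might decide whether refresh(force=False) needs to do any work, assuming it can compare its own last_modified against those of its upstreams. None of the attribute names or the recomputation placeholder below are prescribed by the protocol.

from datetime import datetime, timezone


class PollingLiveStream:
    """Illustrative live stream that recomputes when any upstream is newer than it."""

    def __init__(self, upstreams):
        self.upstreams = tuple(upstreams)
        self.last_modified: datetime | None = None  # never computed yet
        self._content = None

    def refresh(self, force: bool = False) -> bool:
        upstream_times = [
            s.last_modified for s in self.upstreams if s.last_modified is not None
        ]
        stale = (
            force
            or self.last_modified is None
            or any(t > self.last_modified for t in upstream_times)
        )
        if not stale:
            return False  # already current; nothing to recompute
        self._content = tuple(self.upstreams)  # stand-in for real recomputation
        self.last_modified = datetime.now(timezone.utc)
        return True

    def invalidate(self) -> None:
        # Forget the timestamp so the next access triggers a refresh.
        self.last_modified = None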
+ +# def invalidate(self) -> None: +# """ +# Mark this stream as invalid, forcing a refresh on next access. + +# This method is typically called when: +# - Upstream dependencies have changed +# - The source kernel has been modified +# - External data sources have been updated +# - Manual cache invalidation is needed + +# The stream will automatically refresh its content the next time +# it's accessed (via iteration, as_table(), etc.). + +# This is more efficient than immediate refresh when you know the +# data will be accessed later. +# """ +# ... + + +# @runtime_checkable +# class Kernel(ContentIdentifiable, Labelable, Protocol): +# """ +# The fundamental unit of computation in Orcapod. + +# Kernels are the building blocks of computational graphs, transforming +# zero, one, or more input streams into a single output stream. They +# encapsulate computation logic while providing consistent interfaces +# for validation, type checking, and execution. + +# Key design principles: +# - Immutable: Kernels don't change after creation +# - Deterministic: Same inputs always produce same outputs +# - Composable: Kernels can be chained and combined +# - Trackable: All invocations are recorded for lineage +# - Type-safe: Strong typing and validation throughout + +# Execution modes: +# - __call__(): Full-featured execution with tracking, returns LiveStream +# - forward(): Pure computation without side effects, returns Stream + +# The distinction between these modes enables both production use (with +# full tracking) and testing/debugging (without side effects). +# """ + +# @property +# def kernel_id(self) -> tuple[str, ...]: +# """ +# Return a unique identifier for this Pod. + +# The pod_id is used for caching and tracking purposes. It should +# uniquely identify the Pod's computational logic, parameters, and +# any relevant metadata that affects its behavior. + +# Returns: +# tuple[str, ...]: Unique identifier for this Pod +# """ +# ... + +# @property +# def data_context_key(self) -> str: +# """ +# Return the context key for this kernel's data processing. + +# The context key is used to interpret how data columns should be +# processed and converted. It provides semantic meaning to the data +# being processed by this kernel. + +# Returns: +# str: Context key for this kernel's data processing +# """ +# ... + +# @property +# def last_modified(self) -> datetime | None: +# """ +# When the kernel was last modified. For most kernels, this is the timestamp +# of the kernel creation. +# """ +# ... + +# def __call__( +# self, *streams: Stream, label: str | None = None, **kwargs +# ) -> LiveStream: +# """ +# Main interface for kernel invocation with full tracking and guarantees. + +# This is the primary way to invoke kernels in production. It provides +# a complete execution pipeline: +# 1. Validates input streams against kernel requirements +# 2. Registers the invocation with the computational graph +# 3. Calls forward() to perform the actual computation +# 4. Ensures the result is a LiveStream that stays current + +# The returned LiveStream automatically stays up-to-date with its +# upstream dependencies, making it suitable for real-time processing +# and reactive applications. 
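The four-step pipeline described for __call__ can be pictured as a small template method. The skeleton below is a sketch under assumptions: the tracker hook is optional and injected, and the final wrapping into a LiveStream is only indicated by a comment rather than implemented.

class KernelBase:
    """Illustrative skeleton of the __call__ / forward split (not orcapod's actual base class)."""

    def __init__(self, tracker_manager=None):
        self.tracker_manager = tracker_manager  # optional tracking hook

    def __call__(self, *streams, label=None, **kwargs):
        # 1. Fail fast on incompatible inputs.
        self.validate_inputs(*streams)
        # 2. Record the invocation for lineage tracking, if a tracker is attached.
        if self.tracker_manager is not None:
            self.tracker_manager.record_kernel_invocation(self, streams, label=label)
        # 3. Delegate the pure computation to forward().
        result = self.forward(*streams)
        # 4. A full implementation would wrap `result` so it stays current with its
        #    upstreams before returning it as a LiveStream.
        return result

    def validate_inputs(self, *streams) -> None:
        return None  # subclasses add real checks

    def forward(self, *streams):
        raise NotImplementedError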
+ +# Args: +# *streams: Input streams to process (can be empty for source kernels) +# label: Optional label for this invocation (overrides kernel.label) +# **kwargs: Additional arguments for kernel configuration + +# Returns: +# LiveStream: Live stream that stays up-to-date with upstreams + +# Raises: +# ValidationError: If input streams are invalid for this kernel +# TypeMismatchError: If stream types are incompatible +# ValueError: If required arguments are missing +# """ +# ... + +# def forward(self, *streams: Stream) -> Stream: +# """ +# Perform the actual computation without side effects. + +# This method contains the core computation logic and should be +# overridden by subclasses. It performs pure computation without: +# - Registering with the computational graph +# - Performing validation (caller's responsibility) +# - Guaranteeing result type (may return static or live streams) + +# The returned stream must be accurate at the time of invocation but +# need not stay up-to-date with upstream changes. This makes forward() +# suitable for: +# - Testing and debugging +# - Batch processing where currency isn't required +# - Internal implementation details + +# Args: +# *streams: Input streams to process + +# Returns: +# Stream: Result of the computation (may be static or live) +# """ +# ... + +# def output_types( +# self, *streams: Stream, include_system_tags: bool = False +# ) -> tuple[TypeSpec, TypeSpec]: +# """ +# Determine output types without triggering computation. + +# This method performs type inference based on input stream types, +# enabling efficient type checking and stream property queries. +# It should be fast and not trigger any expensive computation. + +# Used for: +# - Pre-execution type validation +# - Query planning and optimization +# - Schema inference in complex pipelines +# - IDE support and developer tooling + +# Args: +# *streams: Input streams to analyze + +# Returns: +# tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) for output + +# Raises: +# ValidationError: If input types are incompatible +# TypeError: If stream types cannot be processed +# """ +# ... + +# def validate_inputs(self, *streams: Stream) -> None: +# """ +# Validate input streams, raising exceptions if incompatible. + +# This method is called automatically by __call__ before computation +# to provide fail-fast behavior. It should check: +# - Number of input streams +# - Stream types and schemas +# - Any kernel-specific requirements +# - Business logic constraints + +# The goal is to catch errors early, before expensive computation +# begins, and provide clear error messages for debugging. + +# Args: +# *streams: Input streams to validate + +# Raises: +# ValidationError: If streams are invalid for this kernel +# TypeError: If stream types are incompatible +# ValueError: If stream content violates business rules +# """ +# ... + +# def identity_structure(self, streams: Collection[Stream] | None = None) -> Any: +# """ +# Generate a unique identity structure for this kernel and/or kernel invocation. +# When invoked without streams, it should return a structure +# that uniquely identifies the kernel itself (e.g., class name, parameters). +# When invoked with streams, it should include the identity of the streams +# to distinguish different invocations of the same kernel. + +# This structure is used for: +# - Caching and memoization +# - Debugging and error reporting +# - Tracking kernel invocations in computational graphs + +# Args: +# streams: Optional input streams for this invocation. 
If None, identity_structure is +# based solely on the kernel. If streams are provided, they are included in the identity +# to differentiate between different invocations of the same kernel. + +# Returns: +# Any: Unique identity structure (e.g., tuple of class name and stream identities) +# """ +# ... + + +# @runtime_checkable +# class Pod(Kernel, Protocol): +# """ +# Specialized kernel for packet-level processing with advanced caching. + +# Pods represent a different computational model from regular kernels: +# - Process data one packet at a time (enabling fine-grained parallelism) +# - Support just-in-time evaluation (computation deferred until needed) +# - Provide stricter type contracts (clear input/output schemas) +# - Enable advanced caching strategies (packet-level caching) + +# The Pod abstraction is ideal for: +# - Expensive computations that benefit from caching +# - Operations that can be parallelized at the packet level +# - Transformations with strict type contracts +# - Processing that needs to be deferred until access time +# - Functions that operate on individual data items + +# Pods use a different execution model where computation is deferred +# until results are actually needed, enabling efficient resource usage +# and fine-grained caching. +# """ + +# @property +# def version(self) -> str: ... + +# def get_record_id(self, packet: Packet, execution_engine_hash: str) -> str: ... + +# @property +# def tiered_pod_id(self) -> dict[str, str]: +# """ +# Return a dictionary representation of the tiered pod's unique identifier. +# The key is supposed to be ordered from least to most specific, allowing +# for hierarchical identification of the pod. + +# This is primarily used for tiered memoization/caching strategies. + +# Returns: +# dict[str, str]: Dictionary representation of the pod's ID +# """ +# ... + +# def input_packet_types(self) -> TypeSpec: +# """ +# TypeSpec for input packets that this Pod can process. + +# Defines the exact schema that input packets must conform to. +# Pods are typically much stricter about input types than regular +# kernels, requiring precise type matching for their packet-level +# processing functions. + +# This specification is used for: +# - Runtime type validation +# - Compile-time type checking +# - Schema inference and documentation +# - Input validation and error reporting + +# Returns: +# TypeSpec: Dictionary mapping field names to required packet types +# """ +# ... + +# def output_packet_types(self) -> TypeSpec: +# """ +# TypeSpec for output packets that this Pod produces. + +# Defines the schema of packets that will be produced by this Pod. +# This is typically determined by the Pod's computational function +# and is used for: +# - Type checking downstream kernels +# - Schema inference in complex pipelines +# - Query planning and optimization +# - Documentation and developer tooling + +# Returns: +# TypeSpec: Dictionary mapping field names to output packet types +# """ +# ... + +# async def async_call( +# self, +# tag: Tag, +# packet: Packet, +# record_id: str | None = None, +# execution_engine: ExecutionEngine | None = None, +# ) -> tuple[Tag, Packet | None]: ... + +# def call( +# self, +# tag: Tag, +# packet: Packet, +# record_id: str | None = None, +# execution_engine: ExecutionEngine | None = None, +# ) -> tuple[Tag, Packet | None]: +# """ +# Process a single packet with its associated tag. + +# This is the core method that defines the Pod's computational behavior. 
+# It processes one (tag, packet) pair at a time, enabling: +# - Fine-grained caching at the packet level +# - Parallelization opportunities +# - Just-in-time evaluation +# - Filtering operations (by returning None) + +# The method signature supports: +# - Tag transformation (modify metadata) +# - Packet transformation (modify content) +# - Filtering (return None to exclude packet) +# - Pass-through (return inputs unchanged) + +# Args: +# tag: Metadata associated with the packet +# packet: The data payload to process + +# Returns: +# tuple[Tag, Packet | None]: +# - Tag: Output tag (may be modified from input) +# - Packet: Processed packet, or None to filter it out + +# Raises: +# TypeError: If packet doesn't match input_packet_types +# ValueError: If packet data is invalid for processing +# """ +# ... + + +# @runtime_checkable +# class CachedPod(Pod, Protocol): +# async def async_call( +# self, +# tag: Tag, +# packet: Packet, +# record_id: str | None = None, +# execution_engine: ExecutionEngine | None = None, +# skip_cache_lookup: bool = False, +# skip_cache_insert: bool = False, +# ) -> tuple[Tag, Packet | None]: ... + +# def call( +# self, +# tag: Tag, +# packet: Packet, +# record_id: str | None = None, +# execution_engine: ExecutionEngine | None = None, +# skip_cache_lookup: bool = False, +# skip_cache_insert: bool = False, +# ) -> tuple[Tag, Packet | None]: +# """ +# Process a single packet with its associated tag. + +# This is the core method that defines the Pod's computational behavior. +# It processes one (tag, packet) pair at a time, enabling: +# - Fine-grained caching at the packet level +# - Parallelization opportunities +# - Just-in-time evaluation +# - Filtering operations (by returning None) + +# The method signature supports: +# - Tag transformation (modify metadata) +# - Packet transformation (modify content) +# - Filtering (return None to exclude packet) +# - Pass-through (return inputs unchanged) + +# Args: +# tag: Metadata associated with the packet +# packet: The data payload to process + +# Returns: +# tuple[Tag, Packet | None]: +# - Tag: Output tag (may be modified from input) +# - Packet: Processed packet, or None to filter it out + +# Raises: +# TypeError: If packet doesn't match input_packet_types +# ValueError: If packet data is invalid for processing +# """ +# ... + +# def get_all_records( +# self, include_system_columns: bool = False +# ) -> "pa.Table | None": +# """ +# Retrieve all records processed by this Pod. + +# This method returns a table containing all packets processed by the Pod, +# including metadata and system columns if requested. It is useful for: +# - Debugging and analysis +# - Auditing and data lineage tracking +# - Performance monitoring + +# Args: +# include_system_columns: Whether to include system columns in the output + +# Returns: +# pa.Table | None: A table containing all processed records, or None if no records are available +# """ +# ... + + +# @runtime_checkable +# class Source(Kernel, Stream, Protocol): +# """ +# Entry point for data into the computational graph. + +# Sources are special objects that serve dual roles: +# - As Kernels: Can be invoked to produce streams +# - As Streams: Directly provide data without upstream dependencies + +# Sources represent the roots of computational graphs and typically +# interface with external data sources. They bridge the gap between +# the outside world and the Orcapod computational model. + +# Common source types: +# - File readers (CSV, JSON, Parquet, etc.) 
+# - Database connections and queries +# - API endpoints and web services +# - Generated data sources (synthetic data) +# - Manual data input and user interfaces +# - Message queues and event streams + +# Sources have unique properties: +# - No upstream dependencies (upstreams is empty) +# - Can be both invoked and iterated +# - Serve as the starting point for data lineage +# - May have their own refresh/update mechanisms +# """ + +# @property +# def tag_keys(self) -> tuple[str, ...]: +# """ +# Return the keys used for the tag in the pipeline run records. +# This is used to store the run-associated tag info. +# """ +# ... + +# @property +# def packet_keys(self) -> tuple[str, ...]: +# """ +# Return the keys used for the packet in the pipeline run records. +# This is used to store the run-associated packet info. +# """ +# ... + +# def get_all_records( +# self, include_system_columns: bool = False +# ) -> "pa.Table | None": +# """ +# Retrieve all records from the source. + +# Args: +# include_system_columns: Whether to include system columns in the output + +# Returns: +# pa.Table | None: A table containing all records, or None if no records are available +# """ +# ... + +# def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": ... + +# def as_df(self, sort_by_tags: bool = True) -> "pl.DataFrame | None": ... + +# def as_polars_df(self, sort_by_tags: bool = False) -> "pl.DataFrame | None": ... + +# def as_pandas_df(self, sort_by_tags: bool = False) -> "pd.DataFrame | None": ... + + +# @runtime_checkable +# class Tracker(Protocol): +# """ +# Records kernel invocations and stream creation for computational graph tracking. + +# Trackers are responsible for maintaining the computational graph by recording +# relationships between kernels, streams, and invocations. They enable: +# - Lineage tracking and data provenance +# - Caching and memoization strategies +# - Debugging and error analysis +# - Performance monitoring and optimization +# - Reproducibility and auditing + +# Multiple trackers can be active simultaneously, each serving different +# purposes (e.g., one for caching, another for debugging, another for +# monitoring). This allows for flexible and composable tracking strategies. + +# Trackers can be selectively activated/deactivated to control overhead +# and focus on specific aspects of the computational graph. +# """ + +# def set_active(self, active: bool = True) -> None: +# """ +# Set the active state of the tracker. + +# When active, the tracker will record all kernel invocations and +# stream creations. When inactive, no recording occurs, reducing +# overhead for performance-critical sections. + +# Args: +# active: True to activate recording, False to deactivate +# """ +# ... + +# def is_active(self) -> bool: +# """ +# Check if the tracker is currently recording invocations. + +# Returns: +# bool: True if tracker is active and recording, False otherwise +# """ +# ... + +# def record_kernel_invocation( +# self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None +# ) -> None: +# """ +# Record a kernel invocation in the computational graph. + +# This method is called whenever a kernel is invoked. The tracker +# should record: +# - The kernel and its properties +# - The input streams that were used as input +# - Timing and performance information +# - Any relevant metadata + +# Args: +# kernel: The kernel that was invoked +# upstreams: The input streams used for this invocation +# """ +# ... 
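A minimal in-memory tracker satisfying the recording interface sketched above. It simply appends invocation records to a list while active and is meant for illustration or testing, not as the project's tracker implementation.

class InMemoryTracker:
    """Toy tracker: collects (kind, object, upstreams, label) tuples while active."""

    def __init__(self) -> None:
        self._active = False
        self.records: list[tuple] = []

    def set_active(self, active: bool = True) -> None:
        self._active = active

    def is_active(self) -> bool:
        return self._active

    def record_kernel_invocation(self, kernel, upstreams, label=None) -> None:
        if self._active:
            self.records.append(("kernel", kernel, tuple(upstreams), label))

    def record_source_invocation(self, source, label=None) -> None:
        if self._active:
            self.records.append(("source", source, (), label))

    def record_pod_invocation(self, pod, upstreams, label=None) -> None:
        if self._active:
            self.records.append(("pod", pod, tuple(upstreams), label))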
+ +# def record_source_invocation( +# self, source: Source, label: str | None = None +# ) -> None: +# """ +# Record a source invocation in the computational graph. + +# This method is called whenever a source is invoked. The tracker +# should record: +# - The source and its properties +# - Timing and performance information +# - Any relevant metadata + +# Args: +# source: The source that was invoked +# """ +# ... + +# def record_pod_invocation( +# self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None +# ) -> None: +# """ +# Record a pod invocation in the computational graph. + +# This method is called whenever a pod is invoked. The tracker +# should record: +# - The pod and its properties +# - The upstream streams that were used as input +# - Timing and performance information +# - Any relevant metadata + +# Args: +# pod: The pod that was invoked +# upstreams: The input streams used for this invocation +# """ +# ... + + +# @runtime_checkable +# class TrackerManager(Protocol): +# """ +# Manages multiple trackers and coordinates their activity. + +# The TrackerManager provides a centralized way to: +# - Register and manage multiple trackers +# - Coordinate recording across all active trackers +# - Provide a single interface for graph recording +# - Enable dynamic tracker registration/deregistration + +# This design allows for: +# - Multiple concurrent tracking strategies +# - Pluggable tracking implementations +# - Easy testing and debugging (mock trackers) +# - Performance optimization (selective tracking) +# """ + +# def get_active_trackers(self) -> list[Tracker]: +# """ +# Get all currently active trackers. + +# Returns only trackers that are both registered and active, +# providing the list of trackers that will receive recording events. + +# Returns: +# list[Tracker]: List of trackers that are currently recording +# """ +# ... + +# def register_tracker(self, tracker: Tracker) -> None: +# """ +# Register a new tracker in the system. + +# The tracker will be included in future recording operations +# if it is active. Registration is separate from activation +# to allow for dynamic control of tracking overhead. + +# Args: +# tracker: The tracker to register +# """ +# ... + +# def deregister_tracker(self, tracker: Tracker) -> None: +# """ +# Remove a tracker from the system. + +# The tracker will no longer receive recording notifications +# even if it is still active. This is useful for: +# - Cleaning up temporary trackers +# - Removing failed or problematic trackers +# - Dynamic tracker management + +# Args: +# tracker: The tracker to remove +# """ +# ... + +# def record_kernel_invocation( +# self, kernel: Kernel, upstreams: tuple[Stream, ...], label: str | None = None +# ) -> None: +# """ +# Record a stream in all active trackers. + +# This method broadcasts the stream recording to all currently +# active and registered trackers. It provides a single point +# of entry for recording events, simplifying kernel implementations. + +# Args: +# stream: The stream to record in all active trackers +# """ +# ... + +# def record_source_invocation( +# self, source: Source, label: str | None = None +# ) -> None: +# """ +# Record a source invocation in the computational graph. + +# This method is called whenever a source is invoked. The tracker +# should record: +# - The source and its properties +# - Timing and performance information +# - Any relevant metadata + +# Args: +# source: The source that was invoked +# """ +# ... 
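A sketch of a manager that broadcasts recording calls to every registered, active tracker, including one possible reading of the no_tracking() context manager that appears further below. It could be paired with a tracker like the in-memory one sketched above; the class name and state handling are assumptions, not the shipped TrackerManager.

from contextlib import contextmanager


class SimpleTrackerManager:
    """Toy manager: fan out recording calls to all active trackers."""

    def __init__(self) -> None:
        self._trackers: list = []

    def register_tracker(self, tracker) -> None:
        if tracker not in self._trackers:
            self._trackers.append(tracker)

    def deregister_tracker(self, tracker) -> None:
        if tracker in self._trackers:
            self._trackers.remove(tracker)

    def get_active_trackers(self) -> list:
        return [t for t in self._trackers if t.is_active()]

    def record_kernel_invocation(self, kernel, upstreams, label=None) -> None:
        for tracker in self.get_active_trackers():
            tracker.record_kernel_invocation(kernel, upstreams, label=label)

    @contextmanager
    def no_tracking(self):
        # Temporarily silence all trackers, restoring their state afterwards.
        previous = [(t, t.is_active()) for t in self._trackers]
        for t in self._trackers:
            t.set_active(False)
        try:
            yield
        finally:
            for t, was_active in previous:
                t.set_active(was_active)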
+ +# def record_pod_invocation( +# self, pod: Pod, upstreams: tuple[Stream, ...], label: str | None = None +# ) -> None: +# """ +# Record a stream in all active trackers. + +# This method broadcasts the stream recording to all currently` +# active and registered trackers. It provides a single point +# of entry for recording events, simplifying kernel implementations. + +# Args: +# stream: The stream to record in all active trackers +# """ +# ... + +# def no_tracking(self) -> ContextManager[None]: ... diff --git a/src/orcapod/protocols/semantic_protocols.py b/src/orcapod/protocols/semantic_protocols.py index 1ce53bb..52a78cf 100644 --- a/src/orcapod/protocols/semantic_protocols.py +++ b/src/orcapod/protocols/semantic_protocols.py @@ -1,9 +1,56 @@ -from typing import Protocol, Any, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Protocol +from collections.abc import Callable + if TYPE_CHECKING: import pyarrow as pa +class TypeConverter(Protocol): + def python_type_to_arrow_type(self, python_type: type) -> "pa.DataType": ... + + def python_schema_to_arrow_schema( + self, python_schema: dict[str, type] + ) -> "pa.Schema": ... + + def arrow_type_to_python_type(self, arrow_type: "pa.DataType") -> type: ... + + def arrow_schema_to_python_schema( + self, arrow_schema: "pa.Schema" + ) -> dict[str, type]: ... + + def python_dicts_to_struct_dicts( + self, + python_dicts: list[dict[str, Any]], + python_schema: dict[str, type] | None = None, + ) -> list[dict[str, Any]]: ... + + def struct_dicts_to_python_dicts( + self, + struct_dict: list[dict[str, Any]], + arrow_schema: "pa.Schema", + ) -> list[dict[str, Any]]: ... + + def python_dicts_to_arrow_table( + self, + python_dicts: list[dict[str, Any]], + python_schema: dict[str, type] | None = None, + arrow_schema: "pa.Schema | None" = None, + ) -> "pa.Table": ... + + def arrow_table_to_python_dicts( + self, arrow_table: "pa.Table" + ) -> list[dict[str, Any]]: ... + + def get_python_to_arrow_converter( + self, python_type: type + ) -> "Callable[[Any], Any]": ... + + def get_arrow_to_python_converter( + self, arrow_type: "pa.DataType" + ) -> "Callable[[Any], Any]": ... 
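The TypeConverter protocol added in this hunk pairs the Python and Arrow type systems. Below is a minimal sketch of the two schema-level conversions for a handful of primitive types, using only standard pyarrow factory functions; the mapping table (including the large_string choice) is an assumption for illustration, not the converter the codebase registers.

import pyarrow as pa

_PY_TO_ARROW = {
    int: pa.int64(),
    float: pa.float64(),
    str: pa.large_string(),  # pa.string() would also work; this choice is assumed
    bool: pa.bool_(),
    bytes: pa.large_binary(),
}


def python_schema_to_arrow_schema(python_schema: dict[str, type]) -> pa.Schema:
    return pa.schema(
        [(name, _PY_TO_ARROW[py_type]) for name, py_type in python_schema.items()]
    )


def arrow_schema_to_python_schema(arrow_schema: pa.Schema) -> dict[str, type]:
    reverse = {arrow_type: py_type for py_type, arrow_type in _PY_TO_ARROW.items()}
    return {field.name: reverse[field.type] for field in arrow_schema}


schema = python_schema_to_arrow_schema({"user_id": int, "name": str})
print(schema)                                 # user_id: int64, name: large_string
print(arrow_schema_to_python_schema(schema))  # {'user_id': <class 'int'>, 'name': <class 'str'>}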
+ + # Core protocols class SemanticStructConverter(Protocol): """Protocol for converting between Python objects and semantic structs.""" diff --git a/tests/test_data/test_datagrams/test_arrow_datagram.py b/tests/test_data/test_datagrams/test_arrow_datagram.py index a3da84b..304bace 100644 --- a/tests/test_data/test_datagrams/test_arrow_datagram.py +++ b/tests/test_data/test_datagrams/test_arrow_datagram.py @@ -21,6 +21,7 @@ from orcapod.data.datagrams import ArrowDatagram from orcapod.data.system_constants import constants from orcapod.protocols.data_protocols import Datagram +from orcapod.protocols.hashing_protocols import ContentHash class TestArrowDatagramInitialization: @@ -324,8 +325,8 @@ def test_content_hash(self, datagram_with_meta): # Hash should be consistent assert hash1 == hash2 - assert isinstance(hash1, str) - assert len(hash1) > 0 + assert isinstance(hash1, ContentHash) + assert len(hash1.digest) > 0 def test_content_hash_same_data_different_meta_data(self): """Test that the content hash is the same for identical data with different meta data.""" diff --git a/tests/test_data/test_datagrams/test_dict_datagram.py b/tests/test_data/test_datagrams/test_dict_datagram.py index 0e1af69..3a1d40d 100644 --- a/tests/test_data/test_datagrams/test_dict_datagram.py +++ b/tests/test_data/test_datagrams/test_dict_datagram.py @@ -233,8 +233,8 @@ def test_arrow_schema_with_context(self, datagram_with_meta): def test_content_hash(self, datagram_with_meta): """Test content hash calculation.""" - hash1 = datagram_with_meta.content_hash() - hash2 = datagram_with_meta.content_hash() + hash1 = datagram_with_meta.content_hash().to_hex() + hash2 = datagram_with_meta.content_hash().to_hex() # Hash should be consistent assert hash1 == hash2 From 282bc3a00ed90b455b4e0220432bfbc2a6bdb168 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 22 Aug 2025 17:33:56 -0700 Subject: [PATCH 192/224] refactor: clean up protocols and add batch operator --- pyproject.toml | 8 + src/orcapod/data/base.py | 2 +- src/orcapod/data/operators/__init__.py | 3 +- src/orcapod/data/operators/base.py | 29 +- src/orcapod/data/operators/batch.py | 105 +++ src/orcapod/data/operators/join.py | 5 +- src/orcapod/data/pods.py | 38 +- src/orcapod/data/streams.py | 111 +++- src/orcapod/pipeline/graph.py | 19 +- src/orcapod/pipeline/nodes.py | 24 +- src/orcapod/protocols/data_protocols/base.py | 15 +- src/orcapod/protocols/data_protocols/pods.py | 16 +- src/orcapod/protocols/hashing_protocols.py | 8 +- src/orcapod/protocols/pipeline_protocols.py | 37 ++ src/orcapod/utils/arrow_utils.py | 15 +- uv.lock | 639 +++++++++++++++++++ 16 files changed, 992 insertions(+), 82 deletions(-) create mode 100644 src/orcapod/data/operators/batch.py create mode 100644 src/orcapod/protocols/pipeline_protocols.py diff --git a/pyproject.toml b/pyproject.toml index 95f8bbf..8bef0b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "polars>=1.31.0", "beartype>=0.21.0", "deltalake>=1.0.2", + "selection-pipeline", ] readme = "README.md" requires-python = ">=3.12.0" @@ -46,15 +47,19 @@ version_file = "src/orcapod/_version.py" dev = [ "adlfs>=2024.12.0", "boto3>=1.39.11", + "datajoint>=0.14.6", "deltalake>=1.0.2", "gcsfs>=2025.7.0", "httpie>=3.2.4", + "hydra-core>=1.3.2", + "imageio>=2.37.0", "ipykernel>=6.29.5", "ipywidgets>=8.1.7", "jsonschema>=4.25.0", "minio>=7.2.16", "mkdocs>=1.6.1", "pyarrow-stubs>=20.0.0.20250716", + "pygraphviz>=1.14", "pyiceberg>=0.9.1", "pytest>=8.3.5", "pytest-cov>=6.1.1", @@ -83,3 +88,6 @@ redis = { features = ["redis"], solve-group = "default" } [tool.pixi.dependencies] python = ">=3.12" + +[tool.uv.sources] +selection-pipeline = { git = "https://github.com/enigma-brain/selection_pipeline" } diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index e5d4f73..7b3388a 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -1,4 +1,4 @@ -from abc import ABC, abstractmethod +from abc import ABC from collections.abc import Collection from pathlib import Path from typing import Any, Mapping diff --git a/src/orcapod/data/operators/__init__.py b/src/orcapod/data/operators/__init__.py index 7ba693b..5890694 100644 --- a/src/orcapod/data/operators/__init__.py +++ b/src/orcapod/data/operators/__init__.py @@ -1,11 +1,12 @@ from .join import Join from .semijoin import SemiJoin from .mappers import MapTags, MapPackets - +from .batch import Batch __all__ = [ "Join", "SemiJoin", "MapTags", "MapPackets", + "Batch", ] diff --git a/src/orcapod/data/operators/base.py b/src/orcapod/data/operators/base.py index 4f4ae60..7cf5bf4 100644 --- a/src/orcapod/data/operators/base.py +++ b/src/orcapod/data/operators/base.py @@ -43,18 +43,23 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: It expects exactly one stream as input. """ stream = streams[0] - # visit each substream - output_substreams = [] - for substream_id in stream.substream_identities: - substream = stream.get_substream(substream_id) - output_substreams.append(self.op_forward(substream)) - - # at the moment only single output substream is supported - if len(output_substreams) != 1: - raise NotImplementedError( - "Support for multiple output substreams is not implemented yet." 
- ) - return output_substreams[0] + return self.op_forward(stream) + + # TODO: complete substream implementation + # Substream implementation pending + # stream = streams[0] + # # visit each substream + # output_substreams = [] + # for substream_id in stream.substream_identities: + # substream = stream.get_substream(substream_id) + # output_substreams.append(self.op_forward(substream)) + + # # at the moment only single output substream is supported + # if len(output_substreams) != 1: + # raise NotImplementedError( + # "Support for multiple output substreams is not implemented yet." + # ) + # return output_substreams[0] def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False diff --git a/src/orcapod/data/operators/batch.py b/src/orcapod/data/operators/batch.py new file mode 100644 index 0000000..603402a --- /dev/null +++ b/src/orcapod/data/operators/batch.py @@ -0,0 +1,105 @@ +from orcapod.data.operators.base import UnaryOperator +from collections.abc import Collection, Mapping +from orcapod.protocols import data_protocols as dp +from typing import Any, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from orcapod.data.streams import TableStream +if TYPE_CHECKING: + import pyarrow as pa + import polars as pl +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + +from orcapod.types import TypeSpec + +class Batch(UnaryOperator): + """ + Base class for all operators. + """ + + def __init__(self, batch_size:int = 0, drop_last_batch:bool=False, **kwargs): + if batch_size < 0: + raise ValueError("Batch size must be non-negative.") + + super().__init__(**kwargs) + + self.batch_size = batch_size + self.drop_last_batch = drop_last_batch + + def check_unary_input( + self, + streams: Collection[dp.Stream], + ) -> None: + """ + Check that the inputs to the unary operator are valid. + """ + if len(streams) != 1: + raise ValueError("UnaryOperator requires exactly one input stream.") + + def validate_inputs(self, *streams: dp.Stream) -> None: + self.check_unary_input(streams) + stream = streams[0] + return self.op_validate_inputs(stream) + + + def op_validate_inputs(self, stream: dp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + return None + + + def op_forward(self, stream: dp.Stream) -> dp.Stream: + """ + This method should be implemented by subclasses to define the specific behavior of the binary operator. + It takes two streams as input and returns a new stream as output. + """ + table = stream.as_table(include_source=True, include_system_tags=True) + + tag_columns, packet_columns = stream.keys() + + data_list = table.to_pylist() + + batched_data = [] + + next_batch = {} + + i = 0 + for i, entry in enumerate(data_list): + i += 1 + for c in entry: + next_batch.setdefault(c, []).append(entry[c]) + + if self.batch_size > 0 and i >= self.batch_size: + batched_data.append(next_batch) + next_batch = {} + i = 0 + + if i > 0 and not self.drop_last_batch: + batched_data.append(next_batch) + + batched_table = pa.Table.from_pylist(batched_data) + return TableStream(batched_table, tag_columns=tag_columns) + + + + def op_output_types( + self, stream: dp.Stream, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: + """ + This method should be implemented by subclasses to return the typespecs of the input and output streams. + It takes two streams as input and returns a tuple of typespecs. 
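The core of the Batch operator above is a row-to-batch regrouping: rows of the input table are collected into lists, one list per column per batch, which also explains why op_output_types reports list[...] versions of the original types. A simplified, standalone sketch of that regrouping, counting rows per batch and honoring drop_last_batch, is shown here; it deliberately ignores tags, system columns, and the TableStream wrapper.

import pyarrow as pa


def batch_rows(
    table: pa.Table, batch_size: int, drop_last_batch: bool = False
) -> pa.Table:
    rows = table.to_pylist()
    batched, current = [], {}
    count = 0  # rows accumulated in the current batch
    for row in rows:
        for column, value in row.items():
            current.setdefault(column, []).append(value)
        count += 1
        if batch_size > 0 and count >= batch_size:
            batched.append(current)
            current, count = {}, 0
    if count > 0 and not drop_last_batch:
        batched.append(current)
    # Each output row holds list-valued columns, matching the batched typespec.
    return pa.Table.from_pylist(batched)


table = pa.table({"x": [1, 2, 3, 4, 5], "label": ["a", "b", "c", "d", "e"]})
print(batch_rows(table, batch_size=2).to_pylist())
# [{'x': [1, 2], 'label': ['a', 'b']},
#  {'x': [3, 4], 'label': ['c', 'd']},
#  {'x': [5], 'label': ['e']}]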
+ """ + tag_types, packet_types = stream.types() + batched_tag_types = {k: list[v] for k, v in tag_types.items()} + batched_packet_types = {k: list[v] for k, v in packet_types.items()} + + return batched_tag_types, batched_packet_types + + + def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + return ( + (self.__class__.__name__, self.batch_size, self.drop_last_batch) + (stream,) if stream is not None else () + ) \ No newline at end of file diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py index 1ff9a36..ee4b652 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/data/operators/join.py @@ -49,10 +49,11 @@ def op_output_types( include_system_tags=include_system_tags ) tag_typespec = union_typespecs(tag_typespec, other_tag_typespec) - packet_typespec = intersection_typespecs( + intersection_packet_typespec = intersection_typespecs( packet_typespec, other_packet_typespec ) - if packet_typespec: + packet_typespec = union_typespecs(packet_typespec, other_packet_typespec) + if intersection_packet_typespec: raise InputValidationError( f"Packets should not have overlapping keys, but {packet_typespec.keys()} found in {stream} and {other_stream}." ) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 87c9bd9..b45b978 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -315,20 +315,20 @@ def __init__( self._function_info_extractor = function_info_extractor object_hasher = self.data_context.object_hasher # TODO: fix and replace with object_hasher protocol specific methods - self._function_signature_hash = object_hasher.hash_to_hex( - get_function_signature(self.function), prefix_hasher_id=True - ) - self._function_content_hash = object_hasher.hash_to_hex( - get_function_components(self.function), prefix_hasher_id=True - ) + self._function_signature_hash = object_hasher.hash_object( + get_function_signature(self.function) + ).to_string() + self._function_content_hash = object_hasher.hash_object( + get_function_components(self.function) + ).to_string() - self._output_packet_type_hash = object_hasher.hash_to_hex( - self.output_packet_types(), prefix_hasher_id=True - ) + self._output_packet_type_hash = object_hasher.hash_object( + self.output_packet_types() + ).to_string() - self._total_pod_id_hash = object_hasher.hash_to_hex( - self.tiered_pod_id, prefix_hasher_id=True - ) + self._total_pod_id_hash = object_hasher.hash_object( + self.tiered_pod_id + ).to_string() @property def tiered_pod_id(self) -> dict[str, str]: @@ -613,7 +613,6 @@ class CachedPod(WrappedPod): """ # name of the column in the tag store that contains the packet hash - PACKET_HASH_COLUMN = f"{constants.META_PREFIX}packet_hash" DATA_RETRIEVED_FLAG = f"{constants.META_PREFIX}data_retrieved" def __init__( @@ -660,7 +659,7 @@ def call( ) output_packet = None if not skip_cache_lookup: - output_packet = self.get_recorded_output_packet(packet) + output_packet = self.get_cached_output_for_packet(packet) if output_packet is None: tag, output_packet = super().call( tag, packet, record_id=record_id, execution_engine=execution_engine @@ -688,7 +687,7 @@ async def async_call( ) output_packet = None if not skip_cache_lookup: - output_packet = self.get_recorded_output_packet(packet) + output_packet = self.get_cached_output_for_packet(packet) if output_packet is None: tag, output_packet = await super().async_call( tag, packet, record_id=record_id, execution_engine=execution_engine @@ -766,7 +765,7 @@ def record_packet( # # TODO: make store 
return retrieved table return output_packet - def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | None: + def get_cached_output_for_packet(self, input_packet: dp.Packet) -> dp.Packet | None: """ Retrieve the output packet from the result store based on the input packet. If more than one output packet is found, conflict resolution strategy @@ -836,15 +835,18 @@ def get_recorded_output_packet(self, input_packet: dp.Packet) -> dp.Packet | Non meta_info={self.DATA_RETRIEVED_FLAG: str(datetime.now(timezone.utc))}, ) - def get_all_records( + def get_all_cached_outputs( self, include_system_columns: bool = False ) -> "pa.Table | None": """ Get all records from the result store for this pod. If include_system_columns is True, include system columns in the result. """ + record_id_column = ( + constants.PACKET_RECORD_ID if include_system_columns else None + ) result_table = self.result_store.get_all_records( - self.record_path, record_id_column=constants.INPUT_PACKET_HASH + self.record_path, record_id_column=record_id_column ) if result_table is None or result_table.num_rows == 0: return None diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 1d9cf0d..d6375b2 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -63,26 +63,33 @@ def run_in_thread(): class OperatorStreamBaseMixin: - def join(self, other_stream: dp.Stream) -> dp.Stream: + def join(self, other_stream: dp.Stream, label: str | None = None) -> dp.Stream: """ Joins this stream with another stream, returning a new stream that contains the combined data from both streams. """ from orcapod.data.operators import Join - return Join()(self, other_stream) # type: ignore[return-value] + return Join()(self, other_stream, label=label) # type: ignore - def semi_join(self, other_stream: dp.Stream) -> dp.Stream: + def semi_join( + self, + other_stream: dp.Stream, + label: str | None = None, + ) -> dp.Stream: """ Performs a semi-join with another stream, returning a new stream that contains only the packets from this stream that have matching tags in the other stream. """ from orcapod.data.operators import SemiJoin - return SemiJoin()(self, other_stream) # type: ignore[return-value] + return SemiJoin()(self, other_stream, label=label) # type: ignore def map_tags( - self, name_map: Mapping[str, str], drop_unmapped: bool = True + self, + name_map: Mapping[str, str], + drop_unmapped: bool = True, + label: str | None = None, ) -> dp.Stream: """ Maps the tags in this stream according to the provided name_map. @@ -90,10 +97,13 @@ def map_tags( """ from orcapod.data.operators import MapTags - return MapTags(name_map, drop_unmapped)(self) # type: ignore[return-value] + return MapTags(name_map, drop_unmapped)(self, label=label) # type: ignore def map_packets( - self, name_map: Mapping[str, str], drop_unmapped: bool = True + self, + name_map: Mapping[str, str], + drop_unmapped: bool = True, + label: str | None = None, ) -> dp.Stream: """ Maps the packets in this stream according to the provided packet_map. @@ -101,7 +111,21 @@ def map_packets( """ from orcapod.data.operators import MapPackets - return MapPackets(name_map, drop_unmapped)(self) # type: ignore[return-value] + return MapPackets(name_map, drop_unmapped)(self, label=label) # type: ignore + + def batch( + self, + batch_size: int = 0, + drop_last: bool = False, + label: str | None = None, + ) -> dp.Stream: + """ + Batch stream into fixed-size chunks, each of size batch_size. 
+ If drop_last is True, any remaining elements that don't fit into a full batch will be dropped. + """ + from orcapod.data.operators import Batch + + return Batch(batch_size=batch_size, drop_last=drop_last)(self, label=label) # type: ignore class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase): @@ -617,9 +641,14 @@ def as_table( table = arrow_utils.hstack_tables(*table_stack) if sort_by_tags: - return table.sort_by( - [(column, "ascending") for column in self._all_tag_columns] - ) + # TODO: cleanup the sorting tag selection logic + try: + return table.sort_by( + [(column, "ascending") for column in self._all_tag_columns] + ) + except pa.ArrowTypeError: + # If sorting fails, fall back to unsorted table + return table return table @@ -1094,7 +1123,9 @@ async def run_async( include_source=True, include_system_tags=True, ) - existing_entries = self.pod.get_all_records(include_system_columns=True) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) if existing_entries is None or existing_entries.num_rows == 0: missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) existing = None @@ -1156,7 +1187,7 @@ async def run_async( def run( self, execution_engine: dp.ExecutionEngine | None = None, - try_async_backend: bool = True, + try_async_backend: bool = False, ) -> None: if try_async_backend: # Use async run if requested @@ -1187,7 +1218,9 @@ def iter_packets( include_content_hash=constants.INPUT_PACKET_HASH, execution_engine=execution_engine, ) - existing_entries = self.pod.get_all_records(include_system_columns=True) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) if existing_entries is None or existing_entries.num_rows == 0: missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) existing = None @@ -1202,14 +1235,29 @@ def iter_packets( # .select([constants.INPUT_PACKET_HASH]).append_column( # "_exists", pa.array([True] * len(existing_entries)) # ), - all_results = target_entries.join( + + # TODO: do more proper replacement operation + target_df = pl.DataFrame(target_entries) + existing_df = pl.DataFrame( existing_entries.append_column( "_exists", pa.array([True] * len(existing_entries)) - ), - keys=[constants.INPUT_PACKET_HASH], - join_type="left outer", - right_suffix="_right", + ) + ) + all_results_df = target_df.join( + existing_df, + on=constants.INPUT_PACKET_HASH, + how="left", + suffix="_right", ) + all_results = all_results_df.to_arrow() + # all_results = target_entries.join( + # existing_entries.append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left outer", + # right_suffix="_right", # rename the existing records in case of collision of output packet keys with input packet keys + # ) # grab all columns from target_entries first missing = ( all_results.filter(pc.is_null(pc.field("_exists"))) @@ -1217,10 +1265,16 @@ def iter_packets( .drop_columns([constants.INPUT_PACKET_HASH]) ) - existing = ( - all_results.filter(pc.is_valid(pc.field("_exists"))) - .drop_columns(target_entries.column_names) - .drop_columns(["_exists"]) + existing = all_results.filter( + pc.is_valid(pc.field("_exists")) + ).drop_columns( + [ + "_exists", + constants.INPUT_PACKET_HASH, + constants.PACKET_RECORD_ID, + *self.input_stream.keys()[1], # remove the input packet keys + ] + # TODO: look into NOT fetching back the record ID ) renamed = [ c.removesuffix("_right") if c.endswith("_right") else c @@ -1361,10 
+1415,13 @@ def as_table( ) if sort_by_tags: - # TODO: consider having explicit tag/packet properties? - output_table = output_table.sort_by( - [(column, "ascending") for column in self.keys()[0]] - ) + try: + # TODO: consider having explicit tag/packet properties? + output_table = output_table.sort_by( + [(column, "ascending") for column in self.keys()[0]] + ) + except pa.ArrowTypeError: + pass return output_table diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 752d368..7b7f488 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -1,5 +1,6 @@ from orcapod.data.trackers import GraphTracker, Invocation from orcapod.pipeline.nodes import KernelNode, PodNode +from orcapod.protocols.pipeline_protocols import Node from orcapod import contexts from orcapod.protocols import data_protocols as dp from orcapod.protocols import store_protocols as sp @@ -41,7 +42,7 @@ def __init__( self.results_store_path_prefix = self.name + ("_results",) self.pipeline_store = pipeline_store self.results_store = results_store - self.nodes = {} + self.nodes: dict[str, Node] = {} self.auto_compile = auto_compile self._dirty = False self._ordered_nodes = [] # Track order of invocations @@ -79,6 +80,8 @@ def record_pod_invocation( def compile(self) -> None: import networkx as nx + name_candidates = {} + invocation_to_stream_lut = {} G = self.generate_graph() for invocation in nx.topological_sort(G): @@ -87,7 +90,17 @@ def compile(self) -> None: ] node = self.wrap_invocation(invocation, new_input_streams=input_streams) invocation_to_stream_lut[invocation] = node() - self.nodes[node.label] = node + name_candidates.setdefault(node.label, []).append(node) + + # visit through the name candidates and resolve any collisions + for label, nodes in name_candidates.items(): + if len(nodes) > 1: + # If there are multiple nodes with the same label, we need to resolve the collision + logger.info(f"Collision detected for label '{label}': {nodes}") + for i, node in enumerate(nodes, start=1): + self.nodes[f"{label}_{i}"] = node + else: + self.nodes[label] = nodes[0] def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: # FIXME: perform more efficient traversal through the graph! 
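The compile() change above defers label assignment: nodes are first grouped by their proposed label, and only labels claimed by more than one node receive a numeric suffix. A minimal, standalone sketch of that pattern (illustrative names only, not the orcapod API):

def resolve_labels(nodes_by_label: dict[str, list[object]]) -> dict[str, object]:
    """Group-then-suffix disambiguation, as in GraphTracker.compile()."""
    resolved: dict[str, object] = {}
    for label, nodes in nodes_by_label.items():
        if len(nodes) > 1:
            # Several nodes share the label: append a 1-based suffix to each.
            for i, node in enumerate(nodes, start=1):
                resolved[f"{label}_{i}"] = node
        else:
            # Unique label: keep it unchanged.
            resolved[label] = nodes[0]
    return resolved

# e.g. {"join": [a, b], "map_tags": [c]} -> {"join_1": a, "join_2": b, "map_tags": c}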
@@ -100,7 +113,7 @@ def wrap_invocation( self, invocation: Invocation, new_input_streams: Collection[dp.Stream], - ) -> dp.Kernel: + ) -> Node: if invocation in self.invocation_to_pod_lut: pod = self.invocation_to_pod_lut[invocation] node = PodNode( diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index ac61dce..b6388eb 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -19,7 +19,7 @@ pd = LazyModule("pandas") -class Node( +class NodeBase( SourceBase, ): """ @@ -36,17 +36,17 @@ def __init__( super().__init__(**kwargs) self._cached_stream: KernelStream | None = None self.input_streams = tuple(input_streams) - self.pipeline_store = pipeline_store self.pipeline_path_prefix = pipeline_path_prefix # compute invocation hash - note that empty () is passed into identity_structure to signify # identity structure of invocation with no input streams - self.invocation_hash = self.data_context.object_hasher.hash_to_hex( - self.identity_structure(()), prefix_hasher_id=True - ) + self.invocation_hash = self.data_context.object_hasher.hash_object( + self.identity_structure(()) + ).to_string() tag_types, _ = self.types(include_system_tags=True) - self.tag_schema_hash = self.data_context.object_hasher.hash_to_hex( - tag_types, prefix_hasher_id=True - ) + self.tag_schema_hash = self.data_context.object_hasher.hash_object( + tag_types + ).to_string() + self.pipeline_store = pipeline_store @property def contained_kernel(self) -> dp.Kernel: @@ -112,7 +112,7 @@ def get_all_records( raise NotImplementedError("This method should be implemented by subclasses.") -class KernelNode(Node, WrappedKernel): +class KernelNode(NodeBase, WrappedKernel): """ A node in the pipeline that represents a kernel. This node can be used to execute the kernel and process data streams. @@ -202,7 +202,7 @@ def get_all_records( return results -class PodNode(Node, CachedPod): +class PodNode(NodeBase, CachedPod): def __init__( self, pod: dp.Pod, @@ -321,10 +321,10 @@ def add_pipeline_record( # TODO: consider using bytes instead of string representation tag_with_hash = tag.as_table(include_system_tags=True).append_column( constants.INPUT_PACKET_HASH, - pa.array([str(input_packet.content_hash())], type=pa.large_string()), + pa.array([input_packet.content_hash().to_string()], type=pa.large_string()), ) - entry_id = str(self.data_context.arrow_hasher.hash_table(tag_with_hash)) + entry_id = self.data_context.arrow_hasher.hash_table(tag_with_hash).to_string() # FIXME: consider and implement more robust cache lookup logic existing_record = None if not skip_cache_lookup: diff --git a/src/orcapod/protocols/data_protocols/base.py b/src/orcapod/protocols/data_protocols/base.py index 080e2f3..c44d52c 100644 --- a/src/orcapod/protocols/data_protocols/base.py +++ b/src/orcapod/protocols/data_protocols/base.py @@ -83,7 +83,7 @@ class Labelable(Protocol): """ @property - def label(self) -> str | None: + def label(self) -> str: """ Return the human-readable label for this object. @@ -95,3 +95,16 @@ def label(self) -> str | None: None: No label is set (will use default naming) """ ... + + @label.setter + def label(self, label: str | None) -> None: + """ + Set the human-readable label for this object. + + Labels should be descriptive and help users understand the purpose + or role of the object in the computational graph. + + Args: + value (str): Human-readable label for this object + """ + ... 
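With the Labelable change above, label is now a read/write property that always returns a string. A rough sketch of a class that structurally satisfies such a protocol (ExampleKernel and its fallback behaviour are assumptions for illustration, not orcapod code):

from typing import Protocol


class Labelable(Protocol):
    @property
    def label(self) -> str: ...

    @label.setter
    def label(self, label: str | None) -> None: ...


class ExampleKernel:
    """Illustration only: satisfies Labelable structurally."""

    def __init__(self, label: str | None = None) -> None:
        self._label = label

    @property
    def label(self) -> str:
        # Fall back to a default name when no explicit label is set.
        return self._label if self._label is not None else type(self).__name__

    @label.setter
    def label(self, label: str | None) -> None:
        self._label = label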
diff --git a/src/orcapod/protocols/data_protocols/pods.py b/src/orcapod/protocols/data_protocols/pods.py index 80ce1d1..68fcb91 100644 --- a/src/orcapod/protocols/data_protocols/pods.py +++ b/src/orcapod/protocols/data_protocols/pods.py @@ -187,11 +187,23 @@ def call( """ ... - def get_all_records( + def get_cached_output_for_packet(self, input_packet: Packet) -> Packet | None: + """ + Retrieve the cached output packet for a given input packet. + + Args: + input_packet: The input packet to look up in the cache + + Returns: + Packet | None: The cached output packet, or None if not found + """ + ... + + def get_all_cached_outputs( self, include_system_columns: bool = False ) -> "pa.Table | None": """ - Retrieve all records processed by this Pod. + Retrieve all packets processed by this Pod. This method returns a table containing all packets processed by the Pod, including metadata and system columns if requested. It is useful for: diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 75fffbf..28437a3 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -53,8 +53,14 @@ def to_base64(self) -> str: return base64.b64encode(self.digest).decode("ascii") + def to_string(self, prefix_method: bool = True) -> str: + """Convert digest to a string representation.""" + if prefix_method: + return f"{self.method}:{self.to_hex()}" + return self.to_hex() + def __str__(self) -> str: - return f"{self.method}:{self.to_hex()}" + return self.to_string() @classmethod def from_string(cls, hash_string: str) -> "ContentHash": diff --git a/src/orcapod/protocols/pipeline_protocols.py b/src/orcapod/protocols/pipeline_protocols.py new file mode 100644 index 0000000..725bbe7 --- /dev/null +++ b/src/orcapod/protocols/pipeline_protocols.py @@ -0,0 +1,37 @@ +# Protocols for pipeline and nodes +from typing import Protocol, runtime_checkable, TYPE_CHECKING +from orcapod.protocols.data_protocols.source import Source +from orcapod.protocols.data_protocols.pods import CachedPod + + +if TYPE_CHECKING: + import pyarrow as pa + + +class Node(Source, Protocol): + # def record_pipeline_outputs(self): + # pass + ... + + +@runtime_checkable +class PodNode(CachedPod, Protocol): + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + """ + Retrieve all tag and packet processed by this Pod. + + This method returns a table containing all packets processed by the Pod, + including metadata and system columns if requested. It is useful for: + - Debugging and analysis + - Auditing and data lineage tracking + - Performance monitoring + + Args: + include_system_columns: Whether to include system columns in the output + + Returns: + pa.Table | None: A table containing all processed records, or None if no records are available + """ + ... 
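The new ContentHash.to_string() above standardizes the textual digest form as "<method>:<hex>", with an option to omit the method prefix; __str__ now delegates to it. A condensed stand-in showing the convention (fields and helpers simplified from the surrounding code; this is not the full class):

from dataclasses import dataclass


@dataclass(frozen=True)
class ContentHashSketch:
    method: str    # e.g. "sha256"
    digest: bytes

    def to_hex(self) -> str:
        return self.digest.hex()

    def to_string(self, prefix_method: bool = True) -> str:
        # The prefixed form is what hash_object(...).to_string() callers store and compare.
        return f"{self.method}:{self.to_hex()}" if prefix_method else self.to_hex()

    def __str__(self) -> str:
        return self.to_string()


# ContentHashSketch("sha256", bytes.fromhex("ab12")).to_string() -> "sha256:ab12"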
diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 8e5bf9c..111937a 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -6,6 +6,7 @@ from typing import Any + from typing import TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule @@ -89,6 +90,9 @@ def normalize_to_large_types(arrow_type: "pa.DataType") -> "pa.DataType": struct """ # Handle primitive types that have large variants + if pa.types.is_null(arrow_type): + # TODO: make this configurable + return pa.large_string() if pa.types.is_string(arrow_type): return pa.large_string() elif pa.types.is_binary(arrow_type): @@ -597,7 +601,12 @@ def prepare_prefixed_columns( if value is not None: # Use value from source_info dictionary - column_values = pa.array([value] * num_rows, type=pa.large_string()) + # TODO: clean up the logic here + if not isinstance(value, str) and isinstance(value, Collection): + # TODO: this won't work other data types!!! + column_values = pa.array([value] * num_rows, type=pa.list_(pa.large_string())) + else: + column_values = pa.array([value] * num_rows, type=pa.large_string()) # if col_name is in existing_source_info, use that column elif col_name in existing_columns: # Use existing prefixed column, but convert to large_string @@ -618,9 +627,11 @@ def prepare_prefixed_columns( data_table: pa.Table = pa.Table.from_arrays(data_columns, names=data_column_names) result_tables = {} for prefix in all_prefix_info: - result_tables[prefix] = pa.Table.from_arrays( + prefix_table = pa.Table.from_arrays( prefixed_columns[prefix], names=prefixed_column_names[prefix] ) + result_tables[prefix] = normalize_table_to_large_types(prefix_table) + return data_table, result_tables diff --git a/uv.lock b/uv.lock index a50b758..5cafb27 100644 --- a/uv.lock +++ b/uv.lock @@ -145,6 +145,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "antlr4-python3-runtime" +version = "4.9.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b", size = 117034, upload-time = "2021-11-06T17:52:23.524Z" } + +[[package]] +name = "appdirs" +version = "1.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/d8/05696357e0311f5b5c316d7b95f46c669dd9c15aaeecbb48c7d0aeb88c40/appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", size = 13470, upload-time = "2020-05-11T07:59:51.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128", size = 9566, upload-time = "2020-05-11T07:59:49.499Z" }, +] + [[package]] name = "appnope" version = "0.1.4" @@ -345,6 +360,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl", hash = 
"sha256:b6a1bd56c72f31b0a496a36cc55df6e2f475db166ad07fa4acc7e74f4c7f34c0", size = 1191340, upload-time = "2025-05-22T05:09:24.606Z" }, ] +[[package]] +name = "blinker" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, +] + [[package]] name = "boto3" version = "1.39.11" @@ -631,6 +655,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "datajoint" +version = "0.14.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, + { name = "deepdiff" }, + { name = "faker" }, + { name = "ipython" }, + { name = "matplotlib" }, + { name = "minio" }, + { name = "networkx" }, + { name = "numpy" }, + { name = "otumat" }, + { name = "pandas" }, + { name = "pydot" }, + { name = "pymysql" }, + { name = "pyparsing" }, + { name = "setuptools" }, + { name = "tqdm" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/bf/bb/8e4e69bdba1aca9806cb0667a784bc07c9d381e0d0e45a9fd2d79eeaf65d/datajoint-0.14.6.tar.gz", hash = "sha256:1bd91c0c8a2a6d6521e95943dbff40751cc6c3893de9ae320a40f5211cc6f074", size = 151647, upload-time = "2025-07-31T22:08:03.836Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/03/58ab7a19c21b08ee148299d81467ce2164cb0c47062404f626445e056ddf/datajoint-0.14.6-py3-none-any.whl", hash = "sha256:b4b6724ccdc9b5daed9773f18695e82df8ed7a8ce3b40944b0e916ef68ce5a71", size = 115036, upload-time = "2025-07-31T22:08:02.539Z" }, +] + [[package]] name = "debugpy" version = "1.8.14" @@ -657,6 +708,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, ] +[[package]] +name = "decord" +version = "0.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/79/936af42edf90a7bd4e41a6cac89c913d4b47fa48a26b042d5129a9242ee3/decord-0.6.0-py3-none-manylinux2010_x86_64.whl", hash = "sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976", size = 13602299, upload-time = "2021-06-14T21:30:55.486Z" }, + { url = "https://files.pythonhosted.org/packages/6c/be/e15b5b866da452e62635a7b27513f31cb581fa2ea9cc9b768b535d62a955/decord-0.6.0-py3-none-win_amd64.whl", hash = "sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad", size = 24733380, upload-time = "2021-06-14T21:30:57.766Z" }, +] + +[[package]] +name = "deepdiff" +version = "8.6.0" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "orderly-set" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/03/65/57d5047a03700ccb3eaab9d86837168701b1527fdd2bd9fe7a212bee83b1/deepdiff-8.6.0.tar.gz", hash = "sha256:6197216c2d777c3106a9989055c230e25848e599b26dcbcdc66226bd8d7fe901", size = 631801, upload-time = "2025-08-08T19:00:27.563Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/4d/4c9ba906175430d6f1cb40b7aa90673720c2c4b3fcea03a3719b1906f983/deepdiff-8.6.0-py3-none-any.whl", hash = "sha256:db80677a434ac1f84147fd1598e93f1beb06d467e107af45fcf77cf8a681169f", size = 91121, upload-time = "2025-08-08T19:00:25.575Z" }, +] + [[package]] name = "defusedxml" version = "0.7.1" @@ -705,6 +780,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "einops" +version = "0.8.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e5/81/df4fbe24dff8ba3934af99044188e20a98ed441ad17a274539b74e82e126/einops-0.8.1.tar.gz", hash = "sha256:de5d960a7a761225532e0f1959e5315ebeafc0cd43394732f103ca44b9837e84", size = 54805, upload-time = "2025-02-09T03:17:00.434Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/62/9773de14fe6c45c23649e98b83231fffd7b9892b6cf863251dc2afa73643/einops-0.8.1-py3-none-any.whl", hash = "sha256:919387eb55330f5757c6bea9165c5ff5cfe63a642682ea788a6d472576d81737", size = 64359, upload-time = "2025-02-09T03:17:01.998Z" }, +] + [[package]] name = "executing" version = "2.2.0" @@ -714,6 +798,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702, upload-time = "2025-01-22T15:41:25.929Z" }, ] +[[package]] +name = "faker" +version = "37.5.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tzdata" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ce/5d/7797a74e8e31fa227f0303239802c5f09b6722bdb6638359e7b6c8f30004/faker-37.5.3.tar.gz", hash = "sha256:8315d8ff4d6f4f588bd42ffe63abd599886c785073e26a44707e10eeba5713dc", size = 1907147, upload-time = "2025-07-30T15:52:19.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/bf/d06dd96e7afa72069dbdd26ed0853b5e8bd7941e2c0819a9b21d6e6fc052/faker-37.5.3-py3-none-any.whl", hash = "sha256:386fe9d5e6132a915984bf887fcebcc72d6366a25dd5952905b31b141a17016d", size = 1949261, upload-time = "2025-07-30T15:52:17.729Z" }, +] + [[package]] name = "filelock" version = "3.18.0" @@ -723,6 +819,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215, upload-time = "2025-03-14T07:11:39.145Z" }, ] +[[package]] +name = "flask" +version = "3.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "blinker" }, + { name = "click" }, + { name = "itsdangerous" }, + { name = "jinja2" }, + { name = "markupsafe" }, + { name = "werkzeug" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/dc/6d/cfe3c0fcc5e477df242b98bfe186a4c34357b4847e87ecaef04507332dab/flask-3.1.2.tar.gz", hash = "sha256:bf656c15c80190ed628ad08cdfd3aaa35beb087855e2f494910aa3774cc4fd87", size = 720160, upload-time = "2025-08-19T21:03:21.205Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/f9/7f9263c5695f4bd0023734af91bedb2ff8209e8de6ead162f35d8dc762fd/flask-3.1.2-py3-none-any.whl", hash = "sha256:ca1d8112ec8a6158cc29ea4858963350011b5c846a414cdb7a954aa9e967d03c", size = 103308, upload-time = "2025-08-19T21:03:19.499Z" }, +] + [[package]] name = "fonttools" version = "4.58.1" @@ -992,6 +1105,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/34/80/de3eb55eb581815342d097214bed4c59e806b05f1b3110df03b2280d6dfd/grpcio-1.74.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24", size = 4489214, upload-time = "2025-07-24T18:53:59.771Z" }, ] +[[package]] +name = "hf-xet" +version = "1.1.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7a/49/91010b59debc7c862a5fd426d343134dd9a68778dbe570234b6495a4e204/hf_xet-1.1.8.tar.gz", hash = "sha256:62a0043e441753bbc446dcb5a3fe40a4d03f5fb9f13589ef1df9ab19252beb53", size = 484065, upload-time = "2025-08-18T22:01:03.584Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9c/91/5814db3a0d4a65fb6a87f0931ae28073b87f06307701fe66e7c41513bfb4/hf_xet-1.1.8-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3d5f82e533fc51c7daad0f9b655d9c7811b5308e5890236828bd1dd3ed8fea74", size = 2752357, upload-time = "2025-08-18T22:00:58.777Z" }, + { url = "https://files.pythonhosted.org/packages/70/72/ce898516e97341a7a9d450609e130e108643389110261eaee6deb1ba8545/hf_xet-1.1.8-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:8e2dba5896bca3ab61d0bef4f01a1647004de59640701b37e37eaa57087bbd9d", size = 2613142, upload-time = "2025-08-18T22:00:57.252Z" }, + { url = "https://files.pythonhosted.org/packages/b7/d6/13af5f916cef795ac2b5e4cc1de31f2e0e375f4475d50799915835f301c2/hf_xet-1.1.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfe5700bc729be3d33d4e9a9b5cc17a951bf8c7ada7ba0c9198a6ab2053b7453", size = 3175859, upload-time = "2025-08-18T22:00:55.978Z" }, + { url = "https://files.pythonhosted.org/packages/4c/ed/34a193c9d1d72b7c3901b3b5153b1be9b2736b832692e1c3f167af537102/hf_xet-1.1.8-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:09e86514c3c4284ed8a57d6b0f3d089f9836a0af0a1ceb3c9dd664f1f3eaefef", size = 3074178, upload-time = "2025-08-18T22:00:54.147Z" }, + { url = "https://files.pythonhosted.org/packages/4a/1b/de6817b4bf65385280252dff5c9cceeedfbcb27ddb93923639323c1034a4/hf_xet-1.1.8-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:4a9b99ab721d385b83f4fc8ee4e0366b0b59dce03b5888a86029cc0ca634efbf", size = 3238122, upload-time = "2025-08-18T22:01:00.546Z" }, + { url = "https://files.pythonhosted.org/packages/b7/13/874c85c7ed519ec101deb654f06703d9e5e68d34416730f64c4755ada36a/hf_xet-1.1.8-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:25b9d43333bbef39aeae1616789ec329c21401a7fe30969d538791076227b591", size = 3344325, upload-time = "2025-08-18T22:01:02.013Z" }, + { url = "https://files.pythonhosted.org/packages/9e/d3/0aaf279f4f3dea58e99401b92c31c0f752924ba0e6c7d7bb07b1dbd7f35e/hf_xet-1.1.8-cp37-abi3-win_amd64.whl", hash = "sha256:4171f31d87b13da4af1ed86c98cf763292e4720c088b4957cf9d564f92904ca9", size = 2801689, upload-time = "2025-08-18T22:01:04.81Z" }, +] + 
[[package]] name = "httpie" version = "3.2.4" @@ -1013,6 +1141,39 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/03/b6/39bcf01e1185882f34bc9fb77d1fb4a27911a55f60ab407de34abc8a2347/httpie-3.2.4-py3-none-any.whl", hash = "sha256:4bd0435cc4b9bca59501bc65089de96f3e93b393803f32a81951db62050ebf0b", size = 127860, upload-time = "2024-11-01T17:31:22.962Z" }, ] +[[package]] +name = "huggingface-hub" +version = "0.34.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/45/c9/bdbe19339f76d12985bc03572f330a01a93c04dffecaaea3061bdd7fb892/huggingface_hub-0.34.4.tar.gz", hash = "sha256:a4228daa6fb001be3f4f4bdaf9a0db00e1739235702848df00885c9b5742c85c", size = 459768, upload-time = "2025-08-08T09:14:52.365Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/7b/bb06b061991107cd8783f300adff3e7b7f284e330fd82f507f2a1417b11d/huggingface_hub-0.34.4-py3-none-any.whl", hash = "sha256:9b365d781739c93ff90c359844221beef048403f1bc1f1c123c191257c3c890a", size = 561452, upload-time = "2025-08-08T09:14:50.159Z" }, +] + +[[package]] +name = "hydra-core" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "omegaconf" }, + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/8e/07e42bc434a847154083b315779b0a81d567154504624e181caf2c71cd98/hydra-core-1.3.2.tar.gz", hash = "sha256:8a878ed67216997c3e9d88a8e72e7b4767e81af37afb4ea3334b269a4390a824", size = 3263494, upload-time = "2023-02-23T18:33:43.03Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl", hash = "sha256:fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b", size = 154547, upload-time = "2023-02-23T18:33:40.801Z" }, +] + [[package]] name = "idna" version = "3.10" @@ -1022,6 +1183,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] +[[package]] +name = "imageio" +version = "2.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0c/47/57e897fb7094afb2d26e8b2e4af9a45c7cf1a405acdeeca001fdf2c98501/imageio-2.37.0.tar.gz", hash = "sha256:71b57b3669666272c818497aebba2b4c5f20d5b37c81720e5e1a56d59c492996", size = 389963, upload-time = "2025-01-20T02:42:37.089Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/bd/b394387b598ed84d8d0fa90611a90bee0adc2021820ad5729f7ced74a8e2/imageio-2.37.0-py3-none-any.whl", hash = "sha256:11efa15b87bc7871b61590326b2d635439acc321cf7f8ce996f812543ce10eed", size = 315796, upload-time = "2025-01-20T02:42:34.931Z" }, +] + [[package]] name = "importlib-metadata" version = "8.7.0" @@ -1125,6 +1299,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, ] +[[package]] +name = "itsdangerous" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, +] + [[package]] name = "jedi" version = "0.19.2" @@ -1495,6 +1678,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/71/4ad9a42f2772793a03cb698f0fc42499f04e6e8d2560ba2f7da0fb059a8e/mmh3-5.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:b22fe2e54be81f6c07dcb36b96fa250fb72effe08aa52fbb83eade6e1e2d5fd7", size = 38890, upload-time = "2025-01-25T08:39:25.28Z" }, ] +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + [[package]] name = "msal" version = "1.33.0" @@ -1665,6 +1857,124 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, ] +[[package]] +name = "nvidia-cublas-cu12" +version = "12.4.5.8" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805, upload-time = "2024-04-03T20:57:06.025Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957, upload-time = "2024-04-03T20:55:01.564Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306, upload-time = "2024-04-03T20:56:01.463Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737, upload-time = "2024-04-03T20:54:51.355Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.1.0.70" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741, upload-time = "2024-04-22T15:24:15.253Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.2.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117, upload-time = "2024-04-03T20:57:40.402Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.5.147" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206, upload-time = "2024-04-03T20:58:08.722Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.6.1.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cusparse-cu12" }, + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057, upload-time = "2024-04-03T20:58:28.735Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.3.1.170" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763, upload-time = "2024-04-03T20:58:59.995Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751, upload-time = "2024-07-23T02:35:53.074Z" }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.21.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/99/12cd266d6233f47d00daf3a72739872bdc10267d0383508b0b9c84a18bb6/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0", size = 188654414, upload-time = "2024-04-03T15:32:57.427Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810, upload-time = "2024-04-03T20:59:46.957Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144, upload-time = "2024-04-03T20:56:12.406Z" }, +] + [[package]] name = "oauthlib" version = "3.3.1" @@ -1674,6 +1984,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, ] +[[package]] +name = "omegaconf" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/48/6388f1bb9da707110532cb70ec4d2822858ddfb44f1cdf1233c20a80ea4b/omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7", size = 3298120, upload-time = "2022-12-08T20:59:22.753Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl", hash = "sha256:7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", size = 79500, upload-time = "2022-12-08T20:59:19.686Z" }, +] + [[package]] name = "opencensus" version = "0.11.4" @@ -1697,6 +2020,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl", hash = "sha256:073bb0590007af276853009fac7e4bab1d523c3f03baf4cb4511ca38967c6039", size = 5060, upload-time = "2022-08-03T22:20:20.352Z" }, ] +[[package]] +name = "opencv-python" +version = "4.12.0.88" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/71/25c98e634b6bdeca4727c7f6d6927b056080668c5008ad3c8fc9e7f8f6ec/opencv-python-4.12.0.88.tar.gz", hash 
= "sha256:8b738389cede219405f6f3880b851efa3415ccd674752219377353f017d2994d", size = 95373294, upload-time = "2025-07-07T09:20:52.389Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/85/68/3da40142e7c21e9b1d4e7ddd6c58738feb013203e6e4b803d62cdd9eb96b/opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:f9a1f08883257b95a5764bf517a32d75aec325319c8ed0f89739a57fae9e92a5", size = 37877727, upload-time = "2025-07-07T09:13:31.47Z" }, + { url = "https://files.pythonhosted.org/packages/33/7c/042abe49f58d6ee7e1028eefc3334d98ca69b030e3b567fe245a2b28ea6f/opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:812eb116ad2b4de43ee116fcd8991c3a687f099ada0b04e68f64899c09448e81", size = 57326471, upload-time = "2025-07-07T09:13:41.26Z" }, + { url = "https://files.pythonhosted.org/packages/62/3a/440bd64736cf8116f01f3b7f9f2e111afb2e02beb2ccc08a6458114a6b5d/opencv_python-4.12.0.88-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:51fd981c7df6af3e8f70b1556696b05224c4e6b6777bdd2a46b3d4fb09de1a92", size = 45887139, upload-time = "2025-07-07T09:13:50.761Z" }, + { url = "https://files.pythonhosted.org/packages/68/1f/795e7f4aa2eacc59afa4fb61a2e35e510d06414dd5a802b51a012d691b37/opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:092c16da4c5a163a818f120c22c5e4a2f96e0db4f24e659c701f1fe629a690f9", size = 67041680, upload-time = "2025-07-07T09:14:01.995Z" }, + { url = "https://files.pythonhosted.org/packages/02/96/213fea371d3cb2f1d537612a105792aa0a6659fb2665b22cad709a75bd94/opencv_python-4.12.0.88-cp37-abi3-win32.whl", hash = "sha256:ff554d3f725b39878ac6a2e1fa232ec509c36130927afc18a1719ebf4fbf4357", size = 30284131, upload-time = "2025-07-07T09:14:08.819Z" }, + { url = "https://files.pythonhosted.org/packages/fa/80/eb88edc2e2b11cd2dd2e56f1c80b5784d11d6e6b7f04a1145df64df40065/opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:d98edb20aa932fd8ebd276a72627dad9dc097695b3d435a4257557bbb49a79d2", size = 39000307, upload-time = "2025-07-07T09:14:16.641Z" }, +] + [[package]] name = "opentelemetry-api" version = "1.36.0" @@ -1775,6 +2115,7 @@ dependencies = [ { name = "polars" }, { name = "pyarrow" }, { name = "pyyaml" }, + { name = "selection-pipeline" }, { name = "typing-extensions" }, { name = "xxhash" }, ] @@ -1797,15 +2138,19 @@ redis = [ dev = [ { name = "adlfs" }, { name = "boto3" }, + { name = "datajoint" }, { name = "deltalake" }, { name = "gcsfs" }, { name = "httpie" }, + { name = "hydra-core" }, + { name = "imageio" }, { name = "ipykernel" }, { name = "ipywidgets" }, { name = "jsonschema" }, { name = "minio" }, { name = "mkdocs" }, { name = "pyarrow-stubs" }, + { name = "pygraphviz" }, { name = "pyiceberg" }, { name = "pytest" }, { name = "pytest-cov" }, @@ -1831,6 +2176,7 @@ requires-dist = [ { name = "pyyaml", specifier = ">=6.0.2" }, { name = "ray", extras = ["default"], marker = "extra == 'ray'", specifier = "==2.48.0" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=6.2.0" }, + { name = "selection-pipeline", git = "https://github.com/enigma-brain/selection_pipeline" }, { name = "typing-extensions" }, { name = "xxhash" }, ] @@ -1840,15 +2186,19 @@ provides-extras = ["redis", "ray", "all"] dev = [ { name = "adlfs", specifier = ">=2024.12.0" }, { name = "boto3", specifier = ">=1.39.11" }, + { name = "datajoint", specifier = ">=0.14.6" }, { name = "deltalake", specifier = ">=1.0.2" }, { name = "gcsfs", specifier = ">=2025.7.0" }, { name = "httpie", specifier = 
">=3.2.4" }, + { name = "hydra-core", specifier = ">=1.3.2" }, + { name = "imageio", specifier = ">=2.37.0" }, { name = "ipykernel", specifier = ">=6.29.5" }, { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "jsonschema", specifier = ">=4.25.0" }, { name = "minio", specifier = ">=7.2.16" }, { name = "mkdocs", specifier = ">=1.6.1" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, + { name = "pygraphviz", specifier = ">=1.14" }, { name = "pyiceberg", specifier = ">=0.9.1" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-cov", specifier = ">=6.1.1" }, @@ -1859,6 +2209,30 @@ dev = [ { name = "tqdm", specifier = ">=4.67.1" }, ] +[[package]] +name = "orderly-set" +version = "5.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4a/88/39c83c35d5e97cc203e9e77a4f93bf87ec89cf6a22ac4818fdcc65d66584/orderly_set-5.5.0.tar.gz", hash = "sha256:e87185c8e4d8afa64e7f8160ee2c542a475b738bc891dc3f58102e654125e6ce", size = 27414, upload-time = "2025-07-10T20:10:55.885Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/27/fb8d7338b4d551900fa3e580acbe7a0cf655d940e164cb5c00ec31961094/orderly_set-5.5.0-py3-none-any.whl", hash = "sha256:46f0b801948e98f427b412fcabb831677194c05c3b699b80de260374baa0b1e7", size = 13068, upload-time = "2025-07-10T20:10:54.377Z" }, +] + +[[package]] +name = "otumat" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "appdirs" }, + { name = "cryptography" }, + { name = "flask" }, + { name = "watchdog" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/ce/48cc07bf7d8e65d2a76b506308364b7472cedc237ab45dfab67ee62de50e/otumat-0.3.1.tar.gz", hash = "sha256:52aca063d8b451a258720f3ead085efa34496d818a4c0d294eecda8040d5966c", size = 17553, upload-time = "2022-01-19T19:21:33.542Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e4/36/6c4b16dd2fa7ba12447e9ea841be410feab834a82092a5d04d65658cabc1/otumat-0.3.1-py3-none-any.whl", hash = "sha256:5388682703da1c5bf809af505639b7d374522d91cf71dd1f185d5a334040d208", size = 15980, upload-time = "2022-01-19T19:21:32.296Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -2330,6 +2704,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, ] +[[package]] +name = "pydot" +version = "4.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyparsing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/35/b17cb89ff865484c6a20ef46bf9d95a5f07328292578de0b295f4a6beec2/pydot-4.0.1.tar.gz", hash = "sha256:c2148f681c4a33e08bf0e26a9e5f8e4099a82e0e2a068098f32ce86577364ad5", size = 162594, upload-time = "2025-06-17T20:09:56.454Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/32/a7125fb28c4261a627f999d5fb4afff25b523800faed2c30979949d6facd/pydot-4.0.1-py3-none-any.whl", hash = "sha256:869c0efadd2708c0be1f916eb669f3d664ca684bc57ffb7ecc08e70d5e93fee6", size = 37087, upload-time = "2025-06-17T20:09:55.25Z" }, +] + [[package]] name = "pygments" version = "2.19.1" @@ -2339,6 +2725,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", 
hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293, upload-time = "2025-01-06T17:26:25.553Z" }, ] +[[package]] +name = "pygraphviz" +version = "1.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/ca/823d5c74a73d6b8b08e1f5aea12468ef334f0732c65cbb18df2a7f285c87/pygraphviz-1.14.tar.gz", hash = "sha256:c10df02377f4e39b00ae17c862f4ee7e5767317f1c6b2dfd04cea6acc7fc2bea", size = 106003, upload-time = "2024-09-29T18:31:12.471Z" } + [[package]] name = "pyiceberg" version = "0.9.1" @@ -2379,6 +2771,15 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymysql" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/ce59b5e5ed4ce8512f879ff1fa5ab699d211ae2495f1adaa5fbba2a1eada/pymysql-1.1.1.tar.gz", hash = "sha256:e127611aaf2b417403c60bf4dc570124aeb4a57f5f37b8e95ae399a42f904cd0", size = 47678, upload-time = "2024-05-21T11:03:43.722Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/94/e4181a1f6286f545507528c78016e00065ea913276888db2262507693ce5/PyMySQL-1.1.1-py3-none-any.whl", hash = "sha256:4de15da4c61dc132f4fb9ab763063e693d521a80fd0e87943b9a453dd4c19d6c", size = 44972, upload-time = "2024-05-21T11:03:41.216Z" }, +] + [[package]] name = "pyparsing" version = "3.2.3" @@ -2605,6 +3006,56 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0", size = 26775, upload-time = "2025-01-25T08:48:14.241Z" }, ] +[[package]] +name = "regex" +version = "2025.7.34" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/de/e13fa6dc61d78b30ba47481f99933a3b49a57779d625c392d8036770a60d/regex-2025.7.34.tar.gz", hash = "sha256:9ead9765217afd04a86822dfcd4ed2747dfe426e887da413b15ff0ac2457e21a", size = 400714, upload-time = "2025-07-31T00:21:16.262Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/f0/31d62596c75a33f979317658e8d261574785c6cd8672c06741ce2e2e2070/regex-2025.7.34-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7f7211a746aced993bef487de69307a38c5ddd79257d7be83f7b202cb59ddb50", size = 485492, upload-time = "2025-07-31T00:19:35.57Z" }, + { url = "https://files.pythonhosted.org/packages/d8/16/b818d223f1c9758c3434be89aa1a01aae798e0e0df36c1f143d1963dd1ee/regex-2025.7.34-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fb31080f2bd0681484b275461b202b5ad182f52c9ec606052020fe13eb13a72f", size = 290000, upload-time = "2025-07-31T00:19:37.175Z" }, + { url = "https://files.pythonhosted.org/packages/cd/70/69506d53397b4bd6954061bae75677ad34deb7f6ca3ba199660d6f728ff5/regex-2025.7.34-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0200a5150c4cf61e407038f4b4d5cdad13e86345dac29ff9dab3d75d905cf130", size = 286072, upload-time = "2025-07-31T00:19:38.612Z" }, + { url = "https://files.pythonhosted.org/packages/b0/73/536a216d5f66084fb577bb0543b5cb7de3272eb70a157f0c3a542f1c2551/regex-2025.7.34-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:739a74970e736df0773788377969c9fea3876c2fc13d0563f98e5503e5185f46", size = 797341, upload-time = "2025-07-31T00:19:40.119Z" }, + { url = 
"https://files.pythonhosted.org/packages/26/af/733f8168449e56e8f404bb807ea7189f59507cbea1b67a7bbcd92f8bf844/regex-2025.7.34-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4fef81b2f7ea6a2029161ed6dea9ae13834c28eb5a95b8771828194a026621e4", size = 862556, upload-time = "2025-07-31T00:19:41.556Z" }, + { url = "https://files.pythonhosted.org/packages/19/dd/59c464d58c06c4f7d87de4ab1f590e430821345a40c5d345d449a636d15f/regex-2025.7.34-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea74cf81fe61a7e9d77989050d0089a927ab758c29dac4e8e1b6c06fccf3ebf0", size = 910762, upload-time = "2025-07-31T00:19:43Z" }, + { url = "https://files.pythonhosted.org/packages/37/a8/b05ccf33ceca0815a1e253693b2c86544932ebcc0049c16b0fbdf18b688b/regex-2025.7.34-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e4636a7f3b65a5f340ed9ddf53585c42e3ff37101d383ed321bfe5660481744b", size = 801892, upload-time = "2025-07-31T00:19:44.645Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9a/b993cb2e634cc22810afd1652dba0cae156c40d4864285ff486c73cd1996/regex-2025.7.34-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6cef962d7834437fe8d3da6f9bfc6f93f20f218266dcefec0560ed7765f5fe01", size = 786551, upload-time = "2025-07-31T00:19:46.127Z" }, + { url = "https://files.pythonhosted.org/packages/2d/79/7849d67910a0de4e26834b5bb816e028e35473f3d7ae563552ea04f58ca2/regex-2025.7.34-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:cbe1698e5b80298dbce8df4d8d1182279fbdaf1044e864cbc9d53c20e4a2be77", size = 856457, upload-time = "2025-07-31T00:19:47.562Z" }, + { url = "https://files.pythonhosted.org/packages/91/c6/de516bc082524b27e45cb4f54e28bd800c01efb26d15646a65b87b13a91e/regex-2025.7.34-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:32b9f9bcf0f605eb094b08e8da72e44badabb63dde6b83bd530580b488d1c6da", size = 848902, upload-time = "2025-07-31T00:19:49.312Z" }, + { url = "https://files.pythonhosted.org/packages/7d/22/519ff8ba15f732db099b126f039586bd372da6cd4efb810d5d66a5daeda1/regex-2025.7.34-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:524c868ba527eab4e8744a9287809579f54ae8c62fbf07d62aacd89f6026b282", size = 788038, upload-time = "2025-07-31T00:19:50.794Z" }, + { url = "https://files.pythonhosted.org/packages/3f/7d/aabb467d8f57d8149895d133c88eb809a1a6a0fe262c1d508eb9dfabb6f9/regex-2025.7.34-cp312-cp312-win32.whl", hash = "sha256:d600e58ee6d036081c89696d2bdd55d507498a7180df2e19945c6642fac59588", size = 264417, upload-time = "2025-07-31T00:19:52.292Z" }, + { url = "https://files.pythonhosted.org/packages/3b/39/bd922b55a4fc5ad5c13753274e5b536f5b06ec8eb9747675668491c7ab7a/regex-2025.7.34-cp312-cp312-win_amd64.whl", hash = "sha256:9a9ab52a466a9b4b91564437b36417b76033e8778e5af8f36be835d8cb370d62", size = 275387, upload-time = "2025-07-31T00:19:53.593Z" }, + { url = "https://files.pythonhosted.org/packages/f7/3c/c61d2fdcecb754a40475a3d1ef9a000911d3e3fc75c096acf44b0dfb786a/regex-2025.7.34-cp312-cp312-win_arm64.whl", hash = "sha256:c83aec91af9c6fbf7c743274fd952272403ad9a9db05fe9bfc9df8d12b45f176", size = 268482, upload-time = "2025-07-31T00:19:55.183Z" }, + { url = "https://files.pythonhosted.org/packages/15/16/b709b2119975035169a25aa8e4940ca177b1a2e25e14f8d996d09130368e/regex-2025.7.34-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c3c9740a77aeef3f5e3aaab92403946a8d34437db930a0280e7e81ddcada61f5", size = 485334, upload-time = "2025-07-31T00:19:56.58Z" }, + { url = 
"https://files.pythonhosted.org/packages/94/a6/c09136046be0595f0331bc58a0e5f89c2d324cf734e0b0ec53cf4b12a636/regex-2025.7.34-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:69ed3bc611540f2ea70a4080f853741ec698be556b1df404599f8724690edbcd", size = 289942, upload-time = "2025-07-31T00:19:57.943Z" }, + { url = "https://files.pythonhosted.org/packages/36/91/08fc0fd0f40bdfb0e0df4134ee37cfb16e66a1044ac56d36911fd01c69d2/regex-2025.7.34-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d03c6f9dcd562c56527c42b8530aad93193e0b3254a588be1f2ed378cdfdea1b", size = 285991, upload-time = "2025-07-31T00:19:59.837Z" }, + { url = "https://files.pythonhosted.org/packages/be/2f/99dc8f6f756606f0c214d14c7b6c17270b6bbe26d5c1f05cde9dbb1c551f/regex-2025.7.34-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6164b1d99dee1dfad33f301f174d8139d4368a9fb50bf0a3603b2eaf579963ad", size = 797415, upload-time = "2025-07-31T00:20:01.668Z" }, + { url = "https://files.pythonhosted.org/packages/62/cf/2fcdca1110495458ba4e95c52ce73b361cf1cafd8a53b5c31542cde9a15b/regex-2025.7.34-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1e4f4f62599b8142362f164ce776f19d79bdd21273e86920a7b604a4275b4f59", size = 862487, upload-time = "2025-07-31T00:20:03.142Z" }, + { url = "https://files.pythonhosted.org/packages/90/38/899105dd27fed394e3fae45607c1983e138273ec167e47882fc401f112b9/regex-2025.7.34-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:72a26dcc6a59c057b292f39d41465d8233a10fd69121fa24f8f43ec6294e5415", size = 910717, upload-time = "2025-07-31T00:20:04.727Z" }, + { url = "https://files.pythonhosted.org/packages/ee/f6/4716198dbd0bcc9c45625ac4c81a435d1c4d8ad662e8576dac06bab35b17/regex-2025.7.34-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5273fddf7a3e602695c92716c420c377599ed3c853ea669c1fe26218867002f", size = 801943, upload-time = "2025-07-31T00:20:07.1Z" }, + { url = "https://files.pythonhosted.org/packages/40/5d/cff8896d27e4e3dd11dd72ac78797c7987eb50fe4debc2c0f2f1682eb06d/regex-2025.7.34-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c1844be23cd40135b3a5a4dd298e1e0c0cb36757364dd6cdc6025770363e06c1", size = 786664, upload-time = "2025-07-31T00:20:08.818Z" }, + { url = "https://files.pythonhosted.org/packages/10/29/758bf83cf7b4c34f07ac3423ea03cee3eb3176941641e4ccc05620f6c0b8/regex-2025.7.34-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dde35e2afbbe2272f8abee3b9fe6772d9b5a07d82607b5788e8508974059925c", size = 856457, upload-time = "2025-07-31T00:20:10.328Z" }, + { url = "https://files.pythonhosted.org/packages/d7/30/c19d212b619963c5b460bfed0ea69a092c6a43cba52a973d46c27b3e2975/regex-2025.7.34-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f3f6e8e7af516a7549412ce57613e859c3be27d55341a894aacaa11703a4c31a", size = 849008, upload-time = "2025-07-31T00:20:11.823Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b8/3c35da3b12c87e3cc00010ef6c3a4ae787cff0bc381aa3d251def219969a/regex-2025.7.34-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:469142fb94a869beb25b5f18ea87646d21def10fbacb0bcb749224f3509476f0", size = 788101, upload-time = "2025-07-31T00:20:13.729Z" }, + { url = "https://files.pythonhosted.org/packages/47/80/2f46677c0b3c2b723b2c358d19f9346e714113865da0f5f736ca1a883bde/regex-2025.7.34-cp313-cp313-win32.whl", hash = "sha256:da7507d083ee33ccea1310447410c27ca11fb9ef18c95899ca57ff60a7e4d8f1", size = 
264401, upload-time = "2025-07-31T00:20:15.233Z" }, + { url = "https://files.pythonhosted.org/packages/be/fa/917d64dd074682606a003cba33585c28138c77d848ef72fc77cbb1183849/regex-2025.7.34-cp313-cp313-win_amd64.whl", hash = "sha256:9d644de5520441e5f7e2db63aec2748948cc39ed4d7a87fd5db578ea4043d997", size = 275368, upload-time = "2025-07-31T00:20:16.711Z" }, + { url = "https://files.pythonhosted.org/packages/65/cd/f94383666704170a2154a5df7b16be28f0c27a266bffcd843e58bc84120f/regex-2025.7.34-cp313-cp313-win_arm64.whl", hash = "sha256:7bf1c5503a9f2cbd2f52d7e260acb3131b07b6273c470abb78568174fe6bde3f", size = 268482, upload-time = "2025-07-31T00:20:18.189Z" }, + { url = "https://files.pythonhosted.org/packages/ac/23/6376f3a23cf2f3c00514b1cdd8c990afb4dfbac3cb4a68b633c6b7e2e307/regex-2025.7.34-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:8283afe7042d8270cecf27cca558873168e771183d4d593e3c5fe5f12402212a", size = 485385, upload-time = "2025-07-31T00:20:19.692Z" }, + { url = "https://files.pythonhosted.org/packages/73/5b/6d4d3a0b4d312adbfd6d5694c8dddcf1396708976dd87e4d00af439d962b/regex-2025.7.34-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6c053f9647e3421dd2f5dff8172eb7b4eec129df9d1d2f7133a4386319b47435", size = 289788, upload-time = "2025-07-31T00:20:21.941Z" }, + { url = "https://files.pythonhosted.org/packages/92/71/5862ac9913746e5054d01cb9fb8125b3d0802c0706ef547cae1e7f4428fa/regex-2025.7.34-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a16dd56bbcb7d10e62861c3cd000290ddff28ea142ffb5eb3470f183628011ac", size = 286136, upload-time = "2025-07-31T00:20:26.146Z" }, + { url = "https://files.pythonhosted.org/packages/27/df/5b505dc447eb71278eba10d5ec940769ca89c1af70f0468bfbcb98035dc2/regex-2025.7.34-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69c593ff5a24c0d5c1112b0df9b09eae42b33c014bdca7022d6523b210b69f72", size = 797753, upload-time = "2025-07-31T00:20:27.919Z" }, + { url = "https://files.pythonhosted.org/packages/86/38/3e3dc953d13998fa047e9a2414b556201dbd7147034fbac129392363253b/regex-2025.7.34-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98d0ce170fcde1a03b5df19c5650db22ab58af375aaa6ff07978a85c9f250f0e", size = 863263, upload-time = "2025-07-31T00:20:29.803Z" }, + { url = "https://files.pythonhosted.org/packages/68/e5/3ff66b29dde12f5b874dda2d9dec7245c2051f2528d8c2a797901497f140/regex-2025.7.34-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d72765a4bff8c43711d5b0f5b452991a9947853dfa471972169b3cc0ba1d0751", size = 910103, upload-time = "2025-07-31T00:20:31.313Z" }, + { url = "https://files.pythonhosted.org/packages/9e/fe/14176f2182125977fba3711adea73f472a11f3f9288c1317c59cd16ad5e6/regex-2025.7.34-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4494f8fd95a77eb434039ad8460e64d57baa0434f1395b7da44015bef650d0e4", size = 801709, upload-time = "2025-07-31T00:20:33.323Z" }, + { url = "https://files.pythonhosted.org/packages/5a/0d/80d4e66ed24f1ba876a9e8e31b709f9fd22d5c266bf5f3ab3c1afe683d7d/regex-2025.7.34-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4f42b522259c66e918a0121a12429b2abcf696c6f967fa37bdc7b72e61469f98", size = 786726, upload-time = "2025-07-31T00:20:35.252Z" }, + { url = "https://files.pythonhosted.org/packages/12/75/c3ebb30e04a56c046f5c85179dc173818551037daae2c0c940c7b19152cb/regex-2025.7.34-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = 
"sha256:aaef1f056d96a0a5d53ad47d019d5b4c66fe4be2da87016e0d43b7242599ffc7", size = 857306, upload-time = "2025-07-31T00:20:37.12Z" }, + { url = "https://files.pythonhosted.org/packages/b1/b2/a4dc5d8b14f90924f27f0ac4c4c4f5e195b723be98adecc884f6716614b6/regex-2025.7.34-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:656433e5b7dccc9bc0da6312da8eb897b81f5e560321ec413500e5367fcd5d47", size = 848494, upload-time = "2025-07-31T00:20:38.818Z" }, + { url = "https://files.pythonhosted.org/packages/0d/21/9ac6e07a4c5e8646a90b56b61f7e9dac11ae0747c857f91d3d2bc7c241d9/regex-2025.7.34-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e91eb2c62c39705e17b4d42d4b86c4e86c884c0d15d9c5a47d0835f8387add8e", size = 787850, upload-time = "2025-07-31T00:20:40.478Z" }, + { url = "https://files.pythonhosted.org/packages/be/6c/d51204e28e7bc54f9a03bb799b04730d7e54ff2718862b8d4e09e7110a6a/regex-2025.7.34-cp314-cp314-win32.whl", hash = "sha256:f978ddfb6216028c8f1d6b0f7ef779949498b64117fc35a939022f67f810bdcb", size = 269730, upload-time = "2025-07-31T00:20:42.253Z" }, + { url = "https://files.pythonhosted.org/packages/74/52/a7e92d02fa1fdef59d113098cb9f02c5d03289a0e9f9e5d4d6acccd10677/regex-2025.7.34-cp314-cp314-win_amd64.whl", hash = "sha256:4b7dc33b9b48fb37ead12ffc7bdb846ac72f99a80373c4da48f64b373a7abeae", size = 278640, upload-time = "2025-07-31T00:20:44.42Z" }, + { url = "https://files.pythonhosted.org/packages/d1/78/a815529b559b1771080faa90c3ab401730661f99d495ab0071649f139ebd/regex-2025.7.34-cp314-cp314-win_arm64.whl", hash = "sha256:4b8c4d39f451e64809912c82392933d80fe2e4a87eeef8859fcc5380d0173c64", size = 271757, upload-time = "2025-07-31T00:20:46.355Z" }, +] + [[package]] name = "requests" version = "2.32.3" @@ -2802,6 +3253,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6d/4f/d073e09df851cfa251ef7840007d04db3293a0482ce607d2b993926089be/s3transfer-0.13.1-py3-none-any.whl", hash = "sha256:a981aa7429be23fe6dfc13e80e4020057cbab622b08c0315288758d67cabc724", size = 85308, upload-time = "2025-07-18T19:22:40.947Z" }, ] +[[package]] +name = "safetensors" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, + { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, + { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, + { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, + { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, + { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, + { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, + { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, + { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, + { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, + { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, +] + +[[package]] +name = 
"selection-pipeline" +version = "0.1.0" +source = { git = "https://github.com/enigma-brain/selection_pipeline#dcae2860c0bd59ab612d9b4a434c6a817a767255" } +dependencies = [ + { name = "decord" }, + { name = "einops" }, + { name = "hydra-core" }, + { name = "opencv-python" }, + { name = "ray", extra = ["default"] }, + { name = "torch" }, + { name = "torchcodec" }, + { name = "torchvision" }, + { name = "transformers" }, +] + [[package]] name = "setuptools" version = "80.9.0" @@ -2867,6 +3356,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/96/7c/a81ef5ef10978dd073a854e0fa93b5d8021d0594b639cc8f6453c3c78a1d/strictyaml-1.7.3-py3-none-any.whl", hash = "sha256:fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7", size = 123917, upload-time = "2023-03-10T12:50:17.242Z" }, ] +[[package]] +name = "sympy" +version = "1.13.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ca/99/5a5b6f19ff9f083671ddf7b9632028436167cd3d33e11015754e41b249a4/sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f", size = 7533040, upload-time = "2024-07-19T09:26:51.238Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b2/fe/81695a1aa331a842b582453b605175f419fe8540355886031328089d840a/sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8", size = 6189177, upload-time = "2024-07-19T09:26:48.863Z" }, +] + [[package]] name = "tenacity" version = "9.1.2" @@ -2876,6 +3377,102 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, ] +[[package]] +name = "tokenizers" +version = "0.21.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/2f/402986d0823f8d7ca139d969af2917fefaa9b947d1fb32f6168c509f2492/tokenizers-0.21.4.tar.gz", hash = "sha256:fa23f85fbc9a02ec5c6978da172cdcbac23498c3ca9f3645c5c68740ac007880", size = 351253, upload-time = "2025-07-28T15:48:54.325Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/c6/fdb6f72bf6454f52eb4a2510be7fb0f614e541a2554d6210e370d85efff4/tokenizers-0.21.4-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:2ccc10a7c3bcefe0f242867dc914fc1226ee44321eb618cfe3019b5df3400133", size = 2863987, upload-time = "2025-07-28T15:48:44.877Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a6/28975479e35ddc751dc1ddc97b9b69bf7fcf074db31548aab37f8116674c/tokenizers-0.21.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:5e2f601a8e0cd5be5cc7506b20a79112370b9b3e9cb5f13f68ab11acd6ca7d60", size = 2732457, upload-time = "2025-07-28T15:48:43.265Z" }, + { url = "https://files.pythonhosted.org/packages/aa/8f/24f39d7b5c726b7b0be95dca04f344df278a3fe3a4deb15a975d194cbb32/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:39b376f5a1aee67b4d29032ee85511bbd1b99007ec735f7f35c8a2eb104eade5", size = 3012624, upload-time = "2025-07-28T13:22:43.895Z" }, + { url = "https://files.pythonhosted.org/packages/58/47/26358925717687a58cb74d7a508de96649544fad5778f0cd9827398dc499/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:2107ad649e2cda4488d41dfd031469e9da3fcbfd6183e74e4958fa729ffbf9c6", size = 2939681, upload-time = "2025-07-28T13:22:47.499Z" }, + { url = "https://files.pythonhosted.org/packages/99/6f/cc300fea5db2ab5ddc2c8aea5757a27b89c84469899710c3aeddc1d39801/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c73012da95afafdf235ba80047699df4384fdc481527448a078ffd00e45a7d9", size = 3247445, upload-time = "2025-07-28T15:48:39.711Z" }, + { url = "https://files.pythonhosted.org/packages/be/bf/98cb4b9c3c4afd8be89cfa6423704337dc20b73eb4180397a6e0d456c334/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f23186c40395fc390d27f519679a58023f368a0aad234af145e0f39ad1212732", size = 3428014, upload-time = "2025-07-28T13:22:49.569Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/96c1cc780e6ca7f01a57c13235dd05b7bc1c0f3588512ebe9d1331b5f5ae/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc88bb34e23a54cc42713d6d98af5f1bf79c07653d24fe984d2d695ba2c922a2", size = 3193197, upload-time = "2025-07-28T13:22:51.471Z" }, + { url = "https://files.pythonhosted.org/packages/f2/90/273b6c7ec78af547694eddeea9e05de771278bd20476525ab930cecaf7d8/tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51b7eabb104f46c1c50b486520555715457ae833d5aee9ff6ae853d1130506ff", size = 3115426, upload-time = "2025-07-28T15:48:41.439Z" }, + { url = "https://files.pythonhosted.org/packages/91/43/c640d5a07e95f1cf9d2c92501f20a25f179ac53a4f71e1489a3dcfcc67ee/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:714b05b2e1af1288bd1bc56ce496c4cebb64a20d158ee802887757791191e6e2", size = 9089127, upload-time = "2025-07-28T15:48:46.472Z" }, + { url = "https://files.pythonhosted.org/packages/44/a1/dd23edd6271d4dca788e5200a807b49ec3e6987815cd9d0a07ad9c96c7c2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:1340ff877ceedfa937544b7d79f5b7becf33a4cfb58f89b3b49927004ef66f78", size = 9055243, upload-time = "2025-07-28T15:48:48.539Z" }, + { url = "https://files.pythonhosted.org/packages/21/2b/b410d6e9021c4b7ddb57248304dc817c4d4970b73b6ee343674914701197/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3c1f4317576e465ac9ef0d165b247825a2a4078bcd01cba6b54b867bdf9fdd8b", size = 9298237, upload-time = "2025-07-28T15:48:50.443Z" }, + { url = "https://files.pythonhosted.org/packages/b7/0a/42348c995c67e2e6e5c89ffb9cfd68507cbaeb84ff39c49ee6e0a6dd0fd2/tokenizers-0.21.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:c212aa4e45ec0bb5274b16b6f31dd3f1c41944025c2358faaa5782c754e84c24", size = 9461980, upload-time = "2025-07-28T15:48:52.325Z" }, + { url = "https://files.pythonhosted.org/packages/3d/d3/dacccd834404cd71b5c334882f3ba40331ad2120e69ded32cf5fda9a7436/tokenizers-0.21.4-cp39-abi3-win32.whl", hash = "sha256:6c42a930bc5f4c47f4ea775c91de47d27910881902b0f20e4990ebe045a415d0", size = 2329871, upload-time = "2025-07-28T15:48:56.841Z" }, + { url = "https://files.pythonhosted.org/packages/41/f2/fd673d979185f5dcbac4be7d09461cbb99751554ffb6718d0013af8604cb/tokenizers-0.21.4-cp39-abi3-win_amd64.whl", hash = "sha256:475d807a5c3eb72c59ad9b5fcdb254f6e17f53dfcbb9903233b0dfa9c943b597", size = 2507568, upload-time = "2025-07-28T15:48:55.456Z" }, +] + +[[package]] +name = "torch" +version = "2.6.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "fsspec" }, + { name = "jinja2" }, + { name = 
"networkx" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "setuptools" }, + { name = "sympy" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "typing-extensions" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563, upload-time = "2025-01-29T16:23:19.084Z" }, + { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867, upload-time = "2025-01-29T16:25:55.649Z" }, + { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469, upload-time = "2025-01-29T16:24:01.821Z" }, + { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538, upload-time = "2025-01-29T16:24:18.976Z" }, + { url = "https://files.pythonhosted.org/packages/24/85/ead1349fc30fe5a32cadd947c91bda4a62fbfd7f8c34ee61f6398d38fb48/torch-2.6.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:4874a73507a300a5d089ceaff616a569e7bb7c613c56f37f63ec3ffac65259cf", size = 766626191, upload-time = "2025-01-29T16:17:26.26Z" }, + { url = "https://files.pythonhosted.org/packages/dd/b0/26f06f9428b250d856f6d512413e9e800b78625f63801cbba13957432036/torch-2.6.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a0d5e1b9874c1a6c25556840ab8920569a7a4137afa8a63a32cee0bc7d89bd4b", size = 95611439, upload-time = "2025-01-29T16:21:21.061Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/9c/fc5224e9770c83faed3a087112d73147cd7c7bfb7557dcf9ad87e1dda163/torch-2.6.0-cp313-cp313-win_amd64.whl", hash = "sha256:510c73251bee9ba02ae1cb6c9d4ee0907b3ce6020e62784e2d7598e0cfa4d6cc", size = 204126475, upload-time = "2025-01-29T16:21:55.394Z" }, + { url = "https://files.pythonhosted.org/packages/88/8b/d60c0491ab63634763be1537ad488694d316ddc4a20eaadd639cedc53971/torch-2.6.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:ff96f4038f8af9f7ec4231710ed4549da1bdebad95923953a25045dcf6fd87e2", size = 66536783, upload-time = "2025-01-29T16:22:08.559Z" }, +] + +[[package]] +name = "torchcodec" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/5b/16c30a9bfd3d056c6c10598d3067a850b46202d43b6984da117fa91d1a25/TorchCodec-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e831722e6723eb9d632bd0e04b0320723985d19e43ef4a1624f5425efc0ef9b", size = 3028185, upload-time = "2025-02-05T13:57:17.796Z" }, + { url = "https://files.pythonhosted.org/packages/9f/0f/ec751e200fe44660139755f637b20836496b3e28a653d4b8f131f26e3e04/TorchCodec-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c9dd97f1b59e02ee8cc1e140650aae04e469e9fb848d9b70af9e2fddfcb9ff3", size = 755291, upload-time = "2025-02-05T13:56:50.016Z" }, + { url = "https://files.pythonhosted.org/packages/32/80/1d948edfe98e5e918b221560527605ea2539c5cbd699db8f3d6183791449/TorchCodec-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:51090be7bb7bf7a45dc65c599b0e864852a97df397bb70fa2a976dd6e9cb5358", size = 2962876, upload-time = "2025-02-05T13:57:19.938Z" }, + { url = "https://files.pythonhosted.org/packages/2c/e9/063c3c6f5596646b29ce4f2d8fac88f71c52f912d58cabd22d778fd6b326/TorchCodec-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32e3b7a291beefe7bcb6c764aa793367594849b5bd7ad0b669495915d22fd5fa", size = 754960, upload-time = "2025-02-05T13:56:51.797Z" }, +] + +[[package]] +name = "torchvision" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "pillow" }, + { name = "torch" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/5b/76ca113a853b19c7b1da761f8a72cb6429b3bd0bf932537d8df4657f47c3/torchvision-0.21.0-1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:ffa2a16499508fe6798323e455f312c7c55f2a88901c9a7c0fb1efa86cf7e327", size = 2329878, upload-time = "2025-03-18T17:25:50.039Z" }, + { url = "https://files.pythonhosted.org/packages/4e/fe/5e193353706dab96fe73ae100d5a633ff635ce310e0d92f3bc2958d075b1/torchvision-0.21.0-1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:7e9e9afa150e40cd2a8f0701c43cb82a8d724f512896455c0918b987f94b84a4", size = 2280711, upload-time = "2025-03-18T17:25:46.012Z" }, + { url = "https://files.pythonhosted.org/packages/6e/1b/28f527b22d5e8800184d0bc847f801ae92c7573a8c15979d92b7091c0751/torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:97a5814a93c793aaf0179cfc7f916024f4b63218929aee977b645633d074a49f", size = 1784140, upload-time = "2025-01-29T16:28:44.694Z" }, + { url = "https://files.pythonhosted.org/packages/36/63/0722e153fd27d64d5b0af45b5c8cb0e80b35a68cf0130303bc9a8bb095c7/torchvision-0.21.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:b578bcad8a4083b40d34f689b19ca9f7c63e511758d806510ea03c29ac568f7b", size = 7238673, upload-time = "2025-01-29T16:28:27.631Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/ea/03541ed901cdc30b934f897060d09bbf7a98466a08ad1680320f9ce0cbe0/torchvision-0.21.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5083a5b1fec2351bf5ea9900a741d54086db75baec4b1d21e39451e00977f1b1", size = 14701186, upload-time = "2025-01-29T16:28:16.491Z" }, + { url = "https://files.pythonhosted.org/packages/4c/6a/c7752603060d076dfed95135b78b047dc71792630cbcb022e3693d6f32ef/torchvision-0.21.0-cp312-cp312-win_amd64.whl", hash = "sha256:6eb75d41e3bbfc2f7642d0abba9383cc9ae6c5a4ca8d6b00628c225e1eaa63b3", size = 1560520, upload-time = "2025-01-29T16:28:42.122Z" }, + { url = "https://files.pythonhosted.org/packages/f9/56/47d456b61c3bbce7bed4af3925c83d405bb87468e659fd3cf3d9840c3b51/torchvision-0.21.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:659b76c86757cb2ee4ca2db245e0740cfc3081fef46f0f1064d11adb4a8cee31", size = 1784141, upload-time = "2025-01-29T16:28:39.01Z" }, + { url = "https://files.pythonhosted.org/packages/cb/4c/99880813aa50e64447fb1c4c6c804a793d2d78f7f7c53e99ddee7fa175fa/torchvision-0.21.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:084ac3f5a1f50c70d630a488d19bf62f323018eae1b1c1232f2b7047d3a7b76d", size = 7238714, upload-time = "2025-01-29T16:28:25.658Z" }, + { url = "https://files.pythonhosted.org/packages/0b/2d/3c3ee10608310a395594aac7da8640372ed79c6585910ccae6919658dcdc/torchvision-0.21.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:5045a3a5f21ec3eea6962fa5f2fa2d4283f854caec25ada493fcf4aab2925467", size = 2281252, upload-time = "2025-01-29T16:28:40.687Z" }, + { url = "https://files.pythonhosted.org/packages/ed/b4/fc60e3bc003879d3de842baea258fffc3586f4b49cd435a5ba1e09c33315/torchvision-0.21.0-cp313-cp313-win_amd64.whl", hash = "sha256:9147f5e096a9270684e3befdee350f3cacafd48e0c54ab195f45790a9c146d67", size = 1560519, upload-time = "2025-01-29T16:28:22.527Z" }, +] + [[package]] name = "tornado" version = "6.5.1" @@ -2916,6 +3513,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] +[[package]] +name = "transformers" +version = "4.55.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/20/a7/f8ef0b1caa022e085a31cd01bc705fb9194558f8a35a5107b3ee5cb640ed/transformers-4.55.3.tar.gz", hash = "sha256:31dca715549f56cb1b591a933d2caf534f948705191e809234a52a2df407c98f", size = 9572448, upload-time = "2025-08-21T09:23:01.46Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f5/be/79ef53e6a65795b40bfc2d11fa54a16abcb173f069e32005b8092c5d5c19/transformers-4.55.3-py3-none-any.whl", hash = "sha256:c85e7feace634541e23b3e34d28aa9492d67974b733237ade9eba7c57c0fd1bd", size = 11269669, upload-time = "2025-08-21T09:22:57.535Z" }, +] + +[[package]] +name = "triton" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, + { url = "https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278, upload-time = "2025-01-22T19:13:54.221Z" }, +] + [[package]] name = "typing-extensions" version = "4.14.0" @@ -3002,6 +3629,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166, upload-time = "2024-01-06T02:10:55.763Z" }, ] +[[package]] +name = "werkzeug" +version = "3.1.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9f/69/83029f1f6300c5fb2471d621ab06f6ec6b3324685a2ce0f9777fd4a8b71e/werkzeug-3.1.3.tar.gz", hash = "sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746", size = 806925, upload-time = "2024-11-08T15:52:18.093Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/24/ab44c871b0f07f491e5d2ad12c9bd7358e527510618cb1b803a88e986db1/werkzeug-3.1.3-py3-none-any.whl", hash = "sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e", size = 224498, upload-time = "2024-11-08T15:52:16.132Z" }, +] + [[package]] name = "widgetsnbextension" version = "4.0.14" From bea864d3227e23e560829d9f9f16b6c2177d1f38 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 13:47:44 -0700 Subject: [PATCH 193/224] feat: add baseclass for context awareness --- pyproject.toml | 4 +- src/orcapod/data/base.py | 42 +- src/orcapod/data/sources/__init__.py | 3 + src/orcapod/data/sources/base.py | 214 ++++++++++ uv.lock | 558 +++++++++++++++++++++++++-- 5 files changed, 777 insertions(+), 44 deletions(-) create mode 100644 src/orcapod/data/sources/__init__.py create mode 100644 src/orcapod/data/sources/base.py diff --git a/pyproject.toml b/pyproject.toml index 8bef0b1..af8ed52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "selection-pipeline", ] readme = "README.md" -requires-python = ">=3.12.0" +requires-python = ">=3.11.0" license = { text = "MIT License" } classifiers = [ "Programming Language :: Python :: 3", @@ -90,4 +90,4 @@ redis = { features = ["redis"], solve-group = "default" } python = ">=3.12" [tool.uv.sources] -selection-pipeline = { git = "https://github.com/enigma-brain/selection_pipeline" } +selection-pipeline = { git = "https://github.com/enigma-brain/selection_pipeline" } #, rev = "6218290" } diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index 7b3388a..e1fe272 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -54,7 +54,24 @@ def computed_label(self) -> str | None: return None -class ContentIdentifiableBase(ABC): +class ContextAwareBase(ABC): + def __init__( + self, data_context: str | contexts.DataContext | None = None, **kwargs + ): + super().__init__(**kwargs) + self._data_context = contexts.resolve_context(data_context) + + @property + def data_context(self) -> contexts.DataContext: + return self._data_context + + @property + def data_context_key(self) -> str: + """Return the data context key.""" + return 
self._data_context.context_key + + +class ContentIdentifiableBase(ContextAwareBase): """ Base class for content-identifiable objects. This class provides a way to define objects that can be uniquely identified @@ -65,9 +82,7 @@ class ContentIdentifiableBase(ABC): Two content-identifiable objects are considered equal if their `identity_structure` returns the same value. """ - def __init__( - self, data_context: str | contexts.DataContext | None = None, **kwargs - ) -> None: + def __init__(self, **kwargs) -> None: """ Initialize the ContentHashable with an optional ObjectHasher. @@ -75,9 +90,8 @@ def __init__( identity_structure_hasher (ObjectHasher | None): An instance of ObjectHasher to use for hashing. """ super().__init__(**kwargs) - self._data_context = contexts.resolve_context(data_context) - self._content_hash: hp.ContentHash | None = None - self._int_hash: int | None = None + self._cached_content_hash: hp.ContentHash | None = None + self._cached_int_hash: int | None = None def identity_structure(self) -> Any: """ @@ -100,13 +114,13 @@ def content_hash(self) -> hp.ContentHash: bytes: A byte representation of the hash based on the content. If no identity structure is provided, return None. """ - if self._content_hash is None: + if self._cached_content_hash is None: structure = self.identity_structure() processed_structure = process_structure(structure) - self._content_hash = self._data_context.object_hasher.hash_object( + self._cached_content_hash = self._data_context.object_hasher.hash_object( processed_structure ) - return self._content_hash + return self._cached_content_hash def __hash__(self) -> int: """ @@ -117,16 +131,16 @@ def __hash__(self) -> int: int: A hash value based on either content or identity """ # Get the identity structure - if self._int_hash is None: + if self._cached_int_hash is None: structure = self.identity_structure() if structure is None: # If no identity structure is provided, use the default hash - self._int_hash = super().__hash__() + self._cached_int_hash = super().__hash__() else: - self._int_hash = self._data_context.object_hasher.hash_object( + self._cached_int_hash = self._data_context.object_hasher.hash_object( structure ).to_int() - return self._int_hash + return self._cached_int_hash def __eq__(self, other: object) -> bool: """ diff --git a/src/orcapod/data/sources/__init__.py b/src/orcapod/data/sources/__init__.py new file mode 100644 index 0000000..51b7a56 --- /dev/null +++ b/src/orcapod/data/sources/__init__.py @@ -0,0 +1,3 @@ +from .base import SourceBase + +__all__ = ["SourceBase"] diff --git a/src/orcapod/data/sources/base.py b/src/orcapod/data/sources/base.py new file mode 100644 index 0000000..3ef6fa9 --- /dev/null +++ b/src/orcapod/data/sources/base.py @@ -0,0 +1,214 @@ +from collections.abc import Collection, Iterator +from typing import TYPE_CHECKING + + +from orcapod.data.kernels import TrackedKernelBase +from orcapod.data.streams import ( + KernelStream, + StatefulStreamBase, +) +from orcapod.protocols import data_protocols as dp +from orcapod.types import TypeSpec +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class SourceBase(TrackedKernelBase, StatefulStreamBase): + """ + Base class for sources that act as both Kernels and LiveStreams. + + Design Philosophy: + 1. Source is fundamentally a Kernel (data loader) + 2. forward() returns static snapshots as a stream (pure computation) + 3. __call__() returns a cached KernelStream (live, tracked) + 4. 
All stream methods delegate to the cached KernelStream + + This ensures that direct source iteration and source() iteration + are identical and both benefit from KernelStream's lifecycle management. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # Cache the KernelStream for reuse across all stream method calls + self._cached_kernel_stream: KernelStream | None = None + + # =========================== Kernel Methods =========================== + + # The following are inherited from TrackedKernelBase as abstract methods. + # @abstractmethod + # def forward(self, *streams: dp.Stream) -> dp.Stream: + # """ + # Pure computation: return a static snapshot of the data. + + # This is the core method that subclasses must implement. + # Each call should return a fresh stream representing the current state of the data. + # This is what KernelStream calls when it needs to refresh its data. + # """ + # ... + + # @abstractmethod + # def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + # """Return the tag and packet types this source produces.""" + # ... + + # @abstractmethod + # def kernel_identity_structure( + # self, streams: Collection[dp.Stream] | None = None + # ) -> dp.Any: ... + + def validate_inputs(self, *streams: dp.Stream) -> None: + """Sources take no input streams.""" + if len(streams) > 0: + raise ValueError( + f"{self.__class__.__name__} is a source and takes no input streams" + ) + + def prepare_output_stream( + self, *streams: dp.Stream, label: str | None = None + ) -> KernelStream: + if self._cached_kernel_stream is None: + self._cached_kernel_stream = super().prepare_output_stream( + *streams, label=label + ) + return self._cached_kernel_stream + + def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: + if not self._skip_tracking and self._tracker_manager is not None: + self._tracker_manager.record_source_invocation(self, label=label) + + # ==================== Stream Protocol (Delegation) ==================== + + @property + def source(self) -> dp.Kernel | None: + """Sources are their own source.""" + return self + + @property + def upstreams(self) -> tuple[dp.Stream, ...]: + """Sources have no upstream dependencies.""" + return () + + def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + """Delegate to the cached KernelStream.""" + return self().keys() + + def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + """Delegate to the cached KernelStream.""" + return self().types(include_system_tags=include_system_tags) + + @property + def last_modified(self): + """Delegate to the cached KernelStream.""" + return self().last_modified + + @property + def is_current(self) -> bool: + """Delegate to the cached KernelStream.""" + return self().is_current + + def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """ + Iterate over the cached KernelStream. + + This allows direct iteration over the source as if it were a stream. 
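+
+        Example (illustrative sketch only; ``CsvSource`` is a hypothetical
+        concrete subclass that implements ``forward()`` and the kernel type
+        methods, and ``process`` is a user-supplied callable):
+
+            source = CsvSource("data/table.csv")
+            # Direct iteration and iterating source() are equivalent;
+            # both delegate to the cached KernelStream.
+            for tag, packet in source:
+                process(tag, packet)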
+ """ + return self().iter_packets() + + def iter_packets( + self, + execution_engine: dp.ExecutionEngine | None = None, + ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + """Delegate to the cached KernelStream.""" + return self().iter_packets(execution_engine=execution_engine) + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: dp.ExecutionEngine | None = None, + ) -> "pa.Table": + """Delegate to the cached KernelStream.""" + return self().as_table( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + + def flow( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> Collection[tuple[dp.Tag, dp.Packet]]: + """Delegate to the cached KernelStream.""" + return self().flow(execution_engine=execution_engine) + + def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + """ + Run the source node, executing the contained source. + + This is a no-op for sources since they are not executed like pods. + """ + self().run(execution_engine=execution_engine) + + async def run_async( + self, execution_engine: dp.ExecutionEngine | None = None + ) -> None: + """ + Run the source node asynchronously, executing the contained source. + + This is a no-op for sources since they are not executed like pods. + """ + await self().run_async(execution_engine=execution_engine) + + # ==================== LiveStream Protocol (Delegation) ==================== + + def refresh(self, force: bool = False) -> bool: + """Delegate to the cached KernelStream.""" + return self().refresh(force=force) + + def invalidate(self) -> None: + """Delegate to the cached KernelStream.""" + return self().invalidate() + + # ==================== Source Protocol ==================== + + @property + def tag_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + tag_keys, _ = self.keys() + return tag_keys + + @property + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + # TODO: consider caching this + _, packet_keys = self.keys() + return packet_keys + + def reset_cache(self) -> None: + """ + Clear the cached KernelStream, forcing a fresh one on next access. + + Useful when the underlying data source has fundamentally changed + (e.g., file path changed, database connection reset). 
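+
+        Usage sketch (illustrative only; ``CsvSource`` and its ``path``
+        attribute are hypothetical stand-ins for any concrete subclass):
+
+            source = CsvSource("data/v1.csv")
+            table_v1 = source.as_table()
+            source.path = "data/v2.csv"   # underlying data location changed
+            source.reset_cache()          # drop the stale cached KernelStream
+            table_v2 = source.as_table()  # next access builds a fresh stream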
+ """ + if self._cached_kernel_stream is not None: + self._cached_kernel_stream.invalidate() + self._cached_kernel_stream = None + + +# ==================== Example Implementation ==================== diff --git a/uv.lock b/uv.lock index 5cafb27..ea9b66b 100644 --- a/uv.lock +++ b/uv.lock @@ -1,10 +1,19 @@ version = 1 revision = 2 -requires-python = ">=3.12.0" +requires-python = ">=3.11.0" resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version < '3.13'", + "python_full_version >= '3.14' and sys_platform == 'darwin'", + "python_full_version == '3.13.*' and sys_platform == 'darwin'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", + "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 'linux')", ] [[package]] @@ -66,6 +75,23 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/9b/e7/d92a237d8802ca88483906c388f7c201bbe96cd80a165ffd0ac2f6a8d59f/aiohttp-3.12.15.tar.gz", hash = "sha256:4fc61385e9c98d72fcdf47e6dd81833f47b2f77c114c29cd64a361be57a763a2", size = 7823716, upload-time = "2025-07-29T05:52:32.215Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/20/19/9e86722ec8e835959bd97ce8c1efa78cf361fa4531fca372551abcc9cdd6/aiohttp-3.12.15-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d3ce17ce0220383a0f9ea07175eeaa6aa13ae5a41f30bc61d84df17f0e9b1117", size = 711246, upload-time = "2025-07-29T05:50:15.937Z" }, + { url = "https://files.pythonhosted.org/packages/71/f9/0a31fcb1a7d4629ac9d8f01f1cb9242e2f9943f47f5d03215af91c3c1a26/aiohttp-3.12.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:010cc9bbd06db80fe234d9003f67e97a10fe003bfbedb40da7d71c1008eda0fe", size = 483515, upload-time = "2025-07-29T05:50:17.442Z" }, + { url = "https://files.pythonhosted.org/packages/62/6c/94846f576f1d11df0c2e41d3001000527c0fdf63fce7e69b3927a731325d/aiohttp-3.12.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f9d7c55b41ed687b9d7165b17672340187f87a773c98236c987f08c858145a9", size = 471776, upload-time = "2025-07-29T05:50:19.568Z" }, + { url = "https://files.pythonhosted.org/packages/f8/6c/f766d0aaafcee0447fad0328da780d344489c042e25cd58fde566bf40aed/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc4fbc61bb3548d3b482f9ac7ddd0f18c67e4225aaa4e8552b9f1ac7e6bda9e5", size = 1741977, upload-time = 
"2025-07-29T05:50:21.665Z" }, + { url = "https://files.pythonhosted.org/packages/17/e5/fb779a05ba6ff44d7bc1e9d24c644e876bfff5abe5454f7b854cace1b9cc/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7fbc8a7c410bb3ad5d595bb7118147dfbb6449d862cc1125cf8867cb337e8728", size = 1690645, upload-time = "2025-07-29T05:50:23.333Z" }, + { url = "https://files.pythonhosted.org/packages/37/4e/a22e799c2035f5d6a4ad2cf8e7c1d1bd0923192871dd6e367dafb158b14c/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:74dad41b3458dbb0511e760fb355bb0b6689e0630de8a22b1b62a98777136e16", size = 1789437, upload-time = "2025-07-29T05:50:25.007Z" }, + { url = "https://files.pythonhosted.org/packages/28/e5/55a33b991f6433569babb56018b2fb8fb9146424f8b3a0c8ecca80556762/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b6f0af863cf17e6222b1735a756d664159e58855da99cfe965134a3ff63b0b0", size = 1828482, upload-time = "2025-07-29T05:50:26.693Z" }, + { url = "https://files.pythonhosted.org/packages/c6/82/1ddf0ea4f2f3afe79dffed5e8a246737cff6cbe781887a6a170299e33204/aiohttp-3.12.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b5b7fe4972d48a4da367043b8e023fb70a04d1490aa7d68800e465d1b97e493b", size = 1730944, upload-time = "2025-07-29T05:50:28.382Z" }, + { url = "https://files.pythonhosted.org/packages/1b/96/784c785674117b4cb3877522a177ba1b5e4db9ce0fd519430b5de76eec90/aiohttp-3.12.15-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6443cca89553b7a5485331bc9bedb2342b08d073fa10b8c7d1c60579c4a7b9bd", size = 1668020, upload-time = "2025-07-29T05:50:30.032Z" }, + { url = "https://files.pythonhosted.org/packages/12/8a/8b75f203ea7e5c21c0920d84dd24a5c0e971fe1e9b9ebbf29ae7e8e39790/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c5f40ec615e5264f44b4282ee27628cea221fcad52f27405b80abb346d9f3f8", size = 1716292, upload-time = "2025-07-29T05:50:31.983Z" }, + { url = "https://files.pythonhosted.org/packages/47/0b/a1451543475bb6b86a5cfc27861e52b14085ae232896a2654ff1231c0992/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:2abbb216a1d3a2fe86dbd2edce20cdc5e9ad0be6378455b05ec7f77361b3ab50", size = 1711451, upload-time = "2025-07-29T05:50:33.989Z" }, + { url = "https://files.pythonhosted.org/packages/55/fd/793a23a197cc2f0d29188805cfc93aa613407f07e5f9da5cd1366afd9d7c/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:db71ce547012a5420a39c1b744d485cfb823564d01d5d20805977f5ea1345676", size = 1691634, upload-time = "2025-07-29T05:50:35.846Z" }, + { url = "https://files.pythonhosted.org/packages/ca/bf/23a335a6670b5f5dfc6d268328e55a22651b440fca341a64fccf1eada0c6/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ced339d7c9b5030abad5854aa5413a77565e5b6e6248ff927d3e174baf3badf7", size = 1785238, upload-time = "2025-07-29T05:50:37.597Z" }, + { url = "https://files.pythonhosted.org/packages/57/4f/ed60a591839a9d85d40694aba5cef86dde9ee51ce6cca0bb30d6eb1581e7/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:7c7dd29c7b5bda137464dc9bfc738d7ceea46ff70309859ffde8c022e9b08ba7", size = 1805701, upload-time = "2025-07-29T05:50:39.591Z" }, + { url = "https://files.pythonhosted.org/packages/85/e0/444747a9455c5de188c0f4a0173ee701e2e325d4b2550e9af84abb20cdba/aiohttp-3.12.15-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:421da6fd326460517873274875c6c5a18ff225b40da2616083c5a34a7570b685", size = 1718758, upload-time = "2025-07-29T05:50:41.292Z" }, + { url = "https://files.pythonhosted.org/packages/36/ab/1006278d1ffd13a698e5dd4bfa01e5878f6bddefc296c8b62649753ff249/aiohttp-3.12.15-cp311-cp311-win32.whl", hash = "sha256:4420cf9d179ec8dfe4be10e7d0fe47d6d606485512ea2265b0d8c5113372771b", size = 428868, upload-time = "2025-07-29T05:50:43.063Z" }, + { url = "https://files.pythonhosted.org/packages/10/97/ad2b18700708452400278039272032170246a1bf8ec5d832772372c71f1a/aiohttp-3.12.15-cp311-cp311-win_amd64.whl", hash = "sha256:edd533a07da85baa4b423ee8839e3e91681c7bfa19b04260a469ee94b778bf6d", size = 453273, upload-time = "2025-07-29T05:50:44.613Z" }, { url = "https://files.pythonhosted.org/packages/63/97/77cb2450d9b35f517d6cf506256bf4f5bda3f93a66b4ad64ba7fc917899c/aiohttp-3.12.15-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:802d3868f5776e28f7bf69d349c26fc0efadb81676d0afa88ed00d98a26340b7", size = 702333, upload-time = "2025-07-29T05:50:46.507Z" }, { url = "https://files.pythonhosted.org/packages/83/6d/0544e6b08b748682c30b9f65640d006e51f90763b41d7c546693bc22900d/aiohttp-3.12.15-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2800614cd560287be05e33a679638e586a2d7401f4ddf99e304d98878c29444", size = 476948, upload-time = "2025-07-29T05:50:48.067Z" }, { url = "https://files.pythonhosted.org/packages/3a/1d/c8c40e611e5094330284b1aea8a4b02ca0858f8458614fa35754cab42b9c/aiohttp-3.12.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8466151554b593909d30a0a125d638b4e5f3836e5aecde85b66b80ded1cb5b0d", size = 469787, upload-time = "2025-07-29T05:50:49.669Z" }, @@ -187,7 +213,9 @@ name = "argon2-cffi-bindings" version = "21.2.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14'", + "python_full_version >= '3.14' and sys_platform == 'darwin'", + "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version >= '3.14' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux')", ] dependencies = [ { name = "cffi", marker = "python_full_version >= '3.14'" }, @@ -211,8 +239,15 @@ name = "argon2-cffi-bindings" version = "25.1.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.13.*'", - "python_full_version < '3.13'", + "python_full_version == '3.13.*' and sys_platform == 'darwin'", + "python_full_version == '3.12.*' and sys_platform == 'darwin'", + "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux')", + "python_full_version < '3.12' and sys_platform == 'darwin'", + "python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux'", + "(python_full_version < '3.12' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and sys_platform != 'darwin' and sys_platform != 
'linux')", ] dependencies = [ { name = "cffi", marker = "python_full_version < '3.14'" }, @@ -245,7 +280,23 @@ wheels = [ name = "arro3-core" version = "0.5.1" source = { registry = "https://pypi.org/simple" } -wheels = [ +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/45/c2540f04330f52f431a0ca0824c15d86fc38dd8b3f2af027a41a90ea91e7/arro3_core-0.5.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e6c43f2f59cd43044663969031c4ef29aab76247b5bda74800187a8b9bda3b9e", size = 2448953, upload-time = "2025-05-31T23:18:40.996Z" }, + { url = "https://files.pythonhosted.org/packages/4b/8f/9fc60dcc201f72f3d9d2ca86b61ff374eb640b58a65660b8de2ac53654d6/arro3_core-0.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:006214e68df6f66bbd1712989258cac2b307085627962348749cc2802b843f25", size = 2155535, upload-time = "2025-05-31T23:18:44.178Z" }, + { url = "https://files.pythonhosted.org/packages/5e/9e/4e6a3c41b52b08b8f34f7830df2a0e499d3e4ab43c6d45984e2af13fa780/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:be77d366d43025599a5a0c520cced43c181f750cf6bcc174a72a97a7338f9e37", size = 2594752, upload-time = "2025-05-31T23:18:47.586Z" }, + { url = "https://files.pythonhosted.org/packages/bd/77/94d8099c8fbfe3489ec92da76f65844b276f82b18d9cb6a547a717bd38cc/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca7cba980b3d2e3552dd06da67c8c298d970bd9430ed661a2316c893bfca3873", size = 2637291, upload-time = "2025-05-31T23:18:50.539Z" }, + { url = "https://files.pythonhosted.org/packages/ff/22/050c75161bcbe2e6b3ff5f8de11f760a376523fa905f4787b09bab65a4b5/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1866f014ca091049692d81601760b65fdad7b779d9c73698f709cd6ee4e8b5c3", size = 2869405, upload-time = "2025-05-31T23:18:53.73Z" }, + { url = "https://files.pythonhosted.org/packages/ac/88/87a3293db47dab5b23ecd910532f02c56d15f52920fc5d72404935126345/arro3_core-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e1433e98b4385f2565c59d69c1bbb4f18da7d2693d2d9712e219e020e8f9025", size = 2540544, upload-time = "2025-05-31T23:18:56.954Z" }, + { url = "https://files.pythonhosted.org/packages/71/e8/f85ce3be71c967b24e96c3af589ae3390548ab0d9fd69d5ed535225fd620/arro3_core-0.5.1-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:afba61734d4fc772ddf26888c299f94157e530a080835a981431a37398168fd6", size = 2289505, upload-time = "2025-05-31T23:19:00.354Z" }, + { url = "https://files.pythonhosted.org/packages/9c/4b/432eb5135fbcc5d8770ad7bd4193545e97588caf1f690d4f724bbb927632/arro3_core-0.5.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:69b8885acf0e94b54adb6f060b4c41ee138d171b37a6356b690bece6b911565d", size = 2724357, upload-time = "2025-05-31T23:19:04.201Z" }, + { url = "https://files.pythonhosted.org/packages/83/91/056ab3166c5e562eab66477f573aff02bb4b92ba0de8affffd1bace6e50c/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2fe8f6d43697719abf822f9f02df7547681669c092b41bcee2b3a689f99e1588", size = 2435801, upload-time = "2025-05-31T23:19:07.617Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5f/b7a6a2106ba508e20f9788bb53c71b56211defd3729c7bcfe6ec09d36fd1/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a2aa298a78135d993e9257f110ac140e008d7bdc11eb23d8bc1c02493afbdf5a", size = 2869804, 
upload-time = "2025-05-31T23:19:11.059Z" }, + { url = "https://files.pythonhosted.org/packages/f6/e3/d95fbff21b27b06faa892c65621ea429391d0bfb926cdeb557db2d452a33/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:508688336dfc4667f8571115924857ae4629044ebeb4d3dedeabc33e287b2bca", size = 2797201, upload-time = "2025-05-31T23:19:14.674Z" }, + { url = "https://files.pythonhosted.org/packages/45/07/7ab65b01110e9459db2f2d37972826aa31a367ee98e95c7664f0eb13963d/arro3_core-0.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:31463bda8a942f5ae0e4a06c8fbe2424367b820d93f6f3b82c6f775f9a966780", size = 2709306, upload-time = "2025-05-31T23:19:17.913Z" }, + { url = "https://files.pythonhosted.org/packages/a7/15/0bebe279425bb70bd0a712dd45dcb4418deb9f32057ff5b9efd7947a65d3/arro3_core-0.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:0223d878f5f23c17600cab853cecce963c38fe365efa5f157f016706314018f1", size = 2611539, upload-time = "2025-05-31T23:19:21.358Z" }, { url = "https://files.pythonhosted.org/packages/c9/9c/af3c6127548630beaa319746770265b2fb996bb3e6dba8d16f78910bc070/arro3_core-0.5.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:afccbaf951a84d6eafb4384692ea557ad06887c6db8825e9417647f805735936", size = 2438592, upload-time = "2025-05-31T23:19:24.494Z" }, { url = "https://files.pythonhosted.org/packages/d8/50/057c93a846bbc5e5e55a976ea4fc00255332f64e5f9b1abfc218bb184f48/arro3_core-0.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:37325ec2f47a4dce40fa871935000708b545f3981c8e2bde7d7a031f2e098865", size = 2145488, upload-time = "2025-05-31T23:19:27.886Z" }, { url = "https://files.pythonhosted.org/packages/1f/8c/cbb785ecb9a0df254f5a761fc5ac7c8c5a6f93b0116e994ecf2797984f80/arro3_core-0.5.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:63ac803d46127d8c01bc4ffbb5911f10e51c063c9bcc76ba0258378bda683383", size = 2592145, upload-time = "2025-05-31T23:19:31.499Z" }, @@ -283,6 +334,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" }, ] +[[package]] +name = "async-timeout" +version = "5.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274, upload-time = "2024-11-06T16:41:39.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" }, +] + [[package]] name = "attrs" version = "25.3.0" @@ -424,6 +484,18 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/f4/927e3a8899e52a27fa57a48607ff7dc91a9ebe97399b357b85a0c7892e00/cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401", size = 182264, upload-time = "2024-09-04T20:43:51.124Z" }, + { url = "https://files.pythonhosted.org/packages/6c/f5/6c3a8efe5f503175aaddcbea6ad0d2c96dad6f5abb205750d1b3df44ef29/cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf", size = 178651, upload-time = "2024-09-04T20:43:52.872Z" }, + { url = "https://files.pythonhosted.org/packages/94/dd/a3f0118e688d1b1a57553da23b16bdade96d2f9bcda4d32e7d2838047ff7/cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4", size = 445259, upload-time = "2024-09-04T20:43:56.123Z" }, + { url = "https://files.pythonhosted.org/packages/2e/ea/70ce63780f096e16ce8588efe039d3c4f91deb1dc01e9c73a287939c79a6/cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41", size = 469200, upload-time = "2024-09-04T20:43:57.891Z" }, + { url = "https://files.pythonhosted.org/packages/1c/a0/a4fa9f4f781bda074c3ddd57a572b060fa0df7655d2a4247bbe277200146/cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1", size = 477235, upload-time = "2024-09-04T20:44:00.18Z" }, + { url = "https://files.pythonhosted.org/packages/62/12/ce8710b5b8affbcdd5c6e367217c242524ad17a02fe5beec3ee339f69f85/cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6", size = 459721, upload-time = "2024-09-04T20:44:01.585Z" }, + { url = "https://files.pythonhosted.org/packages/ff/6b/d45873c5e0242196f042d555526f92aa9e0c32355a1be1ff8c27f077fd37/cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d", size = 467242, upload-time = "2024-09-04T20:44:03.467Z" }, + { url = "https://files.pythonhosted.org/packages/1a/52/d9a0e523a572fbccf2955f5abe883cfa8bcc570d7faeee06336fbd50c9fc/cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6", size = 477999, upload-time = "2024-09-04T20:44:05.023Z" }, + { url = "https://files.pythonhosted.org/packages/44/74/f2a2460684a1a2d00ca799ad880d54652841a780c4c97b87754f660c7603/cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f", size = 454242, upload-time = "2024-09-04T20:44:06.444Z" }, + { url = "https://files.pythonhosted.org/packages/f8/4a/34599cac7dfcd888ff54e801afe06a19c17787dfd94495ab0c8d35fe99fb/cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b", size = 478604, upload-time = "2024-09-04T20:44:08.206Z" }, + { url = "https://files.pythonhosted.org/packages/34/33/e1b8a1ba29025adbdcda5fb3a36f94c03d771c1b7b12f726ff7fef2ebe36/cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655", size = 171727, upload-time = "2024-09-04T20:44:09.481Z" }, + { url = "https://files.pythonhosted.org/packages/3d/97/50228be003bb2802627d28ec0627837ac0bf35c90cf769812056f235b2d1/cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0", size = 181400, upload-time = "2024-09-04T20:44:10.873Z" }, { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178, upload-time = "2024-09-04T20:44:12.232Z" }, { url = "https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840, upload-time = "2024-09-04T20:44:13.739Z" }, { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803, upload-time = "2024-09-04T20:44:15.231Z" }, @@ -454,6 +526,19 @@ version = "3.4.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload-time = "2025-05-02T08:34:42.01Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/05/85/4c40d00dcc6284a1c1ad5de5e0996b06f39d8232f1031cd23c2f5c07ee86/charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2", size = 198794, upload-time = "2025-05-02T08:32:11.945Z" }, + { url = "https://files.pythonhosted.org/packages/41/d9/7a6c0b9db952598e97e93cbdfcb91bacd89b9b88c7c983250a77c008703c/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645", size = 142846, upload-time = "2025-05-02T08:32:13.946Z" }, + { url = "https://files.pythonhosted.org/packages/66/82/a37989cda2ace7e37f36c1a8ed16c58cf48965a79c2142713244bf945c89/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd", size = 153350, upload-time = "2025-05-02T08:32:15.873Z" }, + { url = "https://files.pythonhosted.org/packages/df/68/a576b31b694d07b53807269d05ec3f6f1093e9545e8607121995ba7a8313/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8", size = 145657, upload-time = "2025-05-02T08:32:17.283Z" }, + { url = "https://files.pythonhosted.org/packages/92/9b/ad67f03d74554bed3aefd56fe836e1623a50780f7c998d00ca128924a499/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f", size = 147260, upload-time = "2025-05-02T08:32:18.807Z" }, + { url = "https://files.pythonhosted.org/packages/a6/e6/8aebae25e328160b20e31a7e9929b1578bbdc7f42e66f46595a432f8539e/charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7", size = 149164, 
upload-time = "2025-05-02T08:32:20.333Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f2/b3c2f07dbcc248805f10e67a0262c93308cfa149a4cd3d1fe01f593e5fd2/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9", size = 144571, upload-time = "2025-05-02T08:32:21.86Z" }, + { url = "https://files.pythonhosted.org/packages/60/5b/c3f3a94bc345bc211622ea59b4bed9ae63c00920e2e8f11824aa5708e8b7/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544", size = 151952, upload-time = "2025-05-02T08:32:23.434Z" }, + { url = "https://files.pythonhosted.org/packages/e2/4d/ff460c8b474122334c2fa394a3f99a04cf11c646da895f81402ae54f5c42/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82", size = 155959, upload-time = "2025-05-02T08:32:24.993Z" }, + { url = "https://files.pythonhosted.org/packages/a2/2b/b964c6a2fda88611a1fe3d4c400d39c66a42d6c169c924818c848f922415/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0", size = 153030, upload-time = "2025-05-02T08:32:26.435Z" }, + { url = "https://files.pythonhosted.org/packages/59/2e/d3b9811db26a5ebf444bc0fa4f4be5aa6d76fc6e1c0fd537b16c14e849b6/charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5", size = 148015, upload-time = "2025-05-02T08:32:28.376Z" }, + { url = "https://files.pythonhosted.org/packages/90/07/c5fd7c11eafd561bb51220d600a788f1c8d77c5eef37ee49454cc5c35575/charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a", size = 98106, upload-time = "2025-05-02T08:32:30.281Z" }, + { url = "https://files.pythonhosted.org/packages/a8/05/5e33dbef7e2f773d672b6d79f10ec633d4a71cd96db6673625838a4fd532/charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28", size = 105402, upload-time = "2025-05-02T08:32:32.191Z" }, { url = "https://files.pythonhosted.org/packages/d7/a4/37f4d6035c89cac7930395a35cc0f1b872e652eaafb76a6075943754f095/charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7", size = 199936, upload-time = "2025-05-02T08:32:33.712Z" }, { url = "https://files.pythonhosted.org/packages/ee/8a/1a5e33b73e0d9287274f899d967907cd0bf9c343e651755d9307e0dbf2b3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3", size = 143790, upload-time = "2025-05-02T08:32:35.768Z" }, { url = "https://files.pythonhosted.org/packages/66/52/59521f1d8e6ab1482164fa21409c5ef44da3e9f653c13ba71becdd98dec3/charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a", size = 153924, upload-time = "2025-05-02T08:32:37.284Z" }, @@ -537,6 +622,16 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = 
"sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130, upload-time = "2025-04-15T17:47:53.79Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/b9/ede788a0b56fc5b071639d06c33cb893f68b1178938f3425debebe2dab78/contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445", size = 269636, upload-time = "2025-04-15T17:35:54.473Z" }, + { url = "https://files.pythonhosted.org/packages/e6/75/3469f011d64b8bbfa04f709bfc23e1dd71be54d05b1b083be9f5b22750d1/contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773", size = 254636, upload-time = "2025-04-15T17:35:58.283Z" }, + { url = "https://files.pythonhosted.org/packages/8d/2f/95adb8dae08ce0ebca4fd8e7ad653159565d9739128b2d5977806656fcd2/contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1", size = 313053, upload-time = "2025-04-15T17:36:03.235Z" }, + { url = "https://files.pythonhosted.org/packages/c3/a6/8ccf97a50f31adfa36917707fe39c9a0cbc24b3bbb58185577f119736cc9/contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43", size = 352985, upload-time = "2025-04-15T17:36:08.275Z" }, + { url = "https://files.pythonhosted.org/packages/1d/b6/7925ab9b77386143f39d9c3243fdd101621b4532eb126743201160ffa7e6/contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab", size = 323750, upload-time = "2025-04-15T17:36:13.29Z" }, + { url = "https://files.pythonhosted.org/packages/c2/f3/20c5d1ef4f4748e52d60771b8560cf00b69d5c6368b5c2e9311bcfa2a08b/contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7", size = 326246, upload-time = "2025-04-15T17:36:18.329Z" }, + { url = "https://files.pythonhosted.org/packages/8c/e5/9dae809e7e0b2d9d70c52b3d24cba134dd3dad979eb3e5e71f5df22ed1f5/contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83", size = 1308728, upload-time = "2025-04-15T17:36:33.878Z" }, + { url = "https://files.pythonhosted.org/packages/e2/4a/0058ba34aeea35c0b442ae61a4f4d4ca84d6df8f91309bc2d43bb8dd248f/contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd", size = 1375762, upload-time = "2025-04-15T17:36:51.295Z" }, + { url = "https://files.pythonhosted.org/packages/09/33/7174bdfc8b7767ef2c08ed81244762d93d5c579336fc0b51ca57b33d1b80/contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f", size = 178196, upload-time = "2025-04-15T17:36:55.002Z" }, + { url = "https://files.pythonhosted.org/packages/5e/fe/4029038b4e1c4485cef18e480b0e2cd2d755448bb071eb9977caac80b77b/contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878", size = 222017, upload-time = "2025-04-15T17:36:58.576Z" }, { url = "https://files.pythonhosted.org/packages/34/f7/44785876384eff370c251d58fd65f6ad7f39adce4a093c934d4a67a7c6b6/contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2", size = 271580, upload-time = "2025-04-15T17:37:03.105Z" }, { url = "https://files.pythonhosted.org/packages/93/3b/0004767622a9826ea3d95f0e9d98cd8729015768075d61f9fea8eeca42a8/contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15", size = 255530, upload-time = "2025-04-15T17:37:07.026Z" }, { url = "https://files.pythonhosted.org/packages/e7/bb/7bd49e1f4fa805772d9fd130e0d375554ebc771ed7172f48dfcd4ca61549/contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92", size = 307688, upload-time = "2025-04-15T17:37:11.481Z" }, @@ -567,6 +662,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/73/69dd9a024444489e22d86108e7b913f3528f56cfc312b5c5727a44188471/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd", size = 1372168, upload-time = "2025-04-15T17:44:33.43Z" }, { url = "https://files.pythonhosted.org/packages/0f/1b/96d586ccf1b1a9d2004dd519b25fbf104a11589abfd05484ff12199cca21/contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1", size = 189550, upload-time = "2025-04-15T17:44:37.092Z" }, { url = "https://files.pythonhosted.org/packages/b0/e6/6000d0094e8a5e32ad62591c8609e269febb6e4db83a1c75ff8868b42731/contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69", size = 238214, upload-time = "2025-04-15T17:44:40.827Z" }, + { url = "https://files.pythonhosted.org/packages/ff/c0/91f1215d0d9f9f343e4773ba6c9b89e8c0cc7a64a6263f21139da639d848/contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0", size = 266807, upload-time = "2025-04-15T17:45:15.535Z" }, + { url = "https://files.pythonhosted.org/packages/d4/79/6be7e90c955c0487e7712660d6cead01fa17bff98e0ea275737cc2bc8e71/contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5", size = 318729, upload-time = "2025-04-15T17:45:20.166Z" }, + { url = "https://files.pythonhosted.org/packages/87/68/7f46fb537958e87427d98a4074bcde4b67a70b04900cfc5ce29bc2f556c1/contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5", size = 221791, upload-time = "2025-04-15T17:45:24.794Z" }, ] [[package]] @@ -575,6 +673,17 @@ version = "7.8.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/ba/07/998afa4a0ecdf9b1981ae05415dad2d4e7716e1b1f00abbd91691ac09ac9/coverage-7.8.2.tar.gz", hash = "sha256:a886d531373a1f6ff9fad2a2ba4a045b68467b779ae729ee0b3b10ac20033b27", size = 812759, upload-time = "2025-05-23T11:39:57.856Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/4d/1ff618ee9f134d0de5cc1661582c21a65e06823f41caf801aadf18811a8e/coverage-7.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b99058eef42e6a8dcd135afb068b3d53aff3921ce699e127602efff9956457a9", size = 211692, upload-time = "2025-05-23T11:38:08.485Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/fa/c3c1b476de96f2bc7a8ca01a9f1fcb51c01c6b60a9d2c3e66194b2bdb4af/coverage-7.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5feb7f2c3e6ea94d3b877def0270dff0947b8d8c04cfa34a17be0a4dc1836879", size = 212115, upload-time = "2025-05-23T11:38:09.989Z" }, + { url = "https://files.pythonhosted.org/packages/f7/c2/5414c5a1b286c0f3881ae5adb49be1854ac5b7e99011501f81c8c1453065/coverage-7.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:670a13249b957bb9050fab12d86acef7bf8f6a879b9d1a883799276e0d4c674a", size = 244740, upload-time = "2025-05-23T11:38:11.947Z" }, + { url = "https://files.pythonhosted.org/packages/cd/46/1ae01912dfb06a642ef3dd9cf38ed4996fda8fe884dab8952da616f81a2b/coverage-7.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0bdc8bf760459a4a4187b452213e04d039990211f98644c7292adf1e471162b5", size = 242429, upload-time = "2025-05-23T11:38:13.955Z" }, + { url = "https://files.pythonhosted.org/packages/06/58/38c676aec594bfe2a87c7683942e5a30224791d8df99bcc8439fde140377/coverage-7.8.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07a989c867986c2a75f158f03fdb413128aad29aca9d4dbce5fc755672d96f11", size = 244218, upload-time = "2025-05-23T11:38:15.631Z" }, + { url = "https://files.pythonhosted.org/packages/80/0c/95b1023e881ce45006d9abc250f76c6cdab7134a1c182d9713878dfefcb2/coverage-7.8.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2db10dedeb619a771ef0e2949ccba7b75e33905de959c2643a4607bef2f3fb3a", size = 243865, upload-time = "2025-05-23T11:38:17.622Z" }, + { url = "https://files.pythonhosted.org/packages/57/37/0ae95989285a39e0839c959fe854a3ae46c06610439350d1ab860bf020ac/coverage-7.8.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e6ea7dba4e92926b7b5f0990634b78ea02f208d04af520c73a7c876d5a8d36cb", size = 242038, upload-time = "2025-05-23T11:38:19.966Z" }, + { url = "https://files.pythonhosted.org/packages/4d/82/40e55f7c0eb5e97cc62cbd9d0746fd24e8caf57be5a408b87529416e0c70/coverage-7.8.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ef2f22795a7aca99fc3c84393a55a53dd18ab8c93fb431004e4d8f0774150f54", size = 242567, upload-time = "2025-05-23T11:38:21.912Z" }, + { url = "https://files.pythonhosted.org/packages/f9/35/66a51adc273433a253989f0d9cc7aa6bcdb4855382cf0858200afe578861/coverage-7.8.2-cp311-cp311-win32.whl", hash = "sha256:641988828bc18a6368fe72355df5f1703e44411adbe49bba5644b941ce6f2e3a", size = 214194, upload-time = "2025-05-23T11:38:23.571Z" }, + { url = "https://files.pythonhosted.org/packages/f6/8f/a543121f9f5f150eae092b08428cb4e6b6d2d134152c3357b77659d2a605/coverage-7.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8ab4a51cb39dc1933ba627e0875046d150e88478dbe22ce145a68393e9652975", size = 215109, upload-time = "2025-05-23T11:38:25.137Z" }, + { url = "https://files.pythonhosted.org/packages/77/65/6cc84b68d4f35186463cd7ab1da1169e9abb59870c0f6a57ea6aba95f861/coverage-7.8.2-cp311-cp311-win_arm64.whl", hash = "sha256:8966a821e2083c74d88cca5b7dcccc0a3a888a596a04c0b9668a891de3a0cc53", size = 213521, upload-time = "2025-05-23T11:38:27.123Z" }, { url = "https://files.pythonhosted.org/packages/8d/2a/1da1ada2e3044fcd4a3254fb3576e160b8fe5b36d705c8a31f793423f763/coverage-7.8.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e2f6fe3654468d061942591aef56686131335b7a8325684eda85dacdf311356c", size = 211876, upload-time = "2025-05-23T11:38:29.01Z" }, { url = 
"https://files.pythonhosted.org/packages/70/e9/3d715ffd5b6b17a8be80cd14a8917a002530a99943cc1939ad5bb2aa74b9/coverage-7.8.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76090fab50610798cc05241bf83b603477c40ee87acd358b66196ab0ca44ffa1", size = 212130, upload-time = "2025-05-23T11:38:30.675Z" }, { url = "https://files.pythonhosted.org/packages/a0/02/fdce62bb3c21649abfd91fbdcf041fb99be0d728ff00f3f9d54d97ed683e/coverage-7.8.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd0a0a5054be160777a7920b731a0570284db5142abaaf81bcbb282b8d99279", size = 246176, upload-time = "2025-05-23T11:38:32.395Z" }, @@ -608,9 +717,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/63/2d624ac7d7ccd4ebbd3c6a9eba9d7fc4491a1226071360d59dd84928ccb2/coverage-7.8.2-cp313-cp313t-win32.whl", hash = "sha256:3f5673888d3676d0a745c3d0e16da338c5eea300cb1f4ada9c872981265e76d8", size = 215109, upload-time = "2025-05-23T11:39:26.722Z" }, { url = "https://files.pythonhosted.org/packages/22/5e/7053b71462e970e869111c1853afd642212568a350eba796deefdfbd0770/coverage-7.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:2c08b05ee8d7861e45dc5a2cc4195c8c66dca5ac613144eb6ebeaff2d502e73d", size = 216268, upload-time = "2025-05-23T11:39:28.429Z" }, { url = "https://files.pythonhosted.org/packages/07/69/afa41aa34147655543dbe96994f8a246daf94b361ccf5edfd5df62ce066a/coverage-7.8.2-cp313-cp313t-win_arm64.whl", hash = "sha256:1e1448bb72b387755e1ff3ef1268a06617afd94188164960dba8d0245a46004b", size = 214071, upload-time = "2025-05-23T11:39:30.55Z" }, + { url = "https://files.pythonhosted.org/packages/69/2f/572b29496d8234e4a7773200dd835a0d32d9e171f2d974f3fe04a9dbc271/coverage-7.8.2-pp39.pp310.pp311-none-any.whl", hash = "sha256:ec455eedf3ba0bbdf8f5a570012617eb305c63cb9f03428d39bf544cb2b94837", size = 203636, upload-time = "2025-05-23T11:39:52.002Z" }, { url = "https://files.pythonhosted.org/packages/a0/1a/0b9c32220ad694d66062f571cc5cedfa9997b64a591e8a500bb63de1bd40/coverage-7.8.2-py3-none-any.whl", hash = "sha256:726f32ee3713f7359696331a18daf0c3b3a70bb0ae71141b9d3c52be7c595e32", size = 203623, upload-time = "2025-05-23T11:39:53.846Z" }, ] +[package.optional-dependencies] +toml = [ + { name = "tomli", marker = "python_full_version <= '3.11'" }, +] + [[package]] name = "cryptography" version = "45.0.6" @@ -644,6 +759,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/82/27/092d311af22095d288f4db89fcaebadfb2f28944f3d790a4cf51fe5ddaeb/cryptography-45.0.6-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5b64e668fc3528e77efa51ca70fadcd6610e8ab231e3e06ae2bab3b31c2b8ed9", size = 4554815, upload-time = "2025-08-05T23:59:00.283Z" }, { url = "https://files.pythonhosted.org/packages/7e/01/aa2f4940262d588a8fdf4edabe4cda45854d00ebc6eaac12568b3a491a16/cryptography-45.0.6-cp37-abi3-win32.whl", hash = "sha256:780c40fb751c7d2b0c6786ceee6b6f871e86e8718a8ff4bc35073ac353c7cd02", size = 2912147, upload-time = "2025-08-05T23:59:01.716Z" }, { url = "https://files.pythonhosted.org/packages/0a/bc/16e0276078c2de3ceef6b5a34b965f4436215efac45313df90d55f0ba2d2/cryptography-45.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:20d15aed3ee522faac1a39fbfdfee25d17b1284bafd808e1640a74846d7c4d1b", size = 3390459, upload-time = "2025-08-05T23:59:03.358Z" }, + { url = "https://files.pythonhosted.org/packages/61/69/c252de4ec047ba2f567ecb53149410219577d408c2aea9c989acae7eafce/cryptography-45.0.6-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fc022c1fa5acff6def2fc6d7819bbbd31ccddfe67d075331a65d9cfb28a20983", size = 
3584669, upload-time = "2025-08-05T23:59:15.431Z" }, + { url = "https://files.pythonhosted.org/packages/e3/fe/deea71e9f310a31fe0a6bfee670955152128d309ea2d1c79e2a5ae0f0401/cryptography-45.0.6-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3de77e4df42ac8d4e4d6cdb342d989803ad37707cf8f3fbf7b088c9cbdd46427", size = 4153022, upload-time = "2025-08-05T23:59:16.954Z" }, + { url = "https://files.pythonhosted.org/packages/60/45/a77452f5e49cb580feedba6606d66ae7b82c128947aa754533b3d1bd44b0/cryptography-45.0.6-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:599c8d7df950aa68baa7e98f7b73f4f414c9f02d0e8104a30c0182a07732638b", size = 4386802, upload-time = "2025-08-05T23:59:18.55Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b9/a2f747d2acd5e3075fdf5c145c7c3568895daaa38b3b0c960ef830db6cdc/cryptography-45.0.6-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:31a2b9a10530a1cb04ffd6aa1cd4d3be9ed49f7d77a4dafe198f3b382f41545c", size = 4152706, upload-time = "2025-08-05T23:59:20.044Z" }, + { url = "https://files.pythonhosted.org/packages/81/ec/381b3e8d0685a3f3f304a382aa3dfce36af2d76467da0fd4bb21ddccc7b2/cryptography-45.0.6-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:e5b3dda1b00fb41da3af4c5ef3f922a200e33ee5ba0f0bc9ecf0b0c173958385", size = 4386740, upload-time = "2025-08-05T23:59:21.525Z" }, + { url = "https://files.pythonhosted.org/packages/0a/76/cf8d69da8d0b5ecb0db406f24a63a3f69ba5e791a11b782aeeefef27ccbb/cryptography-45.0.6-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:629127cfdcdc6806dfe234734d7cb8ac54edaf572148274fa377a7d3405b0043", size = 3331874, upload-time = "2025-08-05T23:59:23.017Z" }, ] [[package]] @@ -688,6 +809,10 @@ version = "1.8.14" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/bd/75/087fe07d40f490a78782ff3b0a30e3968936854105487decdb33446d4b0e/debugpy-1.8.14.tar.gz", hash = "sha256:7cd287184318416850aa8b60ac90105837bb1e59531898c07569d197d2ed5322", size = 1641444, upload-time = "2025-04-10T19:46:10.981Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/67/e8/57fe0c86915671fd6a3d2d8746e40485fd55e8d9e682388fbb3a3d42b86f/debugpy-1.8.14-cp311-cp311-macosx_14_0_universal2.whl", hash = "sha256:1b2ac8c13b2645e0b1eaf30e816404990fbdb168e193322be8f545e8c01644a9", size = 2175064, upload-time = "2025-04-10T19:46:19.486Z" }, + { url = "https://files.pythonhosted.org/packages/3b/97/2b2fd1b1c9569c6764ccdb650a6f752e4ac31be465049563c9eb127a8487/debugpy-1.8.14-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf431c343a99384ac7eab2f763980724834f933a271e90496944195318c619e2", size = 3132359, upload-time = "2025-04-10T19:46:21.192Z" }, + { url = "https://files.pythonhosted.org/packages/c0/ee/b825c87ed06256ee2a7ed8bab8fb3bb5851293bf9465409fdffc6261c426/debugpy-1.8.14-cp311-cp311-win32.whl", hash = "sha256:c99295c76161ad8d507b413cd33422d7c542889fbb73035889420ac1fad354f2", size = 5133269, upload-time = "2025-04-10T19:46:23.047Z" }, + { url = "https://files.pythonhosted.org/packages/d5/a6/6c70cd15afa43d37839d60f324213843174c1d1e6bb616bd89f7c1341bac/debugpy-1.8.14-cp311-cp311-win_amd64.whl", hash = "sha256:7816acea4a46d7e4e50ad8d09d963a680ecc814ae31cdef3622eb05ccacf7b01", size = 5158156, upload-time = "2025-04-10T19:46:24.521Z" }, { url = "https://files.pythonhosted.org/packages/d9/2a/ac2df0eda4898f29c46eb6713a5148e6f8b2b389c8ec9e425a4a1d67bf07/debugpy-1.8.14-cp312-cp312-macosx_14_0_universal2.whl", hash = 
"sha256:8899c17920d089cfa23e6005ad9f22582fd86f144b23acb9feeda59e84405b84", size = 2501268, upload-time = "2025-04-10T19:46:26.044Z" }, { url = "https://files.pythonhosted.org/packages/10/53/0a0cb5d79dd9f7039169f8bf94a144ad3efa52cc519940b3b7dde23bcb89/debugpy-1.8.14-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6bb5c0dcf80ad5dbc7b7d6eac484e2af34bdacdf81df09b6a3e62792b722826", size = 4221077, upload-time = "2025-04-10T19:46:27.464Z" }, { url = "https://files.pythonhosted.org/packages/f8/d5/84e01821f362327bf4828728aa31e907a2eca7c78cd7c6ec062780d249f8/debugpy-1.8.14-cp312-cp312-win32.whl", hash = "sha256:281d44d248a0e1791ad0eafdbbd2912ff0de9eec48022a5bfbc332957487ed3f", size = 5255127, upload-time = "2025-04-10T19:46:29.467Z" }, @@ -842,6 +967,14 @@ version = "4.58.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/3e/7a/30c581aeaa86d94e7a29344bccefd2408870bf5b0e7640b6f4ffede61bd0/fonttools-4.58.1.tar.gz", hash = "sha256:cbc8868e0a29c3e22628dfa1432adf7a104d86d1bc661cecc3e9173070b6ab2d", size = 3519505, upload-time = "2025-05-28T15:29:26.219Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/50/3f/9fecd69149b0eec5ca46ec58de83b2fd34d07204fe2c12c209255082507a/fonttools-4.58.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9966e14729669bcfbb56f83b747a2397c4d97c6d4798cb2e2adc28f9388fa008", size = 2754713, upload-time = "2025-05-28T15:28:18.998Z" }, + { url = "https://files.pythonhosted.org/packages/c8/19/d04ea5f3ab2afa7799f2b1ebe1d57ff71b479f99f29b82bddc7197d50220/fonttools-4.58.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64cc1647bbe83dea57f5496ec878ad19ccdba7185b0dd34955d3e6f03dc789e6", size = 2316637, upload-time = "2025-05-28T15:28:21.016Z" }, + { url = "https://files.pythonhosted.org/packages/5c/3f/375f59d756b17318336c050363849011e03ac82904538f39ebe8189835bc/fonttools-4.58.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:464f790ce681d08d1583df0735776aa9cb1999594bf336ddd0bf962c17b629ac", size = 4915730, upload-time = "2025-05-28T15:28:22.633Z" }, + { url = "https://files.pythonhosted.org/packages/2f/90/069f859d6f6480503574cda21b84ceee98bf5f5fd1764f26674e828a2600/fonttools-4.58.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c53c6a720ee70cc25746d511ba88c45c95ec510fd258026ed209b0b9e3ba92f", size = 4936194, upload-time = "2025-05-28T15:28:24.704Z" }, + { url = "https://files.pythonhosted.org/packages/01/11/339973e588e1c27f20c578f845bdcf84376c5e42bd35fca05419fd8d1648/fonttools-4.58.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6823a633bbce29cf3033508ebb54a433c473fb9833eff7f936bfdc5204fd98d", size = 4978982, upload-time = "2025-05-28T15:28:26.633Z" }, + { url = "https://files.pythonhosted.org/packages/a7/aa/1c627532a69715f54b8d96ab3a7bc8628f6e89989e9275dfc067dc2d6d56/fonttools-4.58.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5701fe66a1408c1974d2f78c00f964f8aad17cccbc32bc041e1b81421f31f448", size = 5090087, upload-time = "2025-05-28T15:28:29.608Z" }, + { url = "https://files.pythonhosted.org/packages/77/ce/cf7b624db35bce589ac1f2c98329ea91b28f0283d3b7e9e6126dfaeb5abd/fonttools-4.58.1-cp311-cp311-win32.whl", hash = "sha256:4cad2c74adf9ee31ae43be6b0b376fdb386d4d50c60979790e32c3548efec051", size = 2188923, upload-time = "2025-05-28T15:28:31.797Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/22/c4f1f76eeb1b9353e9cc81451d0ae08acc3d3aa31b9ab8f3791a18af1f89/fonttools-4.58.1-cp311-cp311-win_amd64.whl", hash = "sha256:7ade12485abccb0f6b6a6e2a88c50e587ff0e201e48e0153dd9b2e0ed67a2f38", size = 2236853, upload-time = "2025-05-28T15:28:33.381Z" }, { url = "https://files.pythonhosted.org/packages/32/97/ed1078b1e138fbc0b4ee75878000d549a70c02d83bb4e557e416efc34140/fonttools-4.58.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f56085a65769dc0100822c814069327541db9c3c4f21e599c6138f9dbda75e96", size = 2740473, upload-time = "2025-05-28T15:28:35.002Z" }, { url = "https://files.pythonhosted.org/packages/28/35/53d49fb7d6b30128153d11628b976fda3ce8ae44234b5a81c4edb3023798/fonttools-4.58.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:19c65a88e522c9f1be0c05d73541de20feada99d23d06e9b5354023cc3e517b0", size = 2309936, upload-time = "2025-05-28T15:28:37.145Z" }, { url = "https://files.pythonhosted.org/packages/0c/db/8b63c1d673b2bf0cfed77500d47769dc4aa85453b5f0ef525db2cf952895/fonttools-4.58.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b01bb37006e97703300bfde7a73d1c7038574dd1df9d8d92ca99af151becf2ca", size = 4814671, upload-time = "2025-05-28T15:28:39.339Z" }, @@ -867,6 +1000,23 @@ version = "1.7.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/79/b1/b64018016eeb087db503b038296fd782586432b9c077fc5c7839e9cb6ef6/frozenlist-1.7.0.tar.gz", hash = "sha256:2e310d81923c2437ea8670467121cc3e9b0f76d3043cc1d2331d56c7fb7a3a8f", size = 45078, upload-time = "2025-06-09T23:02:35.538Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/34/7e/803dde33760128acd393a27eb002f2020ddb8d99d30a44bfbaab31c5f08a/frozenlist-1.7.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:aa51e147a66b2d74de1e6e2cf5921890de6b0f4820b257465101d7f37b49fb5a", size = 82251, upload-time = "2025-06-09T23:00:16.279Z" }, + { url = "https://files.pythonhosted.org/packages/75/a9/9c2c5760b6ba45eae11334db454c189d43d34a4c0b489feb2175e5e64277/frozenlist-1.7.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9b35db7ce1cd71d36ba24f80f0c9e7cff73a28d7a74e91fe83e23d27c7828750", size = 48183, upload-time = "2025-06-09T23:00:17.698Z" }, + { url = "https://files.pythonhosted.org/packages/47/be/4038e2d869f8a2da165f35a6befb9158c259819be22eeaf9c9a8f6a87771/frozenlist-1.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:34a69a85e34ff37791e94542065c8416c1afbf820b68f720452f636d5fb990cd", size = 47107, upload-time = "2025-06-09T23:00:18.952Z" }, + { url = "https://files.pythonhosted.org/packages/79/26/85314b8a83187c76a37183ceed886381a5f992975786f883472fcb6dc5f2/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a646531fa8d82c87fe4bb2e596f23173caec9185bfbca5d583b4ccfb95183e2", size = 237333, upload-time = "2025-06-09T23:00:20.275Z" }, + { url = "https://files.pythonhosted.org/packages/1f/fd/e5b64f7d2c92a41639ffb2ad44a6a82f347787abc0c7df5f49057cf11770/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:79b2ffbba483f4ed36a0f236ccb85fbb16e670c9238313709638167670ba235f", size = 231724, upload-time = "2025-06-09T23:00:21.705Z" }, + { url = "https://files.pythonhosted.org/packages/20/fb/03395c0a43a5976af4bf7534759d214405fbbb4c114683f434dfdd3128ef/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:a26f205c9ca5829cbf82bb2a84b5c36f7184c4316617d7ef1b271a56720d6b30", size = 245842, upload-time = "2025-06-09T23:00:23.148Z" }, + { url = "https://files.pythonhosted.org/packages/d0/15/c01c8e1dffdac5d9803507d824f27aed2ba76b6ed0026fab4d9866e82f1f/frozenlist-1.7.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bcacfad3185a623fa11ea0e0634aac7b691aa925d50a440f39b458e41c561d98", size = 239767, upload-time = "2025-06-09T23:00:25.103Z" }, + { url = "https://files.pythonhosted.org/packages/14/99/3f4c6fe882c1f5514b6848aa0a69b20cb5e5d8e8f51a339d48c0e9305ed0/frozenlist-1.7.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72c1b0fe8fe451b34f12dce46445ddf14bd2a5bcad7e324987194dc8e3a74c86", size = 224130, upload-time = "2025-06-09T23:00:27.061Z" }, + { url = "https://files.pythonhosted.org/packages/4d/83/220a374bd7b2aeba9d0725130665afe11de347d95c3620b9b82cc2fcab97/frozenlist-1.7.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61d1a5baeaac6c0798ff6edfaeaa00e0e412d49946c53fae8d4b8e8b3566c4ae", size = 235301, upload-time = "2025-06-09T23:00:29.02Z" }, + { url = "https://files.pythonhosted.org/packages/03/3c/3e3390d75334a063181625343e8daab61b77e1b8214802cc4e8a1bb678fc/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7edf5c043c062462f09b6820de9854bf28cc6cc5b6714b383149745e287181a8", size = 234606, upload-time = "2025-06-09T23:00:30.514Z" }, + { url = "https://files.pythonhosted.org/packages/23/1e/58232c19608b7a549d72d9903005e2d82488f12554a32de2d5fb59b9b1ba/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:d50ac7627b3a1bd2dcef6f9da89a772694ec04d9a61b66cf87f7d9446b4a0c31", size = 248372, upload-time = "2025-06-09T23:00:31.966Z" }, + { url = "https://files.pythonhosted.org/packages/c0/a4/e4a567e01702a88a74ce8a324691e62a629bf47d4f8607f24bf1c7216e7f/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ce48b2fece5aeb45265bb7a58259f45027db0abff478e3077e12b05b17fb9da7", size = 229860, upload-time = "2025-06-09T23:00:33.375Z" }, + { url = "https://files.pythonhosted.org/packages/73/a6/63b3374f7d22268b41a9db73d68a8233afa30ed164c46107b33c4d18ecdd/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:fe2365ae915a1fafd982c146754e1de6ab3478def8a59c86e1f7242d794f97d5", size = 245893, upload-time = "2025-06-09T23:00:35.002Z" }, + { url = "https://files.pythonhosted.org/packages/6d/eb/d18b3f6e64799a79673c4ba0b45e4cfbe49c240edfd03a68be20002eaeaa/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:45a6f2fdbd10e074e8814eb98b05292f27bad7d1883afbe009d96abdcf3bc898", size = 246323, upload-time = "2025-06-09T23:00:36.468Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f5/720f3812e3d06cd89a1d5db9ff6450088b8f5c449dae8ffb2971a44da506/frozenlist-1.7.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:21884e23cffabb157a9dd7e353779077bf5b8f9a58e9b262c6caad2ef5f80a56", size = 233149, upload-time = "2025-06-09T23:00:37.963Z" }, + { url = "https://files.pythonhosted.org/packages/69/68/03efbf545e217d5db8446acfd4c447c15b7c8cf4dbd4a58403111df9322d/frozenlist-1.7.0-cp311-cp311-win32.whl", hash = "sha256:284d233a8953d7b24f9159b8a3496fc1ddc00f4db99c324bd5fb5f22d8698ea7", size = 39565, upload-time = "2025-06-09T23:00:39.753Z" }, + { url = 
"https://files.pythonhosted.org/packages/58/17/fe61124c5c333ae87f09bb67186d65038834a47d974fc10a5fadb4cc5ae1/frozenlist-1.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:387cbfdcde2f2353f19c2f66bbb52406d06ed77519ac7ee21be0232147c2592d", size = 44019, upload-time = "2025-06-09T23:00:40.988Z" }, { url = "https://files.pythonhosted.org/packages/ef/a2/c8131383f1e66adad5f6ecfcce383d584ca94055a34d683bbb24ac5f2f1c/frozenlist-1.7.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3dbf9952c4bb0e90e98aec1bd992b3318685005702656bc6f67c1a32b76787f2", size = 81424, upload-time = "2025-06-09T23:00:42.24Z" }, { url = "https://files.pythonhosted.org/packages/4c/9d/02754159955088cb52567337d1113f945b9e444c4960771ea90eb73de8db/frozenlist-1.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1f5906d3359300b8a9bb194239491122e6cf1444c2efb88865426f170c262cdb", size = 47952, upload-time = "2025-06-09T23:00:43.481Z" }, { url = "https://files.pythonhosted.org/packages/01/7a/0046ef1bd6699b40acd2067ed6d6670b4db2f425c56980fa21c982c2a9db/frozenlist-1.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3dabd5a8f84573c8d10d8859a50ea2dec01eea372031929871368c09fa103478", size = 46688, upload-time = "2025-06-09T23:00:44.793Z" }, @@ -1039,6 +1189,11 @@ version = "1.7.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/19/ae/87802e6d9f9d69adfaedfcfd599266bf386a54d0be058b532d04c794f76d/google_crc32c-1.7.1.tar.gz", hash = "sha256:2bff2305f98846f3e825dbeec9ee406f89da7962accdb29356e4eadc251bd472", size = 14495, upload-time = "2025-03-26T14:29:13.32Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/94/220139ea87822b6fdfdab4fb9ba81b3fff7ea2c82e2af34adc726085bffc/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6fbab4b935989e2c3610371963ba1b86afb09537fd0c633049be82afe153ac06", size = 30468, upload-time = "2025-03-26T14:32:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/94/97/789b23bdeeb9d15dc2904660463ad539d0318286d7633fe2760c10ed0c1c/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:ed66cbe1ed9cbaaad9392b5259b3eba4a9e565420d734e6238813c428c3336c9", size = 30313, upload-time = "2025-03-26T14:57:38.758Z" }, + { url = "https://files.pythonhosted.org/packages/81/b8/976a2b843610c211e7ccb3e248996a61e87dbb2c09b1499847e295080aec/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee6547b657621b6cbed3562ea7826c3e11cab01cd33b74e1f677690652883e77", size = 33048, upload-time = "2025-03-26T14:41:30.679Z" }, + { url = "https://files.pythonhosted.org/packages/c9/16/a3842c2cf591093b111d4a5e2bfb478ac6692d02f1b386d2a33283a19dc9/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d68e17bad8f7dd9a49181a1f5a8f4b251c6dbc8cc96fb79f1d321dfd57d66f53", size = 32669, upload-time = "2025-03-26T14:41:31.432Z" }, + { url = "https://files.pythonhosted.org/packages/04/17/ed9aba495916fcf5fe4ecb2267ceb851fc5f273c4e4625ae453350cfd564/google_crc32c-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:6335de12921f06e1f774d0dd1fbea6bf610abe0887a1638f64d694013138be5d", size = 33476, upload-time = "2025-03-26T14:29:10.211Z" }, { url = "https://files.pythonhosted.org/packages/dd/b7/787e2453cf8639c94b3d06c9d61f512234a82e1d12d13d18584bd3049904/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2d73a68a653c57281401871dd4aeebbb6af3191dcac751a76ce430df4d403194", size = 30470, upload-time = "2025-03-26T14:34:31.655Z" }, { url = 
"https://files.pythonhosted.org/packages/ed/b4/6042c2b0cbac3ec3a69bb4c49b28d2f517b7a0f4a0232603c42c58e22b44/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:22beacf83baaf59f9d3ab2bbb4db0fb018da8e5aebdce07ef9f09fce8220285e", size = 30315, upload-time = "2025-03-26T15:01:54.634Z" }, { url = "https://files.pythonhosted.org/packages/29/ad/01e7a61a5d059bc57b702d9ff6a18b2585ad97f720bd0a0dbe215df1ab0e/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19eafa0e4af11b0a4eb3974483d55d2d77ad1911e6cf6f832e1574f6781fd337", size = 33180, upload-time = "2025-03-26T14:41:32.168Z" }, @@ -1051,6 +1206,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/89/32/a22a281806e3ef21b72db16f948cad22ec68e4bdd384139291e00ff82fe2/google_crc32c-1.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:0f99eaa09a9a7e642a61e06742856eec8b19fc0037832e03f941fe7cf0c8e4db", size = 33475, upload-time = "2025-03-26T14:29:11.771Z" }, { url = "https://files.pythonhosted.org/packages/b8/c5/002975aff514e57fc084ba155697a049b3f9b52225ec3bc0f542871dd524/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32d1da0d74ec5634a05f53ef7df18fc646666a25efaaca9fc7dcfd4caf1d98c3", size = 33243, upload-time = "2025-03-26T14:41:35.975Z" }, { url = "https://files.pythonhosted.org/packages/61/cb/c585282a03a0cea70fcaa1bf55d5d702d0f2351094d663ec3be1c6c67c52/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e10554d4abc5238823112c2ad7e4560f96c7bf3820b202660373d769d9e6e4c9", size = 32870, upload-time = "2025-03-26T14:41:37.08Z" }, + { url = "https://files.pythonhosted.org/packages/16/1b/1693372bf423ada422f80fd88260dbfd140754adb15cbc4d7e9a68b1cb8e/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85fef7fae11494e747c9fd1359a527e5970fc9603c90764843caabd3a16a0a48", size = 28241, upload-time = "2025-03-26T14:41:45.898Z" }, + { url = "https://files.pythonhosted.org/packages/fd/3c/2a19a60a473de48717b4efb19398c3f914795b64a96cf3fbe82588044f78/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6efb97eb4369d52593ad6f75e7e10d053cf00c48983f7a973105bc70b0ac4d82", size = 28048, upload-time = "2025-03-26T14:41:46.696Z" }, ] [[package]] @@ -1083,6 +1240,16 @@ version = "1.74.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/38/b4/35feb8f7cab7239c5b94bd2db71abb3d6adb5f335ad8f131abb6060840b6/grpcio-1.74.0.tar.gz", hash = "sha256:80d1f4fbb35b0742d3e3d3bb654b7381cd5f015f8497279a1e9c21ba623e01b1", size = 12756048, upload-time = "2025-07-24T18:54:23.039Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/77/b2f06db9f240a5abeddd23a0e49eae2b6ac54d85f0e5267784ce02269c3b/grpcio-1.74.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:69e1a8180868a2576f02356565f16635b99088da7df3d45aaa7e24e73a054e31", size = 5487368, upload-time = "2025-07-24T18:53:03.548Z" }, + { url = "https://files.pythonhosted.org/packages/48/99/0ac8678a819c28d9a370a663007581744a9f2a844e32f0fa95e1ddda5b9e/grpcio-1.74.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:8efe72fde5500f47aca1ef59495cb59c885afe04ac89dd11d810f2de87d935d4", size = 10999804, upload-time = "2025-07-24T18:53:05.095Z" }, + { url = "https://files.pythonhosted.org/packages/45/c6/a2d586300d9e14ad72e8dc211c7aecb45fe9846a51e558c5bca0c9102c7f/grpcio-1.74.0-cp311-cp311-manylinux_2_17_aarch64.whl", 
hash = "sha256:a8f0302f9ac4e9923f98d8e243939a6fb627cd048f5cd38595c97e38020dffce", size = 5987667, upload-time = "2025-07-24T18:53:07.157Z" }, + { url = "https://files.pythonhosted.org/packages/c9/57/5f338bf56a7f22584e68d669632e521f0de460bb3749d54533fc3d0fca4f/grpcio-1.74.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2f609a39f62a6f6f05c7512746798282546358a37ea93c1fcbadf8b2fed162e3", size = 6655612, upload-time = "2025-07-24T18:53:09.244Z" }, + { url = "https://files.pythonhosted.org/packages/82/ea/a4820c4c44c8b35b1903a6c72a5bdccec92d0840cf5c858c498c66786ba5/grpcio-1.74.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98e0b7434a7fa4e3e63f250456eaef52499fba5ae661c58cc5b5477d11e7182", size = 6219544, upload-time = "2025-07-24T18:53:11.221Z" }, + { url = "https://files.pythonhosted.org/packages/a4/17/0537630a921365928f5abb6d14c79ba4dcb3e662e0dbeede8af4138d9dcf/grpcio-1.74.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:662456c4513e298db6d7bd9c3b8df6f75f8752f0ba01fb653e252ed4a59b5a5d", size = 6334863, upload-time = "2025-07-24T18:53:12.925Z" }, + { url = "https://files.pythonhosted.org/packages/e2/a6/85ca6cb9af3f13e1320d0a806658dca432ff88149d5972df1f7b51e87127/grpcio-1.74.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3d14e3c4d65e19d8430a4e28ceb71ace4728776fd6c3ce34016947474479683f", size = 7019320, upload-time = "2025-07-24T18:53:15.002Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a7/fe2beab970a1e25d2eff108b3cf4f7d9a53c185106377a3d1989216eba45/grpcio-1.74.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1bf949792cee20d2078323a9b02bacbbae002b9e3b9e2433f2741c15bdeba1c4", size = 6514228, upload-time = "2025-07-24T18:53:16.999Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c2/2f9c945c8a248cebc3ccda1b7a1bf1775b9d7d59e444dbb18c0014e23da6/grpcio-1.74.0-cp311-cp311-win32.whl", hash = "sha256:55b453812fa7c7ce2f5c88be3018fb4a490519b6ce80788d5913f3f9d7da8c7b", size = 3817216, upload-time = "2025-07-24T18:53:20.564Z" }, + { url = "https://files.pythonhosted.org/packages/ff/d1/a9cf9c94b55becda2199299a12b9feef0c79946b0d9d34c989de6d12d05d/grpcio-1.74.0-cp311-cp311-win_amd64.whl", hash = "sha256:86ad489db097141a907c559988c29718719aa3e13370d40e20506f11b4de0d11", size = 4495380, upload-time = "2025-07-24T18:53:22.058Z" }, { url = "https://files.pythonhosted.org/packages/4c/5d/e504d5d5c4469823504f65687d6c8fb97b7f7bf0b34873b7598f1df24630/grpcio-1.74.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:8533e6e9c5bd630ca98062e3a1326249e6ada07d05acf191a77bc33f8948f3d8", size = 5445551, upload-time = "2025-07-24T18:53:23.641Z" }, { url = "https://files.pythonhosted.org/packages/43/01/730e37056f96f2f6ce9f17999af1556df62ee8dab7fa48bceeaab5fd3008/grpcio-1.74.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:2918948864fec2a11721d91568effffbe0a02b23ecd57f281391d986847982f6", size = 10979810, upload-time = "2025-07-24T18:53:25.349Z" }, { url = "https://files.pythonhosted.org/packages/79/3d/09fd100473ea5c47083889ca47ffd356576173ec134312f6aa0e13111dee/grpcio-1.74.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:60d2d48b0580e70d2e1954d0d19fa3c2e60dd7cbed826aca104fff518310d1c5", size = 5941946, upload-time = "2025-07-24T18:53:27.387Z" }, @@ -1256,6 +1423,7 @@ dependencies = [ { name = "pygments" }, { name = "stack-data" }, { name = "traitlets" }, + { name = "typing-extensions", marker = "python_full_version < '3.12'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/9d/02/63a84444a7409b3c0acd1de9ffe524660e0e5d82ee473e78b45e5bfb64a4/ipython-9.2.0.tar.gz", hash = "sha256:62a9373dbc12f28f9feaf4700d052195bf89806279fc8ca11f3f54017d04751b", size = 4424394, upload-time = "2025-04-25T17:55:40.498Z" } wheels = [ @@ -1413,6 +1581,21 @@ version = "1.4.8" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/82/59/7c91426a8ac292e1cdd53a63b6d9439abd573c875c3f92c146767dd33faf/kiwisolver-1.4.8.tar.gz", hash = "sha256:23d5f023bdc8c7e54eb65f03ca5d5bb25b601eac4d7f1a042888a1f45237987e", size = 97538, upload-time = "2024-12-24T18:30:51.519Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/da/ed/c913ee28936c371418cb167b128066ffb20bbf37771eecc2c97edf8a6e4c/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a4d3601908c560bdf880f07d94f31d734afd1bb71e96585cace0e38ef44c6d84", size = 124635, upload-time = "2024-12-24T18:28:51.826Z" }, + { url = "https://files.pythonhosted.org/packages/4c/45/4a7f896f7467aaf5f56ef093d1f329346f3b594e77c6a3c327b2d415f521/kiwisolver-1.4.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:856b269c4d28a5c0d5e6c1955ec36ebfd1651ac00e1ce0afa3e28da95293b561", size = 66717, upload-time = "2024-12-24T18:28:54.256Z" }, + { url = "https://files.pythonhosted.org/packages/5f/b4/c12b3ac0852a3a68f94598d4c8d569f55361beef6159dce4e7b624160da2/kiwisolver-1.4.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2b9a96e0f326205af81a15718a9073328df1173a2619a68553decb7097fd5d7", size = 65413, upload-time = "2024-12-24T18:28:55.184Z" }, + { url = "https://files.pythonhosted.org/packages/a9/98/1df4089b1ed23d83d410adfdc5947245c753bddfbe06541c4aae330e9e70/kiwisolver-1.4.8-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5020c83e8553f770cb3b5fc13faac40f17e0b205bd237aebd21d53d733adb03", size = 1343994, upload-time = "2024-12-24T18:28:57.493Z" }, + { url = "https://files.pythonhosted.org/packages/8d/bf/b4b169b050c8421a7c53ea1ea74e4ef9c335ee9013216c558a047f162d20/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dace81d28c787956bfbfbbfd72fdcef014f37d9b48830829e488fdb32b49d954", size = 1434804, upload-time = "2024-12-24T18:29:00.077Z" }, + { url = "https://files.pythonhosted.org/packages/66/5a/e13bd341fbcf73325ea60fdc8af752addf75c5079867af2e04cc41f34434/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11e1022b524bd48ae56c9b4f9296bce77e15a2e42a502cceba602f804b32bb79", size = 1450690, upload-time = "2024-12-24T18:29:01.401Z" }, + { url = "https://files.pythonhosted.org/packages/9b/4f/5955dcb376ba4a830384cc6fab7d7547bd6759fe75a09564910e9e3bb8ea/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b9b4d2892fefc886f30301cdd80debd8bb01ecdf165a449eb6e78f79f0fabd6", size = 1376839, upload-time = "2024-12-24T18:29:02.685Z" }, + { url = "https://files.pythonhosted.org/packages/3a/97/5edbed69a9d0caa2e4aa616ae7df8127e10f6586940aa683a496c2c280b9/kiwisolver-1.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a96c0e790ee875d65e340ab383700e2b4891677b7fcd30a699146f9384a2bb0", size = 1435109, upload-time = "2024-12-24T18:29:04.113Z" }, + { url = "https://files.pythonhosted.org/packages/13/fc/e756382cb64e556af6c1809a1bbb22c141bbc2445049f2da06b420fe52bf/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:23454ff084b07ac54ca8be535f4174170c1094a4cff78fbae4f73a4bcc0d4dab", size = 2245269, upload-time = "2024-12-24T18:29:05.488Z" }, + { url = "https://files.pythonhosted.org/packages/76/15/e59e45829d7f41c776d138245cabae6515cb4eb44b418f6d4109c478b481/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:87b287251ad6488e95b4f0b4a79a6d04d3ea35fde6340eb38fbd1ca9cd35bbbc", size = 2393468, upload-time = "2024-12-24T18:29:06.79Z" }, + { url = "https://files.pythonhosted.org/packages/e9/39/483558c2a913ab8384d6e4b66a932406f87c95a6080112433da5ed668559/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:b21dbe165081142b1232a240fc6383fd32cdd877ca6cc89eab93e5f5883e1c25", size = 2355394, upload-time = "2024-12-24T18:29:08.24Z" }, + { url = "https://files.pythonhosted.org/packages/01/aa/efad1fbca6570a161d29224f14b082960c7e08268a133fe5dc0f6906820e/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:768cade2c2df13db52475bd28d3a3fac8c9eff04b0e9e2fda0f3760f20b3f7fc", size = 2490901, upload-time = "2024-12-24T18:29:09.653Z" }, + { url = "https://files.pythonhosted.org/packages/c9/4f/15988966ba46bcd5ab9d0c8296914436720dd67fca689ae1a75b4ec1c72f/kiwisolver-1.4.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d47cfb2650f0e103d4bf68b0b5804c68da97272c84bb12850d877a95c056bd67", size = 2312306, upload-time = "2024-12-24T18:29:12.644Z" }, + { url = "https://files.pythonhosted.org/packages/2d/27/bdf1c769c83f74d98cbc34483a972f221440703054894a37d174fba8aa68/kiwisolver-1.4.8-cp311-cp311-win_amd64.whl", hash = "sha256:ed33ca2002a779a2e20eeb06aea7721b6e47f2d4b8a8ece979d8ba9e2a167e34", size = 71966, upload-time = "2024-12-24T18:29:14.089Z" }, + { url = "https://files.pythonhosted.org/packages/4a/c9/9642ea855604aeb2968a8e145fc662edf61db7632ad2e4fb92424be6b6c0/kiwisolver-1.4.8-cp311-cp311-win_arm64.whl", hash = "sha256:16523b40aab60426ffdebe33ac374457cf62863e330a90a0383639ce14bf44b2", size = 65311, upload-time = "2024-12-24T18:29:15.892Z" }, { url = "https://files.pythonhosted.org/packages/fc/aa/cea685c4ab647f349c3bc92d2daf7ae34c8e8cf405a6dcd3a497f58a2ac3/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d6af5e8815fd02997cb6ad9bbed0ee1e60014438ee1a5c2444c96f87b8843502", size = 124152, upload-time = "2024-12-24T18:29:16.85Z" }, { url = "https://files.pythonhosted.org/packages/c5/0b/8db6d2e2452d60d5ebc4ce4b204feeb16176a851fd42462f66ade6808084/kiwisolver-1.4.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bade438f86e21d91e0cf5dd7c0ed00cda0f77c8c1616bd83f9fc157fa6760d31", size = 66555, upload-time = "2024-12-24T18:29:19.146Z" }, { url = "https://files.pythonhosted.org/packages/60/26/d6a0db6785dd35d3ba5bf2b2df0aedc5af089962c6eb2cbf67a15b81369e/kiwisolver-1.4.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b83dc6769ddbc57613280118fb4ce3cd08899cc3369f7d0e0fab518a7cf37fdb", size = 65067, upload-time = "2024-12-24T18:29:20.096Z" }, @@ -1485,6 +1668,16 @@ version = "3.0.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537, upload-time = "2024-10-18T15:21:54.129Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/28/bbf83e3f76936960b850435576dd5e67034e200469571be53f69174a2dfd/MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d", size = 14353, upload-time = "2024-10-18T15:21:02.187Z" }, + { url = "https://files.pythonhosted.org/packages/6c/30/316d194b093cde57d448a4c3209f22e3046c5bb2fb0820b118292b334be7/MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93", size = 12392, upload-time = "2024-10-18T15:21:02.941Z" }, + { url = "https://files.pythonhosted.org/packages/f2/96/9cdafba8445d3a53cae530aaf83c38ec64c4d5427d975c974084af5bc5d2/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832", size = 23984, upload-time = "2024-10-18T15:21:03.953Z" }, + { url = "https://files.pythonhosted.org/packages/f1/a4/aefb044a2cd8d7334c8a47d3fb2c9f328ac48cb349468cc31c20b539305f/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84", size = 23120, upload-time = "2024-10-18T15:21:06.495Z" }, + { url = "https://files.pythonhosted.org/packages/8d/21/5e4851379f88f3fad1de30361db501300d4f07bcad047d3cb0449fc51f8c/MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca", size = 23032, upload-time = "2024-10-18T15:21:07.295Z" }, + { url = "https://files.pythonhosted.org/packages/00/7b/e92c64e079b2d0d7ddf69899c98842f3f9a60a1ae72657c89ce2655c999d/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798", size = 24057, upload-time = "2024-10-18T15:21:08.073Z" }, + { url = "https://files.pythonhosted.org/packages/f9/ac/46f960ca323037caa0a10662ef97d0a4728e890334fc156b9f9e52bcc4ca/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e", size = 23359, upload-time = "2024-10-18T15:21:09.318Z" }, + { url = "https://files.pythonhosted.org/packages/69/84/83439e16197337b8b14b6a5b9c2105fff81d42c2a7c5b58ac7b62ee2c3b1/MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4", size = 23306, upload-time = "2024-10-18T15:21:10.185Z" }, + { url = "https://files.pythonhosted.org/packages/9a/34/a15aa69f01e2181ed8d2b685c0d2f6655d5cca2c4db0ddea775e631918cd/MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d", size = 15094, upload-time = "2024-10-18T15:21:11.005Z" }, + { url = "https://files.pythonhosted.org/packages/da/b8/3a3bd761922d416f3dc5d00bfbed11f66b1ab89a0c2b6e887240a30b0f6b/MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b", size = 15521, upload-time = "2024-10-18T15:21:12.911Z" }, { url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274, upload-time = "2024-10-18T15:21:13.777Z" }, { url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348, upload-time = "2024-10-18T15:21:14.822Z" }, { url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149, upload-time = "2024-10-18T15:21:15.642Z" }, @@ -1534,6 +1727,12 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/26/91/d49359a21893183ed2a5b6c76bec40e0b1dcbf8ca148f864d134897cfc75/matplotlib-3.10.3.tar.gz", hash = "sha256:2f82d2c5bb7ae93aaaa4cd42aca65d76ce6376f83304fa3a630b569aca274df0", size = 34799811, upload-time = "2025-05-08T19:10:54.39Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/f5/bd/af9f655456f60fe1d575f54fb14704ee299b16e999704817a7645dfce6b0/matplotlib-3.10.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0ef061f74cd488586f552d0c336b2f078d43bc00dc473d2c3e7bfee2272f3fa8", size = 8178873, upload-time = "2025-05-08T19:09:53.857Z" }, + { url = "https://files.pythonhosted.org/packages/c2/86/e1c86690610661cd716eda5f9d0b35eaf606ae6c9b6736687cfc8f2d0cd8/matplotlib-3.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d96985d14dc5f4a736bbea4b9de9afaa735f8a0fc2ca75be2fa9e96b2097369d", size = 8052205, upload-time = "2025-05-08T19:09:55.684Z" }, + { url = "https://files.pythonhosted.org/packages/54/51/a9f8e49af3883dacddb2da1af5fca1f7468677f1188936452dd9aaaeb9ed/matplotlib-3.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c5f0283da91e9522bdba4d6583ed9d5521566f63729ffb68334f86d0bb98049", size = 8465823, upload-time = "2025-05-08T19:09:57.442Z" }, + { url = "https://files.pythonhosted.org/packages/e7/e3/c82963a3b86d6e6d5874cbeaa390166458a7f1961bab9feb14d3d1a10f02/matplotlib-3.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdfa07c0ec58035242bc8b2c8aae37037c9a886370eef6850703d7583e19964b", size = 8606464, upload-time = "2025-05-08T19:09:59.471Z" }, + { url = "https://files.pythonhosted.org/packages/0e/34/24da1027e7fcdd9e82da3194c470143c551852757a4b473a09a012f5b945/matplotlib-3.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c0b9849a17bce080a16ebcb80a7b714b5677d0ec32161a2cc0a8e5a6030ae220", size = 9413103, upload-time = "2025-05-08T19:10:03.208Z" }, + { url = "https://files.pythonhosted.org/packages/a6/da/948a017c3ea13fd4a97afad5fdebe2f5bbc4d28c0654510ce6fd6b06b7bd/matplotlib-3.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:eef6ed6c03717083bc6d69c2d7ee8624205c29a8e6ea5a31cd3492ecdbaee1e1", size = 8065492, upload-time = "2025-05-08T19:10:05.271Z" }, { url = "https://files.pythonhosted.org/packages/eb/43/6b80eb47d1071f234ef0c96ca370c2ca621f91c12045f1401b5c9b28a639/matplotlib-3.10.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0ab1affc11d1f495ab9e6362b8174a25afc19c081ba5b0775ef00533a4236eea", size = 8179689, upload-time = "2025-05-08T19:10:07.602Z" }, { url = "https://files.pythonhosted.org/packages/0f/70/d61a591958325c357204870b5e7b164f93f2a8cca1dc6ce940f563909a13/matplotlib-3.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2a818d8bdcafa7ed2eed74487fdb071c09c1ae24152d403952adad11fa3c65b4", size = 8050466, upload-time = "2025-05-08T19:10:09.383Z" }, { url = 
"https://files.pythonhosted.org/packages/e7/75/70c9d2306203148cc7902a961240c5927dd8728afedf35e6a77e105a2985/matplotlib-3.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:748ebc3470c253e770b17d8b0557f0aa85cf8c63fd52f1a61af5b27ec0b7ffee", size = 8456252, upload-time = "2025-05-08T19:10:11.958Z" }, @@ -1644,6 +1843,22 @@ version = "5.1.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/47/1b/1fc6888c74cbd8abad1292dde2ddfcf8fc059e114c97dd6bf16d12f36293/mmh3-5.1.0.tar.gz", hash = "sha256:136e1e670500f177f49ec106a4ebf0adf20d18d96990cc36ea492c651d2b406c", size = 33728, upload-time = "2025-01-25T08:39:43.386Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/56/09/fda7af7fe65928262098382e3bf55950cfbf67d30bf9e47731bf862161e9/mmh3-5.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0b529dcda3f951ff363a51d5866bc6d63cf57f1e73e8961f864ae5010647079d", size = 56098, upload-time = "2025-01-25T08:38:22.917Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ab/84c7bc3f366d6f3bd8b5d9325a10c367685bc17c26dac4c068e2001a4671/mmh3-5.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db1079b3ace965e562cdfc95847312f9273eb2ad3ebea983435c8423e06acd7", size = 40513, upload-time = "2025-01-25T08:38:25.079Z" }, + { url = "https://files.pythonhosted.org/packages/4f/21/25ea58ca4a652bdc83d1528bec31745cce35802381fb4fe3c097905462d2/mmh3-5.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:22d31e3a0ff89b8eb3b826d6fc8e19532998b2aa6b9143698043a1268da413e1", size = 40112, upload-time = "2025-01-25T08:38:25.947Z" }, + { url = "https://files.pythonhosted.org/packages/bd/78/4f12f16ae074ddda6f06745254fdb50f8cf3c85b0bbf7eaca58bed84bf58/mmh3-5.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2139bfbd354cd6cb0afed51c4b504f29bcd687a3b1460b7e89498329cc28a894", size = 102632, upload-time = "2025-01-25T08:38:26.939Z" }, + { url = "https://files.pythonhosted.org/packages/48/11/8f09dc999cf2a09b6138d8d7fc734efb7b7bfdd9adb9383380941caadff0/mmh3-5.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8c8105c6a435bc2cd6ea2ef59558ab1a2976fd4a4437026f562856d08996673a", size = 108884, upload-time = "2025-01-25T08:38:29.159Z" }, + { url = "https://files.pythonhosted.org/packages/bd/91/e59a66538a3364176f6c3f7620eee0ab195bfe26f89a95cbcc7a1fb04b28/mmh3-5.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57730067174a7f36fcd6ce012fe359bd5510fdaa5fe067bc94ed03e65dafb769", size = 106835, upload-time = "2025-01-25T08:38:33.04Z" }, + { url = "https://files.pythonhosted.org/packages/25/14/b85836e21ab90e5cddb85fe79c494ebd8f81d96a87a664c488cc9277668b/mmh3-5.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bde80eb196d7fdc765a318604ded74a4378f02c5b46c17aa48a27d742edaded2", size = 93688, upload-time = "2025-01-25T08:38:34.987Z" }, + { url = "https://files.pythonhosted.org/packages/ac/aa/8bc964067df9262740c95e4cde2d19f149f2224f426654e14199a9e47df6/mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e9c8eddcb441abddeb419c16c56fd74b3e2df9e57f7aa2903221996718435c7a", size = 101569, upload-time = "2025-01-25T08:38:35.983Z" }, + { url = "https://files.pythonhosted.org/packages/70/b6/1fb163cbf919046a64717466c00edabebece3f95c013853fec76dbf2df92/mmh3-5.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:99e07e4acafbccc7a28c076a847fb060ffc1406036bc2005acb1b2af620e53c3", size = 98483, upload-time = "2025-01-25T08:38:38.198Z" }, + { url = "https://files.pythonhosted.org/packages/70/49/ba64c050dd646060f835f1db6b2cd60a6485f3b0ea04976e7a29ace7312e/mmh3-5.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9e25ba5b530e9a7d65f41a08d48f4b3fedc1e89c26486361166a5544aa4cad33", size = 96496, upload-time = "2025-01-25T08:38:39.257Z" }, + { url = "https://files.pythonhosted.org/packages/9e/07/f2751d6a0b535bb865e1066e9c6b80852571ef8d61bce7eb44c18720fbfc/mmh3-5.1.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:bb9bf7475b4d99156ce2f0cf277c061a17560c8c10199c910a680869a278ddc7", size = 105109, upload-time = "2025-01-25T08:38:40.395Z" }, + { url = "https://files.pythonhosted.org/packages/b7/02/30360a5a66f7abba44596d747cc1e6fb53136b168eaa335f63454ab7bb79/mmh3-5.1.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2a1b0878dd281ea3003368ab53ff6f568e175f1b39f281df1da319e58a19c23a", size = 98231, upload-time = "2025-01-25T08:38:42.141Z" }, + { url = "https://files.pythonhosted.org/packages/8c/60/8526b0c750ff4d7ae1266e68b795f14b97758a1d9fcc19f6ecabf9c55656/mmh3-5.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:25f565093ac8b8aefe0f61f8f95c9a9d11dd69e6a9e9832ff0d293511bc36258", size = 97548, upload-time = "2025-01-25T08:38:43.402Z" }, + { url = "https://files.pythonhosted.org/packages/6d/4c/26e1222aca65769280d5427a1ce5875ef4213449718c8f03958d0bf91070/mmh3-5.1.0-cp311-cp311-win32.whl", hash = "sha256:1e3554d8792387eac73c99c6eaea0b3f884e7130eb67986e11c403e4f9b6d372", size = 40810, upload-time = "2025-01-25T08:38:45.143Z" }, + { url = "https://files.pythonhosted.org/packages/98/d5/424ba95062d1212ea615dc8debc8d57983f2242d5e6b82e458b89a117a1e/mmh3-5.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:8ad777a48197882492af50bf3098085424993ce850bdda406a358b6ab74be759", size = 41476, upload-time = "2025-01-25T08:38:46.029Z" }, + { url = "https://files.pythonhosted.org/packages/bd/08/0315ccaf087ba55bb19a6dd3b1e8acd491e74ce7f5f9c4aaa06a90d66441/mmh3-5.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:f29dc4efd99bdd29fe85ed6c81915b17b2ef2cf853abf7213a48ac6fb3eaabe1", size = 38880, upload-time = "2025-01-25T08:38:47.035Z" }, { url = "https://files.pythonhosted.org/packages/f4/47/e5f452bdf16028bfd2edb4e2e35d0441e4a4740f30e68ccd4cfd2fb2c57e/mmh3-5.1.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:45712987367cb9235026e3cbf4334670522a97751abfd00b5bc8bfa022c3311d", size = 56152, upload-time = "2025-01-25T08:38:47.902Z" }, { url = "https://files.pythonhosted.org/packages/60/38/2132d537dc7a7fdd8d2e98df90186c7fcdbd3f14f95502a24ba443c92245/mmh3-5.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b1020735eb35086ab24affbea59bb9082f7f6a0ad517cb89f0fc14f16cea4dae", size = 40564, upload-time = "2025-01-25T08:38:48.839Z" }, { url = "https://files.pythonhosted.org/packages/c0/2a/c52cf000581bfb8d94794f58865658e7accf2fa2e90789269d4ae9560b16/mmh3-5.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:babf2a78ce5513d120c358722a2e3aa7762d6071cd10cede026f8b32452be322", size = 40104, upload-time = "2025-01-25T08:38:49.773Z" }, @@ -1719,6 +1934,16 @@ version = "1.1.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/45/b1/ea4f68038a18c77c9467400d166d74c4ffa536f34761f7983a104357e614/msgpack-1.1.1.tar.gz", hash = "sha256:77b79ce34a2bdab2594f490c8e80dd62a02d650b91a75159a63ec413b8d104cd", size = 173555, upload-time = "2025-06-13T06:52:51.324Z" } wheels = 
[ + { url = "https://files.pythonhosted.org/packages/7f/83/97f24bf9848af23fe2ba04380388216defc49a8af6da0c28cc636d722502/msgpack-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:71ef05c1726884e44f8b1d1773604ab5d4d17729d8491403a705e649116c9558", size = 82728, upload-time = "2025-06-13T06:51:50.68Z" }, + { url = "https://files.pythonhosted.org/packages/aa/7f/2eaa388267a78401f6e182662b08a588ef4f3de6f0eab1ec09736a7aaa2b/msgpack-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:36043272c6aede309d29d56851f8841ba907a1a3d04435e43e8a19928e243c1d", size = 79279, upload-time = "2025-06-13T06:51:51.72Z" }, + { url = "https://files.pythonhosted.org/packages/f8/46/31eb60f4452c96161e4dfd26dbca562b4ec68c72e4ad07d9566d7ea35e8a/msgpack-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a32747b1b39c3ac27d0670122b57e6e57f28eefb725e0b625618d1b59bf9d1e0", size = 423859, upload-time = "2025-06-13T06:51:52.749Z" }, + { url = "https://files.pythonhosted.org/packages/45/16/a20fa8c32825cc7ae8457fab45670c7a8996d7746ce80ce41cc51e3b2bd7/msgpack-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a8b10fdb84a43e50d38057b06901ec9da52baac6983d3f709d8507f3889d43f", size = 429975, upload-time = "2025-06-13T06:51:53.97Z" }, + { url = "https://files.pythonhosted.org/packages/86/ea/6c958e07692367feeb1a1594d35e22b62f7f476f3c568b002a5ea09d443d/msgpack-1.1.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba0c325c3f485dc54ec298d8b024e134acf07c10d494ffa24373bea729acf704", size = 413528, upload-time = "2025-06-13T06:51:55.507Z" }, + { url = "https://files.pythonhosted.org/packages/75/05/ac84063c5dae79722bda9f68b878dc31fc3059adb8633c79f1e82c2cd946/msgpack-1.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:88daaf7d146e48ec71212ce21109b66e06a98e5e44dca47d853cbfe171d6c8d2", size = 413338, upload-time = "2025-06-13T06:51:57.023Z" }, + { url = "https://files.pythonhosted.org/packages/69/e8/fe86b082c781d3e1c09ca0f4dacd457ede60a13119b6ce939efe2ea77b76/msgpack-1.1.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:d8b55ea20dc59b181d3f47103f113e6f28a5e1c89fd5b67b9140edb442ab67f2", size = 422658, upload-time = "2025-06-13T06:51:58.419Z" }, + { url = "https://files.pythonhosted.org/packages/3b/2b/bafc9924df52d8f3bb7c00d24e57be477f4d0f967c0a31ef5e2225e035c7/msgpack-1.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4a28e8072ae9779f20427af07f53bbb8b4aa81151054e882aee333b158da8752", size = 427124, upload-time = "2025-06-13T06:51:59.969Z" }, + { url = "https://files.pythonhosted.org/packages/a2/3b/1f717e17e53e0ed0b68fa59e9188f3f610c79d7151f0e52ff3cd8eb6b2dc/msgpack-1.1.1-cp311-cp311-win32.whl", hash = "sha256:7da8831f9a0fdb526621ba09a281fadc58ea12701bc709e7b8cbc362feabc295", size = 65016, upload-time = "2025-06-13T06:52:01.294Z" }, + { url = "https://files.pythonhosted.org/packages/48/45/9d1780768d3b249accecc5a38c725eb1e203d44a191f7b7ff1941f7df60c/msgpack-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:5fd1b58e1431008a57247d6e7cc4faa41c3607e8e7d4aaf81f7c29ea013cb458", size = 72267, upload-time = "2025-06-13T06:52:02.568Z" }, { url = "https://files.pythonhosted.org/packages/e3/26/389b9c593eda2b8551b2e7126ad3a06af6f9b44274eb3a4f054d48ff7e47/msgpack-1.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ae497b11f4c21558d95de9f64fff7053544f4d1a17731c866143ed6bb4591238", size = 82359, upload-time = "2025-06-13T06:52:03.909Z" }, { url = 
"https://files.pythonhosted.org/packages/ab/65/7d1de38c8a22cf8b1551469159d4b6cf49be2126adc2482de50976084d78/msgpack-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:33be9ab121df9b6b461ff91baac6f2731f83d9b27ed948c5b9d1978ae28bf157", size = 79172, upload-time = "2025-06-13T06:52:05.246Z" }, { url = "https://files.pythonhosted.org/packages/0f/bd/cacf208b64d9577a62c74b677e1ada005caa9b69a05a599889d6fc2ab20a/msgpack-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f64ae8fe7ffba251fecb8408540c34ee9df1c26674c50c4544d72dbf792e5ce", size = 425013, upload-time = "2025-06-13T06:52:06.341Z" }, @@ -1747,6 +1972,23 @@ version = "6.4.4" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/91/2f/a3470242707058fe856fe59241eee5635d79087100b7042a867368863a27/multidict-6.4.4.tar.gz", hash = "sha256:69ee9e6ba214b5245031b76233dd95408a0fd57fdb019ddcc1ead4790932a8e8", size = 90183, upload-time = "2025-05-19T14:16:37.381Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/19/1b/4c6e638195851524a63972c5773c7737bea7e47b1ba402186a37773acee2/multidict-6.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4f5f29794ac0e73d2a06ac03fd18870adc0135a9d384f4a306a951188ed02f95", size = 65515, upload-time = "2025-05-19T14:14:19.767Z" }, + { url = "https://files.pythonhosted.org/packages/25/d5/10e6bca9a44b8af3c7f920743e5fc0c2bcf8c11bf7a295d4cfe00b08fb46/multidict-6.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c04157266344158ebd57b7120d9b0b35812285d26d0e78193e17ef57bfe2979a", size = 38609, upload-time = "2025-05-19T14:14:21.538Z" }, + { url = "https://files.pythonhosted.org/packages/26/b4/91fead447ccff56247edc7f0535fbf140733ae25187a33621771ee598a18/multidict-6.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bb61ffd3ab8310d93427e460f565322c44ef12769f51f77277b4abad7b6f7223", size = 37871, upload-time = "2025-05-19T14:14:22.666Z" }, + { url = "https://files.pythonhosted.org/packages/3b/37/cbc977cae59277e99d15bbda84cc53b5e0c4929ffd91d958347200a42ad0/multidict-6.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e0ba18a9afd495f17c351d08ebbc4284e9c9f7971d715f196b79636a4d0de44", size = 226661, upload-time = "2025-05-19T14:14:24.124Z" }, + { url = "https://files.pythonhosted.org/packages/15/cd/7e0b57fbd4dc2fc105169c4ecce5be1a63970f23bb4ec8c721b67e11953d/multidict-6.4.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:9faf1b1dcaadf9f900d23a0e6d6c8eadd6a95795a0e57fcca73acce0eb912065", size = 223422, upload-time = "2025-05-19T14:14:25.437Z" }, + { url = "https://files.pythonhosted.org/packages/f1/01/1de268da121bac9f93242e30cd3286f6a819e5f0b8896511162d6ed4bf8d/multidict-6.4.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a4d1cb1327c6082c4fce4e2a438483390964c02213bc6b8d782cf782c9b1471f", size = 235447, upload-time = "2025-05-19T14:14:26.793Z" }, + { url = "https://files.pythonhosted.org/packages/d2/8c/8b9a5e4aaaf4f2de14e86181a3a3d7b105077f668b6a06f043ec794f684c/multidict-6.4.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:941f1bec2f5dbd51feeb40aea654c2747f811ab01bdd3422a48a4e4576b7d76a", size = 231455, upload-time = "2025-05-19T14:14:28.149Z" }, + { url = "https://files.pythonhosted.org/packages/35/db/e1817dcbaa10b319c412769cf999b1016890849245d38905b73e9c286862/multidict-6.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:e5f8a146184da7ea12910a4cec51ef85e44f6268467fb489c3caf0cd512f29c2", size = 223666, upload-time = "2025-05-19T14:14:29.584Z" }, + { url = "https://files.pythonhosted.org/packages/4a/e1/66e8579290ade8a00e0126b3d9a93029033ffd84f0e697d457ed1814d0fc/multidict-6.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:232b7237e57ec3c09be97206bfb83a0aa1c5d7d377faa019c68a210fa35831f1", size = 217392, upload-time = "2025-05-19T14:14:30.961Z" }, + { url = "https://files.pythonhosted.org/packages/7b/6f/f8639326069c24a48c7747c2a5485d37847e142a3f741ff3340c88060a9a/multidict-6.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:55ae0721c1513e5e3210bca4fc98456b980b0c2c016679d3d723119b6b202c42", size = 228969, upload-time = "2025-05-19T14:14:32.672Z" }, + { url = "https://files.pythonhosted.org/packages/d2/c3/3d58182f76b960eeade51c89fcdce450f93379340457a328e132e2f8f9ed/multidict-6.4.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:51d662c072579f63137919d7bb8fc250655ce79f00c82ecf11cab678f335062e", size = 217433, upload-time = "2025-05-19T14:14:34.016Z" }, + { url = "https://files.pythonhosted.org/packages/e1/4b/f31a562906f3bd375f3d0e83ce314e4a660c01b16c2923e8229b53fba5d7/multidict-6.4.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0e05c39962baa0bb19a6b210e9b1422c35c093b651d64246b6c2e1a7e242d9fd", size = 225418, upload-time = "2025-05-19T14:14:35.376Z" }, + { url = "https://files.pythonhosted.org/packages/99/89/78bb95c89c496d64b5798434a3deee21996114d4d2c28dd65850bf3a691e/multidict-6.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:d5b1cc3ab8c31d9ebf0faa6e3540fb91257590da330ffe6d2393d4208e638925", size = 235042, upload-time = "2025-05-19T14:14:36.723Z" }, + { url = "https://files.pythonhosted.org/packages/74/91/8780a6e5885a8770442a8f80db86a0887c4becca0e5a2282ba2cae702bc4/multidict-6.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:93ec84488a384cd7b8a29c2c7f467137d8a73f6fe38bb810ecf29d1ade011a7c", size = 230280, upload-time = "2025-05-19T14:14:38.194Z" }, + { url = "https://files.pythonhosted.org/packages/68/c1/fcf69cabd542eb6f4b892469e033567ee6991d361d77abdc55e3a0f48349/multidict-6.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b308402608493638763abc95f9dc0030bbd6ac6aff784512e8ac3da73a88af08", size = 223322, upload-time = "2025-05-19T14:14:40.015Z" }, + { url = "https://files.pythonhosted.org/packages/b8/85/5b80bf4b83d8141bd763e1d99142a9cdfd0db83f0739b4797172a4508014/multidict-6.4.4-cp311-cp311-win32.whl", hash = "sha256:343892a27d1a04d6ae455ecece12904d242d299ada01633d94c4f431d68a8c49", size = 35070, upload-time = "2025-05-19T14:14:41.904Z" }, + { url = "https://files.pythonhosted.org/packages/09/66/0bed198ffd590ab86e001f7fa46b740d58cf8ff98c2f254e4a36bf8861ad/multidict-6.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:73484a94f55359780c0f458bbd3c39cb9cf9c182552177d2136e828269dee529", size = 38667, upload-time = "2025-05-19T14:14:43.534Z" }, { url = "https://files.pythonhosted.org/packages/d2/b5/5675377da23d60875fe7dae6be841787755878e315e2f517235f22f59e18/multidict-6.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:dc388f75a1c00000824bf28b7633e40854f4127ede80512b44c3cfeeea1839a2", size = 64293, upload-time = "2025-05-19T14:14:44.724Z" }, { url = "https://files.pythonhosted.org/packages/34/a7/be384a482754bb8c95d2bbe91717bf7ccce6dc38c18569997a11f95aa554/multidict-6.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:98af87593a666f739d9dba5d0ae86e01b0e1a9cfcd2e30d2d361fbbbd1a9162d", size 
= 38096, upload-time = "2025-05-19T14:14:45.95Z" }, { url = "https://files.pythonhosted.org/packages/66/6d/d59854bb4352306145bdfd1704d210731c1bb2c890bfee31fb7bbc1c4c7f/multidict-6.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aff4cafea2d120327d55eadd6b7f1136a8e5a0ecf6fb3b6863e8aca32cd8e50a", size = 37214, upload-time = "2025-05-19T14:14:47.158Z" }, @@ -1825,6 +2067,16 @@ version = "2.2.6" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" }, + { url = "https://files.pythonhosted.org/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" }, + { url = "https://files.pythonhosted.org/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" }, + { url = "https://files.pythonhosted.org/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" }, + { url = "https://files.pythonhosted.org/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" }, + { url = "https://files.pythonhosted.org/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" }, + { url = "https://files.pythonhosted.org/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" }, + { url = "https://files.pythonhosted.org/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" }, + { url = "https://files.pythonhosted.org/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = 
"2025-05-17T21:33:50.273Z" }, + { url = "https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" }, { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" }, { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" }, { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" }, @@ -1894,7 +2146,7 @@ name = "nvidia-cudnn-cu12" version = "9.1.0.70" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741, upload-time = "2024-04-22T15:24:15.253Z" }, @@ -1905,7 +2157,7 @@ name = "nvidia-cufft-cu12" version = "11.2.1.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117, upload-time = "2024-04-03T20:57:40.402Z" }, @@ -1924,9 +2176,9 @@ name = "nvidia-cusolver-cu12" version = "11.6.1.9" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash 
= "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057, upload-time = "2024-04-03T20:58:28.735Z" }, @@ -1937,7 +2189,7 @@ name = "nvidia-cusparse-cu12" version = "12.3.1.170" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763, upload-time = "2024-04-03T20:58:59.995Z" }, @@ -2022,19 +2274,19 @@ wheels = [ [[package]] name = "opencv-python" -version = "4.12.0.88" +version = "4.11.0.86" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ac/71/25c98e634b6bdeca4727c7f6d6927b056080668c5008ad3c8fc9e7f8f6ec/opencv-python-4.12.0.88.tar.gz", hash = "sha256:8b738389cede219405f6f3880b851efa3415ccd674752219377353f017d2994d", size = 95373294, upload-time = "2025-07-07T09:20:52.389Z" } +sdist = { url = "https://files.pythonhosted.org/packages/17/06/68c27a523103dad5837dc5b87e71285280c4f098c60e4fe8a8db6486ab09/opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4", size = 95171956, upload-time = "2025-01-16T13:52:24.737Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/85/68/3da40142e7c21e9b1d4e7ddd6c58738feb013203e6e4b803d62cdd9eb96b/opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:f9a1f08883257b95a5764bf517a32d75aec325319c8ed0f89739a57fae9e92a5", size = 37877727, upload-time = "2025-07-07T09:13:31.47Z" }, - { url = "https://files.pythonhosted.org/packages/33/7c/042abe49f58d6ee7e1028eefc3334d98ca69b030e3b567fe245a2b28ea6f/opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:812eb116ad2b4de43ee116fcd8991c3a687f099ada0b04e68f64899c09448e81", size = 57326471, upload-time = "2025-07-07T09:13:41.26Z" }, - { url = "https://files.pythonhosted.org/packages/62/3a/440bd64736cf8116f01f3b7f9f2e111afb2e02beb2ccc08a6458114a6b5d/opencv_python-4.12.0.88-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:51fd981c7df6af3e8f70b1556696b05224c4e6b6777bdd2a46b3d4fb09de1a92", size = 45887139, upload-time = "2025-07-07T09:13:50.761Z" }, - { url = "https://files.pythonhosted.org/packages/68/1f/795e7f4aa2eacc59afa4fb61a2e35e510d06414dd5a802b51a012d691b37/opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:092c16da4c5a163a818f120c22c5e4a2f96e0db4f24e659c701f1fe629a690f9", size = 67041680, upload-time = "2025-07-07T09:14:01.995Z" }, - { url = "https://files.pythonhosted.org/packages/02/96/213fea371d3cb2f1d537612a105792aa0a6659fb2665b22cad709a75bd94/opencv_python-4.12.0.88-cp37-abi3-win32.whl", hash = "sha256:ff554d3f725b39878ac6a2e1fa232ec509c36130927afc18a1719ebf4fbf4357", size = 30284131, upload-time = "2025-07-07T09:14:08.819Z" }, - { url = "https://files.pythonhosted.org/packages/fa/80/eb88edc2e2b11cd2dd2e56f1c80b5784d11d6e6b7f04a1145df64df40065/opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:d98edb20aa932fd8ebd276a72627dad9dc097695b3d435a4257557bbb49a79d2", size = 39000307, 
upload-time = "2025-07-07T09:14:16.641Z" }, + { url = "https://files.pythonhosted.org/packages/05/4d/53b30a2a3ac1f75f65a59eb29cf2ee7207ce64867db47036ad61743d5a23/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a", size = 37326322, upload-time = "2025-01-16T13:52:25.887Z" }, + { url = "https://files.pythonhosted.org/packages/3b/84/0a67490741867eacdfa37bc18df96e08a9d579583b419010d7f3da8ff503/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66", size = 56723197, upload-time = "2025-01-16T13:55:21.222Z" }, + { url = "https://files.pythonhosted.org/packages/f3/bd/29c126788da65c1fb2b5fb621b7fed0ed5f9122aa22a0868c5e2c15c6d23/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202", size = 42230439, upload-time = "2025-01-16T13:51:35.822Z" }, + { url = "https://files.pythonhosted.org/packages/2c/8b/90eb44a40476fa0e71e05a0283947cfd74a5d36121a11d926ad6f3193cc4/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d", size = 62986597, upload-time = "2025-01-16T13:52:08.836Z" }, + { url = "https://files.pythonhosted.org/packages/fb/d7/1d5941a9dde095468b288d989ff6539dd69cd429dbf1b9e839013d21b6f0/opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b", size = 29384337, upload-time = "2025-01-16T13:52:13.549Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" }, ] [[package]] @@ -2254,6 +2506,13 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213, upload-time = "2024-09-20T13:10:04.827Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/44/d9502bf0ed197ba9bf1103c9867d5904ddcaf869e52329787fc54ed70cc8/pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039", size = 12602222, upload-time = "2024-09-20T13:08:56.254Z" }, + { url = "https://files.pythonhosted.org/packages/52/11/9eac327a38834f162b8250aab32a6781339c69afe7574368fffe46387edf/pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd", size = 11321274, upload-time = "2024-09-20T13:08:58.645Z" }, + { url = "https://files.pythonhosted.org/packages/45/fb/c4beeb084718598ba19aa9f5abbc8aed8b42f90930da861fcb1acdb54c3a/pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698", size = 15579836, upload-time = "2024-09-20T19:01:57.571Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5f/4dba1d39bb9c38d574a9a22548c540177f78ea47b32f99c0ff2ec499fac5/pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc", size = 13058505, upload-time = "2024-09-20T13:09:01.501Z" }, + { url = "https://files.pythonhosted.org/packages/b9/57/708135b90391995361636634df1f1130d03ba456e95bcf576fada459115a/pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3", size = 16744420, upload-time = "2024-09-20T19:02:00.678Z" }, + { url = "https://files.pythonhosted.org/packages/86/4a/03ed6b7ee323cf30404265c284cee9c65c56a212e0a08d9ee06984ba2240/pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32", size = 14440457, upload-time = "2024-09-20T13:09:04.105Z" }, + { url = "https://files.pythonhosted.org/packages/ed/8c/87ddf1fcb55d11f9f847e3c69bb1c6f8e46e2f40ab1a2d2abadb2401b007/pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5", size = 11617166, upload-time = "2024-09-20T13:09:06.917Z" }, { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893, upload-time = "2024-09-20T13:09:09.655Z" }, { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475, upload-time = "2024-09-20T13:09:14.718Z" }, { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645, upload-time = "2024-09-20T19:02:03.88Z" }, @@ -2312,6 +2571,17 @@ version = "11.2.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/af/cb/bb5c01fcd2a69335b86c22142b2bccfc3464087efb7fd382eee5ffc7fdf7/pillow-11.2.1.tar.gz", hash = "sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6", size = 47026707, upload-time = "2025-04-12T17:50:03.289Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/68/08/3fbf4b98924c73037a8e8b4c2c774784805e0fb4ebca6c5bb60795c40125/pillow-11.2.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:35ca289f712ccfc699508c4658a1d14652e8033e9b69839edf83cbdd0ba39e70", size = 3198450, upload-time = "2025-04-12T17:47:37.135Z" }, + { url = "https://files.pythonhosted.org/packages/84/92/6505b1af3d2849d5e714fc75ba9e69b7255c05ee42383a35a4d58f576b16/pillow-11.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0409af9f829f87a2dfb7e259f78f317a5351f2045158be321fd135973fff7bf", size = 3030550, upload-time = "2025-04-12T17:47:39.345Z" }, + { url = "https://files.pythonhosted.org/packages/3c/8c/ac2f99d2a70ff966bc7eb13dacacfaab57c0549b2ffb351b6537c7840b12/pillow-11.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4e5c5edee874dce4f653dbe59db7c73a600119fbea8d31f53423586ee2aafd7", size = 4415018, upload-time = "2025-04-12T17:47:41.128Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/e3/0a58b5d838687f40891fff9cbaf8669f90c96b64dc8f91f87894413856c6/pillow-11.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b93a07e76d13bff9444f1a029e0af2964e654bfc2e2c2d46bfd080df5ad5f3d8", size = 4498006, upload-time = "2025-04-12T17:47:42.912Z" }, + { url = "https://files.pythonhosted.org/packages/21/f5/6ba14718135f08fbfa33308efe027dd02b781d3f1d5c471444a395933aac/pillow-11.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:e6def7eed9e7fa90fde255afaf08060dc4b343bbe524a8f69bdd2a2f0018f600", size = 4517773, upload-time = "2025-04-12T17:47:44.611Z" }, + { url = "https://files.pythonhosted.org/packages/20/f2/805ad600fc59ebe4f1ba6129cd3a75fb0da126975c8579b8f57abeb61e80/pillow-11.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8f4f3724c068be008c08257207210c138d5f3731af6c155a81c2b09a9eb3a788", size = 4607069, upload-time = "2025-04-12T17:47:46.46Z" }, + { url = "https://files.pythonhosted.org/packages/71/6b/4ef8a288b4bb2e0180cba13ca0a519fa27aa982875882392b65131401099/pillow-11.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a0a6709b47019dff32e678bc12c63008311b82b9327613f534e496dacaefb71e", size = 4583460, upload-time = "2025-04-12T17:47:49.255Z" }, + { url = "https://files.pythonhosted.org/packages/62/ae/f29c705a09cbc9e2a456590816e5c234382ae5d32584f451c3eb41a62062/pillow-11.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f6b0c664ccb879109ee3ca702a9272d877f4fcd21e5eb63c26422fd6e415365e", size = 4661304, upload-time = "2025-04-12T17:47:51.067Z" }, + { url = "https://files.pythonhosted.org/packages/6e/1a/c8217b6f2f73794a5e219fbad087701f412337ae6dbb956db37d69a9bc43/pillow-11.2.1-cp311-cp311-win32.whl", hash = "sha256:cc5d875d56e49f112b6def6813c4e3d3036d269c008bf8aef72cd08d20ca6df6", size = 2331809, upload-time = "2025-04-12T17:47:54.425Z" }, + { url = "https://files.pythonhosted.org/packages/e2/72/25a8f40170dc262e86e90f37cb72cb3de5e307f75bf4b02535a61afcd519/pillow-11.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:0f5c7eda47bf8e3c8a283762cab94e496ba977a420868cb819159980b6709193", size = 2676338, upload-time = "2025-04-12T17:47:56.535Z" }, + { url = "https://files.pythonhosted.org/packages/06/9e/76825e39efee61efea258b479391ca77d64dbd9e5804e4ad0fa453b4ba55/pillow-11.2.1-cp311-cp311-win_arm64.whl", hash = "sha256:4d375eb838755f2528ac8cbc926c3e31cc49ca4ad0cf79cff48b20e30634a4a7", size = 2414918, upload-time = "2025-04-12T17:47:58.217Z" }, { url = "https://files.pythonhosted.org/packages/c7/40/052610b15a1b8961f52537cc8326ca6a881408bc2bdad0d852edeb6ed33b/pillow-11.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f", size = 3190185, upload-time = "2025-04-12T17:48:00.417Z" }, { url = "https://files.pythonhosted.org/packages/e5/7e/b86dbd35a5f938632093dc40d1682874c33dcfe832558fc80ca56bfcb774/pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b", size = 3030306, upload-time = "2025-04-12T17:48:02.391Z" }, { url = "https://files.pythonhosted.org/packages/a4/5c/467a161f9ed53e5eab51a42923c33051bf8d1a2af4626ac04f5166e58e0c/pillow-11.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d", size = 4416121, upload-time = "2025-04-12T17:48:04.554Z" }, @@ -2345,6 +2615,13 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b3/92/1ca0c3f09233bd7decf8f7105a1c4e3162fb9142128c74adad0fb361b7eb/pillow-11.2.1-cp313-cp313t-win32.whl", hash = "sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd", size = 2335774, upload-time = "2025-04-12T17:49:04.889Z" }, { url = "https://files.pythonhosted.org/packages/a5/ac/77525347cb43b83ae905ffe257bbe2cc6fd23acb9796639a1f56aa59d191/pillow-11.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e", size = 2681895, upload-time = "2025-04-12T17:49:06.635Z" }, { url = "https://files.pythonhosted.org/packages/67/32/32dc030cfa91ca0fc52baebbba2e009bb001122a1daa8b6a79ad830b38d3/pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681", size = 2417234, upload-time = "2025-04-12T17:49:08.399Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ad/2613c04633c7257d9481ab21d6b5364b59fc5d75faafd7cb8693523945a3/pillow-11.2.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:80f1df8dbe9572b4b7abdfa17eb5d78dd620b1d55d9e25f834efdbee872d3aed", size = 3181734, upload-time = "2025-04-12T17:49:46.789Z" }, + { url = "https://files.pythonhosted.org/packages/a4/fd/dcdda4471ed667de57bb5405bb42d751e6cfdd4011a12c248b455c778e03/pillow-11.2.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ea926cfbc3957090becbcbbb65ad177161a2ff2ad578b5a6ec9bb1e1cd78753c", size = 2999841, upload-time = "2025-04-12T17:49:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/ac/89/8a2536e95e77432833f0db6fd72a8d310c8e4272a04461fb833eb021bf94/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:738db0e0941ca0376804d4de6a782c005245264edaa253ffce24e5a15cbdc7bd", size = 3437470, upload-time = "2025-04-12T17:49:50.831Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8f/abd47b73c60712f88e9eda32baced7bfc3e9bd6a7619bb64b93acff28c3e/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9db98ab6565c69082ec9b0d4e40dd9f6181dab0dd236d26f7a50b8b9bfbd5076", size = 3460013, upload-time = "2025-04-12T17:49:53.278Z" }, + { url = "https://files.pythonhosted.org/packages/f6/20/5c0a0aa83b213b7a07ec01e71a3d6ea2cf4ad1d2c686cc0168173b6089e7/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:036e53f4170e270ddb8797d4c590e6dd14d28e15c7da375c18978045f7e6c37b", size = 3527165, upload-time = "2025-04-12T17:49:55.164Z" }, + { url = "https://files.pythonhosted.org/packages/58/0e/2abab98a72202d91146abc839e10c14f7cf36166f12838ea0c4db3ca6ecb/pillow-11.2.1-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14f73f7c291279bd65fda51ee87affd7c1e097709f7fdd0188957a16c264601f", size = 3571586, upload-time = "2025-04-12T17:49:57.171Z" }, + { url = "https://files.pythonhosted.org/packages/21/2c/5e05f58658cf49b6667762cca03d6e7d85cededde2caf2ab37b81f80e574/pillow-11.2.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:208653868d5c9ecc2b327f9b9ef34e0e42a4cdd172c2988fd81d62d2bc9bc044", size = 2674751, upload-time = "2025-04-12T17:49:59.628Z" }, ] [[package]] @@ -2415,6 +2692,22 @@ version = "0.3.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a6/16/43264e4a779dd8588c21a70f0709665ee8f611211bdd2c87d952cfa7c776/propcache-0.3.2.tar.gz", hash = "sha256:20d7d62e4e7ef05f221e0db2856b979540686342e7dd9973b815599c7057e168", size = 44139, upload-time = 
"2025-06-09T22:56:06.081Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/80/8d/e8b436717ab9c2cfc23b116d2c297305aa4cd8339172a456d61ebf5669b8/propcache-0.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0b8d2f607bd8f80ddc04088bc2a037fdd17884a6fcadc47a96e334d72f3717be", size = 74207, upload-time = "2025-06-09T22:54:05.399Z" }, + { url = "https://files.pythonhosted.org/packages/d6/29/1e34000e9766d112171764b9fa3226fa0153ab565d0c242c70e9945318a7/propcache-0.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:06766d8f34733416e2e34f46fea488ad5d60726bb9481d3cddf89a6fa2d9603f", size = 43648, upload-time = "2025-06-09T22:54:08.023Z" }, + { url = "https://files.pythonhosted.org/packages/46/92/1ad5af0df781e76988897da39b5f086c2bf0f028b7f9bd1f409bb05b6874/propcache-0.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2dc1f4a1df4fecf4e6f68013575ff4af84ef6f478fe5344317a65d38a8e6dc9", size = 43496, upload-time = "2025-06-09T22:54:09.228Z" }, + { url = "https://files.pythonhosted.org/packages/b3/ce/e96392460f9fb68461fabab3e095cb00c8ddf901205be4eae5ce246e5b7e/propcache-0.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be29c4f4810c5789cf10ddf6af80b041c724e629fa51e308a7a0fb19ed1ef7bf", size = 217288, upload-time = "2025-06-09T22:54:10.466Z" }, + { url = "https://files.pythonhosted.org/packages/c5/2a/866726ea345299f7ceefc861a5e782b045545ae6940851930a6adaf1fca6/propcache-0.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d61f6970ecbd8ff2e9360304d5c8876a6abd4530cb752c06586849ac8a9dc9", size = 227456, upload-time = "2025-06-09T22:54:11.828Z" }, + { url = "https://files.pythonhosted.org/packages/de/03/07d992ccb6d930398689187e1b3c718339a1c06b8b145a8d9650e4726166/propcache-0.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:62180e0b8dbb6b004baec00a7983e4cc52f5ada9cd11f48c3528d8cfa7b96a66", size = 225429, upload-time = "2025-06-09T22:54:13.823Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/116ba39448753b1330f48ab8ba927dcd6cf0baea8a0ccbc512dfb49ba670/propcache-0.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c144ca294a204c470f18cf4c9d78887810d04a3e2fbb30eea903575a779159df", size = 213472, upload-time = "2025-06-09T22:54:15.232Z" }, + { url = "https://files.pythonhosted.org/packages/a6/85/f01f5d97e54e428885a5497ccf7f54404cbb4f906688a1690cd51bf597dc/propcache-0.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5c2a784234c28854878d68978265617aa6dc0780e53d44b4d67f3651a17a9a2", size = 204480, upload-time = "2025-06-09T22:54:17.104Z" }, + { url = "https://files.pythonhosted.org/packages/e3/79/7bf5ab9033b8b8194cc3f7cf1aaa0e9c3256320726f64a3e1f113a812dce/propcache-0.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5745bc7acdafa978ca1642891b82c19238eadc78ba2aaa293c6863b304e552d7", size = 214530, upload-time = "2025-06-09T22:54:18.512Z" }, + { url = "https://files.pythonhosted.org/packages/31/0b/bd3e0c00509b609317df4a18e6b05a450ef2d9a963e1d8bc9c9415d86f30/propcache-0.3.2-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:c0075bf773d66fa8c9d41f66cc132ecc75e5bb9dd7cce3cfd14adc5ca184cb95", size = 205230, upload-time = "2025-06-09T22:54:19.947Z" }, + { url = "https://files.pythonhosted.org/packages/7a/23/fae0ff9b54b0de4e819bbe559508da132d5683c32d84d0dc2ccce3563ed4/propcache-0.3.2-cp311-cp311-musllinux_1_2_i686.whl", hash = 
"sha256:5f57aa0847730daceff0497f417c9de353c575d8da3579162cc74ac294c5369e", size = 206754, upload-time = "2025-06-09T22:54:21.716Z" }, + { url = "https://files.pythonhosted.org/packages/b7/7f/ad6a3c22630aaa5f618b4dc3c3598974a72abb4c18e45a50b3cdd091eb2f/propcache-0.3.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:eef914c014bf72d18efb55619447e0aecd5fb7c2e3fa7441e2e5d6099bddff7e", size = 218430, upload-time = "2025-06-09T22:54:23.17Z" }, + { url = "https://files.pythonhosted.org/packages/5b/2c/ba4f1c0e8a4b4c75910742f0d333759d441f65a1c7f34683b4a74c0ee015/propcache-0.3.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2a4092e8549031e82facf3decdbc0883755d5bbcc62d3aea9d9e185549936dcf", size = 223884, upload-time = "2025-06-09T22:54:25.539Z" }, + { url = "https://files.pythonhosted.org/packages/88/e4/ebe30fc399e98572019eee82ad0caf512401661985cbd3da5e3140ffa1b0/propcache-0.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:85871b050f174bc0bfb437efbdb68aaf860611953ed12418e4361bc9c392749e", size = 211480, upload-time = "2025-06-09T22:54:26.892Z" }, + { url = "https://files.pythonhosted.org/packages/96/0a/7d5260b914e01d1d0906f7f38af101f8d8ed0dc47426219eeaf05e8ea7c2/propcache-0.3.2-cp311-cp311-win32.whl", hash = "sha256:36c8d9b673ec57900c3554264e630d45980fd302458e4ac801802a7fd2ef7897", size = 37757, upload-time = "2025-06-09T22:54:28.241Z" }, + { url = "https://files.pythonhosted.org/packages/e1/2d/89fe4489a884bc0da0c3278c552bd4ffe06a1ace559db5ef02ef24ab446b/propcache-0.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53af8cb6a781b02d2ea079b5b853ba9430fcbe18a8e3ce647d5982a3ff69f39", size = 41500, upload-time = "2025-06-09T22:54:29.4Z" }, { url = "https://files.pythonhosted.org/packages/a8/42/9ca01b0a6f48e81615dca4765a8f1dd2c057e0540f6116a27dc5ee01dfb6/propcache-0.3.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8de106b6c84506b31c27168582cd3cb3000a6412c16df14a8628e5871ff83c10", size = 73674, upload-time = "2025-06-09T22:54:30.551Z" }, { url = "https://files.pythonhosted.org/packages/af/6e/21293133beb550f9c901bbece755d582bfaf2176bee4774000bd4dd41884/propcache-0.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:28710b0d3975117239c76600ea351934ac7b5ff56e60953474342608dbbb6154", size = 43570, upload-time = "2025-06-09T22:54:32.296Z" }, { url = "https://files.pythonhosted.org/packages/0c/c8/0393a0a3a2b8760eb3bde3c147f62b20044f0ddac81e9d6ed7318ec0d852/propcache-0.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce26862344bdf836650ed2487c3d724b00fbfec4233a1013f597b78c1cb73615", size = 43094, upload-time = "2025-06-09T22:54:33.929Z" }, @@ -2546,6 +2839,15 @@ version = "20.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a2/ee/a7810cb9f3d6e9238e61d312076a9859bf3668fd21c69744de9532383912/pyarrow-20.0.0.tar.gz", hash = "sha256:febc4a913592573c8d5805091a6c2b5064c8bd6e002131f01061797d91c783c1", size = 1125187, upload-time = "2025-04-27T12:34:23.264Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/47/a2/b7930824181ceadd0c63c1042d01fa4ef63eee233934826a7a2a9af6e463/pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:24ca380585444cb2a31324c546a9a56abbe87e26069189e14bdba19c86c049f0", size = 30856035, upload-time = "2025-04-27T12:28:40.78Z" }, + { url = "https://files.pythonhosted.org/packages/9b/18/c765770227d7f5bdfa8a69f64b49194352325c66a5c3bb5e332dfd5867d9/pyarrow-20.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = 
"sha256:95b330059ddfdc591a3225f2d272123be26c8fa76e8c9ee1a77aad507361cfdb", size = 32309552, upload-time = "2025-04-27T12:28:47.051Z" }, + { url = "https://files.pythonhosted.org/packages/44/fb/dfb2dfdd3e488bb14f822d7335653092dde150cffc2da97de6e7500681f9/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f0fb1041267e9968c6d0d2ce3ff92e3928b243e2b6d11eeb84d9ac547308232", size = 41334704, upload-time = "2025-04-27T12:28:55.064Z" }, + { url = "https://files.pythonhosted.org/packages/58/0d/08a95878d38808051a953e887332d4a76bc06c6ee04351918ee1155407eb/pyarrow-20.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8ff87cc837601532cc8242d2f7e09b4e02404de1b797aee747dd4ba4bd6313f", size = 42399836, upload-time = "2025-04-27T12:29:02.13Z" }, + { url = "https://files.pythonhosted.org/packages/f3/cd/efa271234dfe38f0271561086eedcad7bc0f2ddd1efba423916ff0883684/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:7a3a5dcf54286e6141d5114522cf31dd67a9e7c9133d150799f30ee302a7a1ab", size = 40711789, upload-time = "2025-04-27T12:29:09.951Z" }, + { url = "https://files.pythonhosted.org/packages/46/1f/7f02009bc7fc8955c391defee5348f510e589a020e4b40ca05edcb847854/pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a6ad3e7758ecf559900261a4df985662df54fb7fdb55e8e3b3aa99b23d526b62", size = 42301124, upload-time = "2025-04-27T12:29:17.187Z" }, + { url = "https://files.pythonhosted.org/packages/4f/92/692c562be4504c262089e86757a9048739fe1acb4024f92d39615e7bab3f/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6bb830757103a6cb300a04610e08d9636f0cd223d32f388418ea893a3e655f1c", size = 42916060, upload-time = "2025-04-27T12:29:24.253Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ec/9f5c7e7c828d8e0a3c7ef50ee62eca38a7de2fa6eb1b8fa43685c9414fef/pyarrow-20.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:96e37f0766ecb4514a899d9a3554fadda770fb57ddf42b63d80f14bc20aa7db3", size = 44547640, upload-time = "2025-04-27T12:29:32.782Z" }, + { url = "https://files.pythonhosted.org/packages/54/96/46613131b4727f10fd2ffa6d0d6f02efcc09a0e7374eff3b5771548aa95b/pyarrow-20.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3346babb516f4b6fd790da99b98bed9708e3f02e734c84971faccb20736848dc", size = 25781491, upload-time = "2025-04-27T12:29:38.464Z" }, { url = "https://files.pythonhosted.org/packages/a1/d6/0c10e0d54f6c13eb464ee9b67a68b8c71bcf2f67760ef5b6fbcddd2ab05f/pyarrow-20.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:75a51a5b0eef32727a247707d4755322cb970be7e935172b6a3a9f9ae98404ba", size = 30815067, upload-time = "2025-04-27T12:29:44.384Z" }, { url = "https://files.pythonhosted.org/packages/7e/e2/04e9874abe4094a06fd8b0cbb0f1312d8dd7d707f144c2ec1e5e8f452ffa/pyarrow-20.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:211d5e84cecc640c7a3ab900f930aaff5cd2702177e0d562d426fb7c4f737781", size = 32297128, upload-time = "2025-04-27T12:29:52.038Z" }, { url = "https://files.pythonhosted.org/packages/31/fd/c565e5dcc906a3b471a83273039cb75cb79aad4a2d4a12f76cc5ae90a4b8/pyarrow-20.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ba3cf4182828be7a896cbd232aa8dd6a31bd1f9e32776cc3796c012855e1199", size = 41334890, upload-time = "2025-04-27T12:29:59.452Z" }, @@ -2671,6 +2973,20 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/ad/88/5f2260bdfae97aabf98f1778d43f69574390ad787afb646292a638c923d4/pydantic_core-2.33.2.tar.gz", hash = 
"sha256:7cb8bc3605c29176e1b105350d2e6474142d7c1bd1d9327c4a9bdb46bf827acc", size = 435195, upload-time = "2025-04-23T18:33:52.104Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/8d/71db63483d518cbbf290261a1fc2839d17ff89fce7089e08cad07ccfce67/pydantic_core-2.33.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c5b0a576fb381edd6d27f0a85915c6daf2f8138dc5c267a57c08a62900758c7", size = 2028584, upload-time = "2025-04-23T18:31:03.106Z" }, + { url = "https://files.pythonhosted.org/packages/24/2f/3cfa7244ae292dd850989f328722d2aef313f74ffc471184dc509e1e4e5a/pydantic_core-2.33.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e799c050df38a639db758c617ec771fd8fb7a5f8eaaa4b27b101f266b216a246", size = 1855071, upload-time = "2025-04-23T18:31:04.621Z" }, + { url = "https://files.pythonhosted.org/packages/b3/d3/4ae42d33f5e3f50dd467761304be2fa0a9417fbf09735bc2cce003480f2a/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dc46a01bf8d62f227d5ecee74178ffc448ff4e5197c756331f71efcc66dc980f", size = 1897823, upload-time = "2025-04-23T18:31:06.377Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f3/aa5976e8352b7695ff808599794b1fba2a9ae2ee954a3426855935799488/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a144d4f717285c6d9234a66778059f33a89096dfb9b39117663fd8413d582dcc", size = 1983792, upload-time = "2025-04-23T18:31:07.93Z" }, + { url = "https://files.pythonhosted.org/packages/d5/7a/cda9b5a23c552037717f2b2a5257e9b2bfe45e687386df9591eff7b46d28/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:73cf6373c21bc80b2e0dc88444f41ae60b2f070ed02095754eb5a01df12256de", size = 2136338, upload-time = "2025-04-23T18:31:09.283Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9f/b8f9ec8dd1417eb9da784e91e1667d58a2a4a7b7b34cf4af765ef663a7e5/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dc625f4aa79713512d1976fe9f0bc99f706a9dee21dfd1810b4bbbf228d0e8a", size = 2730998, upload-time = "2025-04-23T18:31:11.7Z" }, + { url = "https://files.pythonhosted.org/packages/47/bc/cd720e078576bdb8255d5032c5d63ee5c0bf4b7173dd955185a1d658c456/pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b21b5549499972441da4758d662aeea93f1923f953e9cbaff14b8b9565aef", size = 2003200, upload-time = "2025-04-23T18:31:13.536Z" }, + { url = "https://files.pythonhosted.org/packages/ca/22/3602b895ee2cd29d11a2b349372446ae9727c32e78a94b3d588a40fdf187/pydantic_core-2.33.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bdc25f3681f7b78572699569514036afe3c243bc3059d3942624e936ec93450e", size = 2113890, upload-time = "2025-04-23T18:31:15.011Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e6/e3c5908c03cf00d629eb38393a98fccc38ee0ce8ecce32f69fc7d7b558a7/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fe5b32187cbc0c862ee201ad66c30cf218e5ed468ec8dc1cf49dec66e160cc4d", size = 2073359, upload-time = "2025-04-23T18:31:16.393Z" }, + { url = "https://files.pythonhosted.org/packages/12/e7/6a36a07c59ebefc8777d1ffdaf5ae71b06b21952582e4b07eba88a421c79/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:bc7aee6f634a6f4a95676fcb5d6559a2c2a390330098dba5e5a5f28a2e4ada30", size = 2245883, upload-time = "2025-04-23T18:31:17.892Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/3f/59b3187aaa6cc0c1e6616e8045b284de2b6a87b027cce2ffcea073adf1d2/pydantic_core-2.33.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:235f45e5dbcccf6bd99f9f472858849f73d11120d76ea8707115415f8e5ebebf", size = 2241074, upload-time = "2025-04-23T18:31:19.205Z" }, + { url = "https://files.pythonhosted.org/packages/e0/ed/55532bb88f674d5d8f67ab121a2a13c385df382de2a1677f30ad385f7438/pydantic_core-2.33.2-cp311-cp311-win32.whl", hash = "sha256:6368900c2d3ef09b69cb0b913f9f8263b03786e5b2a387706c5afb66800efd51", size = 1910538, upload-time = "2025-04-23T18:31:20.541Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1b/25b7cccd4519c0b23c2dd636ad39d381abf113085ce4f7bec2b0dc755eb1/pydantic_core-2.33.2-cp311-cp311-win_amd64.whl", hash = "sha256:1e063337ef9e9820c77acc768546325ebe04ee38b08703244c1309cccc4f1bab", size = 1952909, upload-time = "2025-04-23T18:31:22.371Z" }, + { url = "https://files.pythonhosted.org/packages/49/a9/d809358e49126438055884c4366a1f6227f0f84f635a9014e2deb9b9de54/pydantic_core-2.33.2-cp311-cp311-win_arm64.whl", hash = "sha256:6b99022f1d19bc32a4c2a0d544fc9a76e3be90f0b3f4af413f87d38749300e65", size = 1897786, upload-time = "2025-04-23T18:31:24.161Z" }, { url = "https://files.pythonhosted.org/packages/18/8a/2b41c97f554ec8c71f2a8a5f85cb56a8b0956addfe8b0efb5b3d77e8bdc3/pydantic_core-2.33.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a7ec89dc587667f22b6a0b6579c249fca9026ce7c333fc142ba42411fa243cdc", size = 2009000, upload-time = "2025-04-23T18:31:25.863Z" }, { url = "https://files.pythonhosted.org/packages/a1/02/6224312aacb3c8ecbaa959897af57181fb6cf3a3d7917fd44d0f2917e6f2/pydantic_core-2.33.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3c6db6e52c6d70aa0d00d45cdb9b40f0433b96380071ea80b09277dba021ddf7", size = 1847996, upload-time = "2025-04-23T18:31:27.341Z" }, { url = "https://files.pythonhosted.org/packages/d6/46/6dcdf084a523dbe0a0be59d054734b86a981726f221f4562aed313dbcb49/pydantic_core-2.33.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e61206137cbc65e6d5256e1166f88331d3b6238e082d9f74613b9b765fb9025", size = 1880957, upload-time = "2025-04-23T18:31:28.956Z" }, @@ -2702,6 +3018,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/7d/e09391c2eebeab681df2b74bfe6c43422fffede8dc74187b2b0bf6fd7571/pydantic_core-2.33.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61c18fba8e5e9db3ab908620af374db0ac1baa69f0f32df4f61ae23f15e586ac", size = 1806162, upload-time = "2025-04-23T18:32:20.188Z" }, { url = "https://files.pythonhosted.org/packages/f1/3d/847b6b1fed9f8ed3bb95a9ad04fbd0b212e832d4f0f50ff4d9ee5a9f15cf/pydantic_core-2.33.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95237e53bb015f67b63c91af7518a62a8660376a6a0db19b89acc77a4d6199f5", size = 1981560, upload-time = "2025-04-23T18:32:22.354Z" }, { url = "https://files.pythonhosted.org/packages/6f/9a/e73262f6c6656262b5fdd723ad90f518f579b7bc8622e43a942eec53c938/pydantic_core-2.33.2-cp313-cp313t-win_amd64.whl", hash = "sha256:c2fc0a768ef76c15ab9238afa6da7f69895bb5d1ee83aeea2e3509af4472d0b9", size = 1935777, upload-time = "2025-04-23T18:32:25.088Z" }, + { url = "https://files.pythonhosted.org/packages/7b/27/d4ae6487d73948d6f20dddcd94be4ea43e74349b56eba82e9bdee2d7494c/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dd14041875d09cc0f9308e37a6f8b65f5585cf2598a53aa0123df8b129d481f8", size = 2025200, upload-time = "2025-04-23T18:33:14.199Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/b8/b3cb95375f05d33801024079b9392a5ab45267a63400bf1866e7ce0f0de4/pydantic_core-2.33.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:d87c561733f66531dced0da6e864f44ebf89a8fba55f31407b00c2f7f9449593", size = 1859123, upload-time = "2025-04-23T18:33:16.555Z" }, + { url = "https://files.pythonhosted.org/packages/05/bc/0d0b5adeda59a261cd30a1235a445bf55c7e46ae44aea28f7bd6ed46e091/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2f82865531efd18d6e07a04a17331af02cb7a651583c418df8266f17a63c6612", size = 1892852, upload-time = "2025-04-23T18:33:18.513Z" }, + { url = "https://files.pythonhosted.org/packages/3e/11/d37bdebbda2e449cb3f519f6ce950927b56d62f0b84fd9cb9e372a26a3d5/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bfb5112df54209d820d7bf9317c7a6c9025ea52e49f46b6a2060104bba37de7", size = 2067484, upload-time = "2025-04-23T18:33:20.475Z" }, + { url = "https://files.pythonhosted.org/packages/8c/55/1f95f0a05ce72ecb02a8a8a1c3be0579bbc29b1d5ab68f1378b7bebc5057/pydantic_core-2.33.2-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:64632ff9d614e5eecfb495796ad51b0ed98c453e447a76bcbeeb69615079fc7e", size = 2108896, upload-time = "2025-04-23T18:33:22.501Z" }, + { url = "https://files.pythonhosted.org/packages/53/89/2b2de6c81fa131f423246a9109d7b2a375e83968ad0800d6e57d0574629b/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:f889f7a40498cc077332c7ab6b4608d296d852182211787d4f3ee377aaae66e8", size = 2069475, upload-time = "2025-04-23T18:33:24.528Z" }, + { url = "https://files.pythonhosted.org/packages/b8/e9/1f7efbe20d0b2b10f6718944b5d8ece9152390904f29a78e68d4e7961159/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:de4b83bb311557e439b9e186f733f6c645b9417c84e2eb8203f3f820a4b988bf", size = 2239013, upload-time = "2025-04-23T18:33:26.621Z" }, + { url = "https://files.pythonhosted.org/packages/3c/b2/5309c905a93811524a49b4e031e9851a6b00ff0fb668794472ea7746b448/pydantic_core-2.33.2-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:82f68293f055f51b51ea42fafc74b6aad03e70e191799430b90c13d643059ebb", size = 2238715, upload-time = "2025-04-23T18:33:28.656Z" }, + { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" }, ] [[package]] @@ -2750,6 +3075,11 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/bd/6a/6c1ac381ff0b8e03a9abc2f05722f6002d7452a2c05118697b3f3910e171/pyiceberg-0.9.1.tar.gz", hash = "sha256:3634134ce33859a441768b39df179b2c6f3de2bbbf506622884f553b013ee799", size = 617629, upload-time = "2025-04-30T14:59:34.306Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/75/c8b4ebba7d345b5e736ebf4976121b97dd7091dcad401a17ca57152704c5/pyiceberg-0.9.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e75c502dd56ac3d77036ce8a3b2566348da5ff4367c7c671981616ef6dcc883", size = 566274, upload-time = "2025-04-30T14:59:00.626Z" }, + { url = "https://files.pythonhosted.org/packages/e0/a0/9494c7930e5e4dc951d95abba584d8ffdb7403368398796ede21ff25c26f/pyiceberg-0.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:0a8189c9b3ba81dd12493d6bb874a656a4d4909904552b97a629d1d43b3a0e90", size = 560157, upload-time = "2025-04-30T14:59:02.082Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d4/351776b1ae83de187d7cf37b100f4f124c7a71e35337182d3aef308156d1/pyiceberg-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c03065d5c5b704444ab8fb18cdd232ec43994db95b9e53444008ebc2cf9dc2c", size = 1052290, upload-time = "2025-04-30T14:59:03.232Z" }, + { url = "https://files.pythonhosted.org/packages/40/17/d8fea681afb52f20bf6a640f9044dcf621a47165f67cc5320bf3c6e82e4e/pyiceberg-0.9.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:93f2586a5da737de6e4643bf096a01772f068d1eedb7ffde6b36c60b6b9e6bd3", size = 1047503, upload-time = "2025-04-30T14:59:04.38Z" }, + { url = "https://files.pythonhosted.org/packages/d0/e0/d173fc2aa8dc252d7aac71703ba2c0491e4988b3a160cf5abb531cfb9086/pyiceberg-0.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:94e45c10051110ba7a43b85a1f0a680b4a31d1d6cee593c8e62e14d22d18c47d", size = 559491, upload-time = "2025-04-30T14:59:05.615Z" }, { url = "https://files.pythonhosted.org/packages/52/26/77983c2884b4a5f13f8a35e5c5e762ae699f6c511efd16730ab883000c1b/pyiceberg-0.9.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b8a958e3bbe919026533cee1f0fb6b7040928fce8d42c2ecea228de7c17578fa", size = 605755, upload-time = "2025-04-30T14:59:07.087Z" }, { url = "https://files.pythonhosted.org/packages/6d/67/e6ea7fcc43aebc85aea5a67a69d01c9015283478061c3121b6b8aa158ce4/pyiceberg-0.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7e956b35c6822600c45fd8f3ea8cfea328cc406fefa534afeb6fdb325d05406", size = 597325, upload-time = "2025-04-30T14:59:08.644Z" }, { url = "https://files.pythonhosted.org/packages/7f/cf/178a9f63fac1bfdd13bc85169e7ab903955d082e2cd80507b1921a6f64dc/pyiceberg-0.9.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e4e585164d7d86f5c9a609a1bc2abeae2f0ea0680a11a2064d3a945866b5311", size = 1277399, upload-time = "2025-04-30T14:59:10.193Z" }, @@ -2818,7 +3148,7 @@ name = "pytest-cov" version = "6.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "coverage" }, + { name = "coverage", extra = ["toml"] }, { name = "pytest" }, ] sdist = { url = "https://files.pythonhosted.org/packages/25/69/5f1e57f6c5a39f81411b550027bf72842c4567ff5fd572bed1edc9e4b5d9/pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a", size = 66857, upload-time = "2025-04-05T14:07:51.592Z" } @@ -2852,6 +3182,9 @@ name = "pywin32" version = "310" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/b1/68aa2986129fb1011dabbe95f0136f44509afaf072b12b8f815905a39f33/pywin32-310-cp311-cp311-win32.whl", hash = "sha256:1e765f9564e83011a63321bb9d27ec456a0ed90d3732c4b2e312b855365ed8bd", size = 8784284, upload-time = "2025-03-17T00:55:53.124Z" }, + { url = "https://files.pythonhosted.org/packages/b3/bd/d1592635992dd8db5bb8ace0551bc3a769de1ac8850200cfa517e72739fb/pywin32-310-cp311-cp311-win_amd64.whl", hash = "sha256:126298077a9d7c95c53823934f000599f66ec9296b09167810eb24875f32689c", size = 9520748, upload-time = "2025-03-17T00:55:55.203Z" }, + { url = "https://files.pythonhosted.org/packages/90/b1/ac8b1ffce6603849eb45a91cf126c0fa5431f186c2e768bf56889c46f51c/pywin32-310-cp311-cp311-win_arm64.whl", hash = "sha256:19ec5fc9b1d51c4350be7bb00760ffce46e6c95eaf2f0b2f1150657b1a43c582", size = 8455941, upload-time = 
"2025-03-17T00:55:57.048Z" }, { url = "https://files.pythonhosted.org/packages/6b/ec/4fdbe47932f671d6e348474ea35ed94227fb5df56a7c30cbbb42cd396ed0/pywin32-310-cp312-cp312-win32.whl", hash = "sha256:8a75a5cc3893e83a108c05d82198880704c44bbaee4d06e442e471d3c9ea4f3d", size = 8796239, upload-time = "2025-03-17T00:55:58.807Z" }, { url = "https://files.pythonhosted.org/packages/e3/e5/b0627f8bb84e06991bea89ad8153a9e50ace40b2e1195d68e9dff6b03d0f/pywin32-310-cp312-cp312-win_amd64.whl", hash = "sha256:bf5c397c9a9a19a6f62f3fb821fbf36cac08f03770056711f765ec1503972060", size = 9503839, upload-time = "2025-03-17T00:56:00.8Z" }, { url = "https://files.pythonhosted.org/packages/1f/32/9ccf53748df72301a89713936645a664ec001abd35ecc8578beda593d37d/pywin32-310-cp312-cp312-win_arm64.whl", hash = "sha256:2349cc906eae872d0663d4d6290d13b90621eaf78964bb1578632ff20e152966", size = 8459470, upload-time = "2025-03-17T00:56:02.601Z" }, @@ -2866,6 +3199,15 @@ version = "6.0.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/54/ed/79a089b6be93607fa5cdaedf301d7dfb23af5f25c398d5ead2525b063e17/pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e", size = 130631, upload-time = "2024-08-06T20:33:50.674Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/f8/aa/7af4e81f7acba21a4c6be026da38fd2b872ca46226673c89a758ebdc4fd2/PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774", size = 184612, upload-time = "2024-08-06T20:32:03.408Z" }, + { url = "https://files.pythonhosted.org/packages/8b/62/b9faa998fd185f65c1371643678e4d58254add437edb764a08c5a98fb986/PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee", size = 172040, upload-time = "2024-08-06T20:32:04.926Z" }, + { url = "https://files.pythonhosted.org/packages/ad/0c/c804f5f922a9a6563bab712d8dcc70251e8af811fce4524d57c2c0fd49a4/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c", size = 736829, upload-time = "2024-08-06T20:32:06.459Z" }, + { url = "https://files.pythonhosted.org/packages/51/16/6af8d6a6b210c8e54f1406a6b9481febf9c64a3109c541567e35a49aa2e7/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317", size = 764167, upload-time = "2024-08-06T20:32:08.338Z" }, + { url = "https://files.pythonhosted.org/packages/75/e4/2c27590dfc9992f73aabbeb9241ae20220bd9452df27483b6e56d3975cc5/PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85", size = 762952, upload-time = "2024-08-06T20:32:14.124Z" }, + { url = "https://files.pythonhosted.org/packages/9b/97/ecc1abf4a823f5ac61941a9c00fe501b02ac3ab0e373c3857f7d4b83e2b6/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4", size = 735301, upload-time = "2024-08-06T20:32:16.17Z" }, + { url = "https://files.pythonhosted.org/packages/45/73/0f49dacd6e82c9430e46f4a027baa4ca205e8b0a9dce1397f44edc23559d/PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e", size = 756638, upload-time = "2024-08-06T20:32:18.555Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/5f/956f0f9fc65223a58fbc14459bf34b4cc48dec52e00535c79b8db361aabd/PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5", size = 143850, upload-time = "2024-08-06T20:32:19.889Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/8da0bbe2ab9dcdd11f4f4557ccaf95c10b9811b13ecced089d43ce59c3c8/PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44", size = 161980, upload-time = "2024-08-06T20:32:21.273Z" }, { url = "https://files.pythonhosted.org/packages/86/0c/c581167fc46d6d6d7ddcfb8c843a4de25bdd27e4466938109ca68492292c/PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab", size = 183873, upload-time = "2024-08-06T20:32:25.131Z" }, { url = "https://files.pythonhosted.org/packages/a8/0c/38374f5bb272c051e2a69281d71cba6fdb983413e6758b84482905e29a5d/PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725", size = 173302, upload-time = "2024-08-06T20:32:26.511Z" }, { url = "https://files.pythonhosted.org/packages/c3/93/9916574aa8c00aa06bbac729972eb1071d002b8e158bd0e83a3b9a20a1f7/PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5", size = 739154, upload-time = "2024-08-06T20:32:28.363Z" }, @@ -2907,6 +3249,17 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/b1/11/b9213d25230ac18a71b39b3723494e57adebe36e066397b961657b3b41c1/pyzmq-26.4.0.tar.gz", hash = "sha256:4bd13f85f80962f91a651a7356fe0472791a5f7a92f227822b5acf44795c626d", size = 278293, upload-time = "2025-04-04T12:05:44.049Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/32/6d/234e3b0aa82fd0290b1896e9992f56bdddf1f97266110be54d0177a9d2d9/pyzmq-26.4.0-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:bfcf82644c9b45ddd7cd2a041f3ff8dce4a0904429b74d73a439e8cab1bd9e54", size = 1339723, upload-time = "2025-04-04T12:03:24.358Z" }, + { url = "https://files.pythonhosted.org/packages/4f/11/6d561efe29ad83f7149a7cd48e498e539ed09019c6cd7ecc73f4cc725028/pyzmq-26.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9bcae3979b2654d5289d3490742378b2f3ce804b0b5fd42036074e2bf35b030", size = 672645, upload-time = "2025-04-04T12:03:25.693Z" }, + { url = "https://files.pythonhosted.org/packages/19/fd/81bfe3e23f418644660bad1a90f0d22f0b3eebe33dd65a79385530bceb3d/pyzmq-26.4.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ccdff8ac4246b6fb60dcf3982dfaeeff5dd04f36051fe0632748fc0aa0679c01", size = 910133, upload-time = "2025-04-04T12:03:27.625Z" }, + { url = "https://files.pythonhosted.org/packages/97/68/321b9c775595ea3df832a9516252b653fe32818db66fdc8fa31c9b9fce37/pyzmq-26.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4550af385b442dc2d55ab7717837812799d3674cb12f9a3aa897611839c18e9e", size = 867428, upload-time = "2025-04-04T12:03:29.004Z" }, + { url = "https://files.pythonhosted.org/packages/4e/6e/159cbf2055ef36aa2aa297e01b24523176e5b48ead283c23a94179fb2ba2/pyzmq-26.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f7ffe9db1187a253fca95191854b3fda24696f086e8789d1d449308a34b88", size = 862409, upload-time = "2025-04-04T12:03:31.032Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/1c/45fb8db7be5a7d0cadea1070a9cbded5199a2d578de2208197e592f219bd/pyzmq-26.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3709c9ff7ba61589b7372923fd82b99a81932b592a5c7f1a24147c91da9a68d6", size = 1205007, upload-time = "2025-04-04T12:03:32.687Z" }, + { url = "https://files.pythonhosted.org/packages/f8/fa/658c7f583af6498b463f2fa600f34e298e1b330886f82f1feba0dc2dd6c3/pyzmq-26.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f8f3c30fb2d26ae5ce36b59768ba60fb72507ea9efc72f8f69fa088450cff1df", size = 1514599, upload-time = "2025-04-04T12:03:34.084Z" }, + { url = "https://files.pythonhosted.org/packages/4d/d7/44d641522353ce0a2bbd150379cb5ec32f7120944e6bfba4846586945658/pyzmq-26.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:382a4a48c8080e273427fc692037e3f7d2851959ffe40864f2db32646eeb3cef", size = 1414546, upload-time = "2025-04-04T12:03:35.478Z" }, + { url = "https://files.pythonhosted.org/packages/72/76/c8ed7263218b3d1e9bce07b9058502024188bd52cc0b0a267a9513b431fc/pyzmq-26.4.0-cp311-cp311-win32.whl", hash = "sha256:d56aad0517d4c09e3b4f15adebba8f6372c5102c27742a5bdbfc74a7dceb8fca", size = 579247, upload-time = "2025-04-04T12:03:36.846Z" }, + { url = "https://files.pythonhosted.org/packages/c3/d0/2d9abfa2571a0b1a67c0ada79a8aa1ba1cce57992d80f771abcdf99bb32c/pyzmq-26.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:963977ac8baed7058c1e126014f3fe58b3773f45c78cce7af5c26c09b6823896", size = 644727, upload-time = "2025-04-04T12:03:38.578Z" }, + { url = "https://files.pythonhosted.org/packages/0d/d1/c8ad82393be6ccedfc3c9f3adb07f8f3976e3c4802640fe3f71441941e70/pyzmq-26.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:c0c8e8cadc81e44cc5088fcd53b9b3b4ce9344815f6c4a03aec653509296fae3", size = 559942, upload-time = "2025-04-04T12:03:40.143Z" }, { url = "https://files.pythonhosted.org/packages/10/44/a778555ebfdf6c7fc00816aad12d185d10a74d975800341b1bc36bad1187/pyzmq-26.4.0-cp312-cp312-macosx_10_15_universal2.whl", hash = "sha256:5227cb8da4b6f68acfd48d20c588197fd67745c278827d5238c707daf579227b", size = 1341586, upload-time = "2025-04-04T12:03:41.954Z" }, { url = "https://files.pythonhosted.org/packages/9c/4f/f3a58dc69ac757e5103be3bd41fb78721a5e17da7cc617ddb56d973a365c/pyzmq-26.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1c07a7fa7f7ba86554a2b1bef198c9fed570c08ee062fd2fd6a4dcacd45f905", size = 665880, upload-time = "2025-04-04T12:03:43.45Z" }, { url = "https://files.pythonhosted.org/packages/fe/45/50230bcfb3ae5cb98bee683b6edeba1919f2565d7cc1851d3c38e2260795/pyzmq-26.4.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae775fa83f52f52de73183f7ef5395186f7105d5ed65b1ae65ba27cb1260de2b", size = 902216, upload-time = "2025-04-04T12:03:45.572Z" }, @@ -2937,6 +3290,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/bc/f88b0bad0f7a7f500547d71e99f10336f2314e525d4ebf576a1ea4a1d903/pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:b30f862f6768b17040929a68432c8a8be77780317f45a353cb17e423127d250c", size = 1189183, upload-time = "2025-04-04T12:04:27.035Z" }, { url = "https://files.pythonhosted.org/packages/d9/8c/db446a3dd9cf894406dec2e61eeffaa3c07c3abb783deaebb9812c4af6a5/pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_i686.whl", hash = "sha256:c80fcd3504232f13617c6ab501124d373e4895424e65de8b72042333316f64a8", size = 1495501, upload-time = "2025-04-04T12:04:28.833Z" }, { url = 
"https://files.pythonhosted.org/packages/05/4c/bf3cad0d64c3214ac881299c4562b815f05d503bccc513e3fd4fdc6f67e4/pyzmq-26.4.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:26a2a7451606b87f67cdeca2c2789d86f605da08b4bd616b1a9981605ca3a364", size = 1395540, upload-time = "2025-04-04T12:04:30.562Z" }, + { url = "https://files.pythonhosted.org/packages/04/52/a70fcd5592715702248306d8e1729c10742c2eac44529984413b05c68658/pyzmq-26.4.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4478b14cb54a805088299c25a79f27eaf530564a7a4f72bf432a040042b554eb", size = 834405, upload-time = "2025-04-04T12:05:13.3Z" }, + { url = "https://files.pythonhosted.org/packages/25/f9/1a03f1accff16b3af1a6fa22cbf7ced074776abbf688b2e9cb4629700c62/pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a28ac29c60e4ba84b5f58605ace8ad495414a724fe7aceb7cf06cd0598d04e1", size = 569578, upload-time = "2025-04-04T12:05:15.36Z" }, + { url = "https://files.pythonhosted.org/packages/76/0c/3a633acd762aa6655fcb71fa841907eae0ab1e8582ff494b137266de341d/pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43b03c1ceea27c6520124f4fb2ba9c647409b9abdf9a62388117148a90419494", size = 798248, upload-time = "2025-04-04T12:05:17.376Z" }, + { url = "https://files.pythonhosted.org/packages/cd/cc/6c99c84aa60ac1cc56747bed6be8ce6305b9b861d7475772e7a25ce019d3/pyzmq-26.4.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7731abd23a782851426d4e37deb2057bf9410848a4459b5ede4fe89342e687a9", size = 756757, upload-time = "2025-04-04T12:05:19.19Z" }, + { url = "https://files.pythonhosted.org/packages/13/9c/d8073bd898eb896e94c679abe82e47506e2b750eb261cf6010ced869797c/pyzmq-26.4.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a222ad02fbe80166b0526c038776e8042cd4e5f0dec1489a006a1df47e9040e0", size = 555371, upload-time = "2025-04-04T12:05:20.702Z" }, ] [[package]] @@ -2954,6 +3312,11 @@ dependencies = [ { name = "requests" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/79/cd0376eef04d5dabdf0de04c0ae7d71447797c6db4a09a3f71e746018cea/ray-2.48.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:4b9b92ac29635f555ef341347d9a63dbf02b7d946347239af3c09e364bc45cf8", size = 67315928, upload-time = "2025-07-18T22:32:40.109Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b3/dc73b03bfa75b0668542f77a14d22bee3337754e09af64c7c5c22fdb6649/ray-2.48.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:b94500fe2d17e491fe2e9bd4a3bf62df217e21a8f2845033c353d4d2ea240f73", size = 69829631, upload-time = "2025-07-18T22:32:45.619Z" }, + { url = "https://files.pythonhosted.org/packages/1f/ea/d1f44f5dde662eaf1a61fdfd80b2bac44438506de608c77965be82c2f572/ray-2.48.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:24a70f416ec0be14b975f160044805ccb48cc6bc50de632983eb8f0a8e16682b", size = 69128145, upload-time = "2025-07-18T22:32:51.506Z" }, + { url = "https://files.pythonhosted.org/packages/5c/46/b376189b9df6b41307754bbc8ed8fe191a86908a8a104b37a602897ec5f0/ray-2.48.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:46d4b42a58492dec79caad2d562344689a4f99a828aeea811a0cd2cd653553ef", size = 70079019, upload-time = "2025-07-18T22:32:57.136Z" }, + { url = "https://files.pythonhosted.org/packages/cb/93/98459098f43336ac09c6e5d688468d896f1a791948263727880e1accc7d0/ray-2.48.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfb48c10371c267fdcf7f4ae359cab706f068178b9c65317ead011972f2c0bf3", size = 26763615, upload-time = 
"2025-07-18T22:33:01.954Z" }, { url = "https://files.pythonhosted.org/packages/41/53/0d105e1baa6c8c9582f90154ba3f0ca08d58129384ea2707b2e59449b03b/ray-2.48.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:8de799f3b0896f48d306d5e4a04fc6037a08c495d45f9c79935344e5693e3cf8", size = 67302857, upload-time = "2025-07-18T22:33:06.414Z" }, { url = "https://files.pythonhosted.org/packages/df/c5/7de1e9d92a45b1805fe828dcbd18b4c5a1f35ab3cad9134efeb20a3ab3e5/ray-2.48.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5a6f57126eac9dd3286289e07e91e87b054792f9698b6f7ccab88b624816b542", size = 69823198, upload-time = "2025-07-18T22:33:12.494Z" }, { url = "https://files.pythonhosted.org/packages/b4/a6/e7c969bd371c65b7c233d86f23610489e15164ee7eadb3eb78f9d55eda4d/ray-2.48.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:f1cf33d260316f92f77558185f1c36fc35506d76ee7fdfed9f5b70f9c4bdba7f", size = 69151702, upload-time = "2025-07-18T22:33:18.655Z" }, @@ -2987,6 +3350,9 @@ default = [ name = "redis" version = "6.2.0" source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "async-timeout", marker = "python_full_version < '3.11.3'" }, +] sdist = { url = "https://files.pythonhosted.org/packages/ea/9a/0551e01ba52b944f97480721656578c8a7c46b51b99d66814f85fe3a4f3e/redis-6.2.0.tar.gz", hash = "sha256:e821f129b75dde6cb99dd35e5c76e8c49512a5a0d8dfdc560b2fbd44b85ca977", size = 4639129, upload-time = "2025-05-28T05:01:18.91Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/13/67/e60968d3b0e077495a8fee89cf3f2373db98e528288a48f1ee44967f6e8c/redis-6.2.0-py3-none-any.whl", hash = "sha256:c8ddf316ee0aab65f04a11229e94a64b2618451dab7a67cb2f77eb799d872d5e", size = 278659, upload-time = "2025-05-28T05:01:16.955Z" }, @@ -3012,6 +3378,20 @@ version = "2025.7.34" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/0b/de/e13fa6dc61d78b30ba47481f99933a3b49a57779d625c392d8036770a60d/regex-2025.7.34.tar.gz", hash = "sha256:9ead9765217afd04a86822dfcd4ed2747dfe426e887da413b15ff0ac2457e21a", size = 400714, upload-time = "2025-07-31T00:21:16.262Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/0d/85/f497b91577169472f7c1dc262a5ecc65e39e146fc3a52c571e5daaae4b7d/regex-2025.7.34-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:da304313761b8500b8e175eb2040c4394a875837d5635f6256d6fa0377ad32c8", size = 484594, upload-time = "2025-07-31T00:19:13.927Z" }, + { url = "https://files.pythonhosted.org/packages/1c/c5/ad2a5c11ce9e6257fcbfd6cd965d07502f6054aaa19d50a3d7fd991ec5d1/regex-2025.7.34-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:35e43ebf5b18cd751ea81455b19acfdec402e82fe0dc6143edfae4c5c4b3909a", size = 289294, upload-time = "2025-07-31T00:19:15.395Z" }, + { url = "https://files.pythonhosted.org/packages/8e/01/83ffd9641fcf5e018f9b51aa922c3e538ac9439424fda3df540b643ecf4f/regex-2025.7.34-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96bbae4c616726f4661fe7bcad5952e10d25d3c51ddc388189d8864fbc1b3c68", size = 285933, upload-time = "2025-07-31T00:19:16.704Z" }, + { url = "https://files.pythonhosted.org/packages/77/20/5edab2e5766f0259bc1da7381b07ce6eb4401b17b2254d02f492cd8a81a8/regex-2025.7.34-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9feab78a1ffa4f2b1e27b1bcdaad36f48c2fed4870264ce32f52a393db093c78", size = 792335, upload-time = "2025-07-31T00:19:18.561Z" }, + { url = 
"https://files.pythonhosted.org/packages/30/bd/744d3ed8777dce8487b2606b94925e207e7c5931d5870f47f5b643a4580a/regex-2025.7.34-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f14b36e6d4d07f1a5060f28ef3b3561c5d95eb0651741474ce4c0a4c56ba8719", size = 858605, upload-time = "2025-07-31T00:19:20.204Z" }, + { url = "https://files.pythonhosted.org/packages/99/3d/93754176289718d7578c31d151047e7b8acc7a8c20e7706716f23c49e45e/regex-2025.7.34-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85c3a958ef8b3d5079c763477e1f09e89d13ad22198a37e9d7b26b4b17438b33", size = 905780, upload-time = "2025-07-31T00:19:21.876Z" }, + { url = "https://files.pythonhosted.org/packages/ee/2e/c689f274a92deffa03999a430505ff2aeace408fd681a90eafa92fdd6930/regex-2025.7.34-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:37555e4ae0b93358fa7c2d240a4291d4a4227cc7c607d8f85596cdb08ec0a083", size = 798868, upload-time = "2025-07-31T00:19:23.222Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9e/39673688805d139b33b4a24851a71b9978d61915c4d72b5ffda324d0668a/regex-2025.7.34-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee38926f31f1aa61b0232a3a11b83461f7807661c062df9eb88769d86e6195c3", size = 781784, upload-time = "2025-07-31T00:19:24.59Z" }, + { url = "https://files.pythonhosted.org/packages/18/bd/4c1cab12cfabe14beaa076523056b8ab0c882a8feaf0a6f48b0a75dab9ed/regex-2025.7.34-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a664291c31cae9c4a30589bd8bc2ebb56ef880c9c6264cb7643633831e606a4d", size = 852837, upload-time = "2025-07-31T00:19:25.911Z" }, + { url = "https://files.pythonhosted.org/packages/cb/21/663d983cbb3bba537fc213a579abbd0f263fb28271c514123f3c547ab917/regex-2025.7.34-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f3e5c1e0925e77ec46ddc736b756a6da50d4df4ee3f69536ffb2373460e2dafd", size = 844240, upload-time = "2025-07-31T00:19:27.688Z" }, + { url = "https://files.pythonhosted.org/packages/8e/2d/9beeeb913bc5d32faa913cf8c47e968da936af61ec20af5d269d0f84a100/regex-2025.7.34-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d428fc7731dcbb4e2ffe43aeb8f90775ad155e7db4347a639768bc6cd2df881a", size = 787139, upload-time = "2025-07-31T00:19:29.475Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f5/9b9384415fdc533551be2ba805dd8c4621873e5df69c958f403bfd3b2b6e/regex-2025.7.34-cp311-cp311-win32.whl", hash = "sha256:e154a7ee7fa18333ad90b20e16ef84daaeac61877c8ef942ec8dfa50dc38b7a1", size = 264019, upload-time = "2025-07-31T00:19:31.129Z" }, + { url = "https://files.pythonhosted.org/packages/18/9d/e069ed94debcf4cc9626d652a48040b079ce34c7e4fb174f16874958d485/regex-2025.7.34-cp311-cp311-win_amd64.whl", hash = "sha256:24257953d5c1d6d3c129ab03414c07fc1a47833c9165d49b954190b2b7f21a1a", size = 276047, upload-time = "2025-07-31T00:19:32.497Z" }, + { url = "https://files.pythonhosted.org/packages/fd/cf/3bafbe9d1fd1db77355e7fbbbf0d0cfb34501a8b8e334deca14f94c7b315/regex-2025.7.34-cp311-cp311-win_arm64.whl", hash = "sha256:3157aa512b9e606586900888cd469a444f9b898ecb7f8931996cb715f77477f0", size = 268362, upload-time = "2025-07-31T00:19:34.094Z" }, { url = "https://files.pythonhosted.org/packages/ff/f0/31d62596c75a33f979317658e8d261574785c6cd8672c06741ce2e2e2070/regex-2025.7.34-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7f7211a746aced993bef487de69307a38c5ddd79257d7be83f7b202cb59ddb50", size = 485492, upload-time = "2025-07-31T00:19:35.57Z" }, { url = 
"https://files.pythonhosted.org/packages/d8/16/b818d223f1c9758c3434be89aa1a01aae798e0e0df36c1f143d1963dd1ee/regex-2025.7.34-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fb31080f2bd0681484b275461b202b5ad182f52c9ec606052020fe13eb13a72f", size = 290000, upload-time = "2025-07-31T00:19:37.175Z" }, { url = "https://files.pythonhosted.org/packages/cd/70/69506d53397b4bd6954061bae75677ad34deb7f6ca3ba199660d6f728ff5/regex-2025.7.34-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0200a5150c4cf61e407038f4b4d5cdad13e86345dac29ff9dab3d75d905cf130", size = 286072, upload-time = "2025-07-31T00:19:38.612Z" }, @@ -3120,6 +3500,20 @@ version = "0.26.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a5/aa/4456d84bbb54adc6a916fb10c9b374f78ac840337644e4a5eda229c81275/rpds_py-0.26.0.tar.gz", hash = "sha256:20dae58a859b0906f0685642e591056f1e787f3a8b39c8e8749a45dc7d26bdb0", size = 27385, upload-time = "2025-07-01T15:57:13.958Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/09/4c/4ee8f7e512030ff79fda1df3243c88d70fc874634e2dbe5df13ba4210078/rpds_py-0.26.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9e8cb77286025bdb21be2941d64ac6ca016130bfdcd228739e8ab137eb4406ed", size = 372610, upload-time = "2025-07-01T15:53:58.844Z" }, + { url = "https://files.pythonhosted.org/packages/fa/9d/3dc16be00f14fc1f03c71b1d67c8df98263ab2710a2fbd65a6193214a527/rpds_py-0.26.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e09330b21d98adc8ccb2dbb9fc6cb434e8908d4c119aeaa772cb1caab5440a0", size = 358032, upload-time = "2025-07-01T15:53:59.985Z" }, + { url = "https://files.pythonhosted.org/packages/e7/5a/7f1bf8f045da2866324a08ae80af63e64e7bfaf83bd31f865a7b91a58601/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c9c1b92b774b2e68d11193dc39620d62fd8ab33f0a3c77ecdabe19c179cdbc1", size = 381525, upload-time = "2025-07-01T15:54:01.162Z" }, + { url = "https://files.pythonhosted.org/packages/45/8a/04479398c755a066ace10e3d158866beb600867cacae194c50ffa783abd0/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:824e6d3503ab990d7090768e4dfd9e840837bae057f212ff9f4f05ec6d1975e7", size = 397089, upload-time = "2025-07-01T15:54:02.319Z" }, + { url = "https://files.pythonhosted.org/packages/72/88/9203f47268db488a1b6d469d69c12201ede776bb728b9d9f29dbfd7df406/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8ad7fd2258228bf288f2331f0a6148ad0186b2e3643055ed0db30990e59817a6", size = 514255, upload-time = "2025-07-01T15:54:03.38Z" }, + { url = "https://files.pythonhosted.org/packages/f5/b4/01ce5d1e853ddf81fbbd4311ab1eff0b3cf162d559288d10fd127e2588b5/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0dc23bbb3e06ec1ea72d515fb572c1fea59695aefbffb106501138762e1e915e", size = 402283, upload-time = "2025-07-01T15:54:04.923Z" }, + { url = "https://files.pythonhosted.org/packages/34/a2/004c99936997bfc644d590a9defd9e9c93f8286568f9c16cdaf3e14429a7/rpds_py-0.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d80bf832ac7b1920ee29a426cdca335f96a2b5caa839811803e999b41ba9030d", size = 383881, upload-time = "2025-07-01T15:54:06.482Z" }, + { url = "https://files.pythonhosted.org/packages/05/1b/ef5fba4a8f81ce04c427bfd96223f92f05e6cd72291ce9d7523db3b03a6c/rpds_py-0.26.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:0919f38f5542c0a87e7b4afcafab6fd2c15386632d249e9a087498571250abe3", size = 415822, upload-time = "2025-07-01T15:54:07.605Z" }, + { url = "https://files.pythonhosted.org/packages/16/80/5c54195aec456b292f7bd8aa61741c8232964063fd8a75fdde9c1e982328/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d422b945683e409000c888e384546dbab9009bb92f7c0b456e217988cf316107", size = 558347, upload-time = "2025-07-01T15:54:08.591Z" }, + { url = "https://files.pythonhosted.org/packages/f2/1c/1845c1b1fd6d827187c43afe1841d91678d7241cbdb5420a4c6de180a538/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:77a7711fa562ba2da1aa757e11024ad6d93bad6ad7ede5afb9af144623e5f76a", size = 587956, upload-time = "2025-07-01T15:54:09.963Z" }, + { url = "https://files.pythonhosted.org/packages/2e/ff/9e979329dd131aa73a438c077252ddabd7df6d1a7ad7b9aacf6261f10faa/rpds_py-0.26.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238e8c8610cb7c29460e37184f6799547f7e09e6a9bdbdab4e8edb90986a2318", size = 554363, upload-time = "2025-07-01T15:54:11.073Z" }, + { url = "https://files.pythonhosted.org/packages/00/8b/d78cfe034b71ffbe72873a136e71acc7a831a03e37771cfe59f33f6de8a2/rpds_py-0.26.0-cp311-cp311-win32.whl", hash = "sha256:893b022bfbdf26d7bedb083efeea624e8550ca6eb98bf7fea30211ce95b9201a", size = 220123, upload-time = "2025-07-01T15:54:12.382Z" }, + { url = "https://files.pythonhosted.org/packages/94/c1/3c8c94c7dd3905dbfde768381ce98778500a80db9924731d87ddcdb117e9/rpds_py-0.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:87a5531de9f71aceb8af041d72fc4cab4943648d91875ed56d2e629bef6d4c03", size = 231732, upload-time = "2025-07-01T15:54:13.434Z" }, + { url = "https://files.pythonhosted.org/packages/67/93/e936fbed1b734eabf36ccb5d93c6a2e9246fbb13c1da011624b7286fae3e/rpds_py-0.26.0-cp311-cp311-win_arm64.whl", hash = "sha256:de2713f48c1ad57f89ac25b3cb7daed2156d8e822cf0eca9b96a6f990718cc41", size = 221917, upload-time = "2025-07-01T15:54:14.559Z" }, { url = "https://files.pythonhosted.org/packages/ea/86/90eb87c6f87085868bd077c7a9938006eb1ce19ed4d06944a90d3560fce2/rpds_py-0.26.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:894514d47e012e794f1350f076c427d2347ebf82f9b958d554d12819849a369d", size = 363933, upload-time = "2025-07-01T15:54:15.734Z" }, { url = "https://files.pythonhosted.org/packages/63/78/4469f24d34636242c924626082b9586f064ada0b5dbb1e9d096ee7a8e0c6/rpds_py-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc921b96fa95a097add244da36a1d9e4f3039160d1d30f1b35837bf108c21136", size = 350447, upload-time = "2025-07-01T15:54:16.922Z" }, { url = "https://files.pythonhosted.org/packages/ad/91/c448ed45efdfdade82348d5e7995e15612754826ea640afc20915119734f/rpds_py-0.26.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e1157659470aa42a75448b6e943c895be8c70531c43cb78b9ba990778955582", size = 384711, upload-time = "2025-07-01T15:54:18.101Z" }, @@ -3188,6 +3582,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/d9/3f0f105420fecd18551b678c9a6ce60bd23986098b252a56d35781b3e7e9/rpds_py-0.26.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c1851f429b822831bd2edcbe0cfd12ee9ea77868f8d3daf267b189371671c80e", size = 554886, upload-time = "2025-07-01T15:55:52.541Z" }, { url = "https://files.pythonhosted.org/packages/6b/c5/347c056a90dc8dd9bc240a08c527315008e1b5042e7a4cf4ac027be9d38a/rpds_py-0.26.0-cp314-cp314t-win32.whl", hash = "sha256:7bdb17009696214c3b66bb3590c6d62e14ac5935e53e929bcdbc5a495987a84f", size = 219027, upload-time = 
"2025-07-01T15:55:53.874Z" }, { url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload-time = "2025-07-01T15:55:55.167Z" }, + { url = "https://files.pythonhosted.org/packages/51/f2/b5c85b758a00c513bb0389f8fc8e61eb5423050c91c958cdd21843faa3e6/rpds_py-0.26.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f61a9326f80ca59214d1cceb0a09bb2ece5b2563d4e0cd37bfd5515c28510674", size = 373505, upload-time = "2025-07-01T15:56:34.716Z" }, + { url = "https://files.pythonhosted.org/packages/23/e0/25db45e391251118e915e541995bb5f5ac5691a3b98fb233020ba53afc9b/rpds_py-0.26.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:183f857a53bcf4b1b42ef0f57ca553ab56bdd170e49d8091e96c51c3d69ca696", size = 359468, upload-time = "2025-07-01T15:56:36.219Z" }, + { url = "https://files.pythonhosted.org/packages/0b/73/dd5ee6075bb6491be3a646b301dfd814f9486d924137a5098e61f0487e16/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:941c1cfdf4799d623cf3aa1d326a6b4fdb7a5799ee2687f3516738216d2262fb", size = 382680, upload-time = "2025-07-01T15:56:37.644Z" }, + { url = "https://files.pythonhosted.org/packages/2f/10/84b522ff58763a5c443f5bcedc1820240e454ce4e620e88520f04589e2ea/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72a8d9564a717ee291f554eeb4bfeafe2309d5ec0aa6c475170bdab0f9ee8e88", size = 397035, upload-time = "2025-07-01T15:56:39.241Z" }, + { url = "https://files.pythonhosted.org/packages/06/ea/8667604229a10a520fcbf78b30ccc278977dcc0627beb7ea2c96b3becef0/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:511d15193cbe013619dd05414c35a7dedf2088fcee93c6bbb7c77859765bd4e8", size = 514922, upload-time = "2025-07-01T15:56:40.645Z" }, + { url = "https://files.pythonhosted.org/packages/24/e6/9ed5b625c0661c4882fc8cdf302bf8e96c73c40de99c31e0b95ed37d508c/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aea1f9741b603a8d8fedb0ed5502c2bc0accbc51f43e2ad1337fe7259c2b77a5", size = 402822, upload-time = "2025-07-01T15:56:42.137Z" }, + { url = "https://files.pythonhosted.org/packages/8a/58/212c7b6fd51946047fb45d3733da27e2fa8f7384a13457c874186af691b1/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4019a9d473c708cf2f16415688ef0b4639e07abaa569d72f74745bbeffafa2c7", size = 384336, upload-time = "2025-07-01T15:56:44.239Z" }, + { url = "https://files.pythonhosted.org/packages/aa/f5/a40ba78748ae8ebf4934d4b88e77b98497378bc2c24ba55ebe87a4e87057/rpds_py-0.26.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:093d63b4b0f52d98ebae33b8c50900d3d67e0666094b1be7a12fffd7f65de74b", size = 416871, upload-time = "2025-07-01T15:56:46.284Z" }, + { url = "https://files.pythonhosted.org/packages/d5/a6/33b1fc0c9f7dcfcfc4a4353daa6308b3ece22496ceece348b3e7a7559a09/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:2abe21d8ba64cded53a2a677e149ceb76dcf44284202d737178afe7ba540c1eb", size = 559439, upload-time = "2025-07-01T15:56:48.549Z" }, + { url = "https://files.pythonhosted.org/packages/71/2d/ceb3f9c12f8cfa56d34995097f6cd99da1325642c60d1b6680dd9df03ed8/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = 
"sha256:4feb7511c29f8442cbbc28149a92093d32e815a28aa2c50d333826ad2a20fdf0", size = 588380, upload-time = "2025-07-01T15:56:50.086Z" }, + { url = "https://files.pythonhosted.org/packages/c8/ed/9de62c2150ca8e2e5858acf3f4f4d0d180a38feef9fdab4078bea63d8dba/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e99685fc95d386da368013e7fb4269dd39c30d99f812a8372d62f244f662709c", size = 555334, upload-time = "2025-07-01T15:56:51.703Z" }, ] [[package]] @@ -3278,7 +3683,7 @@ wheels = [ [[package]] name = "selection-pipeline" version = "0.1.0" -source = { git = "https://github.com/enigma-brain/selection_pipeline#dcae2860c0bd59ab612d9b4a434c6a817a767255" } +source = { git = "https://github.com/enigma-brain/selection_pipeline#a5e8e979e559ea8131648d05af9929a27f570758" } dependencies = [ { name = "decord" }, { name = "einops" }, @@ -3402,6 +3807,45 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/f2/fd673d979185f5dcbac4be7d09461cbb99751554ffb6718d0013af8604cb/tokenizers-0.21.4-cp39-abi3-win_amd64.whl", hash = "sha256:475d807a5c3eb72c59ad9b5fcdb254f6e17f53dfcbb9903233b0dfa9c943b597", size = 2507568, upload-time = "2025-07-28T15:48:55.456Z" }, ] +[[package]] +name = "tomli" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175, upload-time = "2024-11-27T22:38:36.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077, upload-time = "2024-11-27T22:37:54.956Z" }, + { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429, upload-time = "2024-11-27T22:37:56.698Z" }, + { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067, upload-time = "2024-11-27T22:37:57.63Z" }, + { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030, upload-time = "2024-11-27T22:37:59.344Z" }, + { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898, upload-time = "2024-11-27T22:38:00.429Z" }, + { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894, upload-time = "2024-11-27T22:38:02.094Z" }, + { url = 
"https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319, upload-time = "2024-11-27T22:38:03.206Z" }, + { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273, upload-time = "2024-11-27T22:38:04.217Z" }, + { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310, upload-time = "2024-11-27T22:38:05.908Z" }, + { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309, upload-time = "2024-11-27T22:38:06.812Z" }, + { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762, upload-time = "2024-11-27T22:38:07.731Z" }, + { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453, upload-time = "2024-11-27T22:38:09.384Z" }, + { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486, upload-time = "2024-11-27T22:38:10.329Z" }, + { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349, upload-time = "2024-11-27T22:38:11.443Z" }, + { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159, upload-time = "2024-11-27T22:38:13.099Z" }, + { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243, upload-time = "2024-11-27T22:38:14.766Z" }, + { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645, upload-time = "2024-11-27T22:38:15.843Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584, upload-time = "2024-11-27T22:38:17.645Z" }, + { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875, upload-time = "2024-11-27T22:38:19.159Z" }, + { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418, upload-time = "2024-11-27T22:38:20.064Z" }, + { url = "https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708, upload-time = "2024-11-27T22:38:21.659Z" }, + { url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582, upload-time = "2024-11-27T22:38:22.693Z" }, + { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543, upload-time = "2024-11-27T22:38:24.367Z" }, + { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691, upload-time = "2024-11-27T22:38:26.081Z" }, + { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170, upload-time = "2024-11-27T22:38:27.921Z" }, + { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530, upload-time = "2024-11-27T22:38:29.591Z" }, + { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666, upload-time = "2024-11-27T22:38:30.639Z" }, + { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954, upload-time = "2024-11-27T22:38:31.702Z" }, + { url = 
"https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724, upload-time = "2024-11-27T22:38:32.837Z" }, + { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383, upload-time = "2024-11-27T22:38:34.455Z" }, + { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, +] + [[package]] name = "torch" version = "2.6.0" @@ -3424,12 +3868,16 @@ dependencies = [ { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools" }, + { name = "setuptools", marker = "python_full_version >= '3.12'" }, { name = "sympy" }, { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/78/a9/97cbbc97002fff0de394a2da2cdfa859481fdca36996d7bd845d50aa9d8d/torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:7979834102cd5b7a43cc64e87f2f3b14bd0e1458f06e9f88ffa386d07c7446e1", size = 766715424, upload-time = "2025-01-29T16:25:15.874Z" }, + { url = "https://files.pythonhosted.org/packages/6d/fa/134ce8f8a7ea07f09588c9cc2cea0d69249efab977707cf67669431dcf5c/torch-2.6.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ccbd0320411fe1a3b3fec7b4d3185aa7d0c52adac94480ab024b5c8f74a0bf1d", size = 95759416, upload-time = "2025-01-29T16:27:38.429Z" }, + { url = "https://files.pythonhosted.org/packages/11/c5/2370d96b31eb1841c3a0883a492c15278a6718ccad61bb6a649c80d1d9eb/torch-2.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:46763dcb051180ce1ed23d1891d9b1598e07d051ce4c9d14307029809c4d64f7", size = 204164970, upload-time = "2025-01-29T16:26:16.182Z" }, + { url = "https://files.pythonhosted.org/packages/0b/fa/f33a4148c6fb46ca2a3f8de39c24d473822d5774d652b66ed9b1214da5f7/torch-2.6.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:94fc63b3b4bedd327af588696559f68c264440e2503cc9e6954019473d74ae21", size = 66530713, upload-time = "2025-01-29T16:26:38.881Z" }, { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563, upload-time = "2025-01-29T16:23:19.084Z" }, { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867, upload-time = "2025-01-29T16:25:55.649Z" }, { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = 
"sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469, upload-time = "2025-01-29T16:24:01.821Z" }, @@ -3445,6 +3893,8 @@ name = "torchcodec" version = "0.2.0" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/19/c03287d8c47306fe5214667adf3a522ce00b0589314c9983c3a9a4068c25/TorchCodec-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f845356627f7e4e5a2b6daaf8525f19d2a467c46b1130ee9945153fb7e877a11", size = 2822334, upload-time = "2025-02-05T13:57:15.255Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b6/19d4c13661bb681e7e7a3541f6876ea2059448708c433078152339e7c16b/TorchCodec-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf88951edb1e8fadbd2aa3d81453c0909fa2f501209475b9d4684ecea9117cc5", size = 755421, upload-time = "2025-02-05T13:56:46.614Z" }, { url = "https://files.pythonhosted.org/packages/26/5b/16c30a9bfd3d056c6c10598d3067a850b46202d43b6984da117fa91d1a25/TorchCodec-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e831722e6723eb9d632bd0e04b0320723985d19e43ef4a1624f5425efc0ef9b", size = 3028185, upload-time = "2025-02-05T13:57:17.796Z" }, { url = "https://files.pythonhosted.org/packages/9f/0f/ec751e200fe44660139755f637b20836496b3e28a653d4b8f131f26e3e04/TorchCodec-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c9dd97f1b59e02ee8cc1e140650aae04e469e9fb848d9b70af9e2fddfcb9ff3", size = 755291, upload-time = "2025-02-05T13:56:50.016Z" }, { url = "https://files.pythonhosted.org/packages/32/80/1d948edfe98e5e918b221560527605ea2539c5cbd699db8f3d6183791449/TorchCodec-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:51090be7bb7bf7a45dc65c599b0e864852a97df397bb70fa2a976dd6e9cb5358", size = 2962876, upload-time = "2025-02-05T13:57:19.938Z" }, @@ -3461,8 +3911,13 @@ dependencies = [ { name = "torch" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/3d/b7241abfa3e6651c6e00796f5de2bd1ce4d500bf5159bcbfeea47e711b93/torchvision-0.21.0-1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ff96666b94a55e802ea6796cabe788541719e6f4905fc59c380fed3517b6a64d", size = 2329320, upload-time = "2025-03-18T17:25:52.272Z" }, { url = "https://files.pythonhosted.org/packages/52/5b/76ca113a853b19c7b1da761f8a72cb6429b3bd0bf932537d8df4657f47c3/torchvision-0.21.0-1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:ffa2a16499508fe6798323e455f312c7c55f2a88901c9a7c0fb1efa86cf7e327", size = 2329878, upload-time = "2025-03-18T17:25:50.039Z" }, { url = "https://files.pythonhosted.org/packages/4e/fe/5e193353706dab96fe73ae100d5a633ff635ce310e0d92f3bc2958d075b1/torchvision-0.21.0-1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:7e9e9afa150e40cd2a8f0701c43cb82a8d724f512896455c0918b987f94b84a4", size = 2280711, upload-time = "2025-03-18T17:25:46.012Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/00c69db213ee2443ada8886ec60789b227e06bb869d85ee324578221a7f7/torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:110d115333524d60e9e474d53c7d20f096dbd8a080232f88dddb90566f90064c", size = 1784141, upload-time = "2025-01-29T16:28:51.207Z" }, + { url = "https://files.pythonhosted.org/packages/be/a2/b0cedf0a411f1a5d75cfc0b87cde56dd1ddc1878be46a42c905cd8580220/torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:3891cd086c5071bda6b4ee9d266bb2ac39c998c045c2ebcd1e818b8316fb5d41", size = 7237719, upload-time = "2025-01-29T16:28:20.724Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/a1/ee962ef9d0b2bf7a6f8b14cb95acb70e05cd2101af521032a09e43f8582f/torchvision-0.21.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:54454923a50104c66a9ab6bd8b73a11c2fc218c964b1006d5d1fe5b442c3dcb6", size = 14700617, upload-time = "2025-01-29T16:28:30.247Z" }, + { url = "https://files.pythonhosted.org/packages/88/53/4ad334b9b1d8dd99836869fec139cb74a27781298360b91b9506c53f1d10/torchvision-0.21.0-cp311-cp311-win_amd64.whl", hash = "sha256:49bcfad8cfe2c27dee116c45d4f866d7974bcf14a5a9fbef893635deae322f2f", size = 1560523, upload-time = "2025-01-29T16:28:48.751Z" }, { url = "https://files.pythonhosted.org/packages/6e/1b/28f527b22d5e8800184d0bc847f801ae92c7573a8c15979d92b7091c0751/torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:97a5814a93c793aaf0179cfc7f916024f4b63218929aee977b645633d074a49f", size = 1784140, upload-time = "2025-01-29T16:28:44.694Z" }, { url = "https://files.pythonhosted.org/packages/36/63/0722e153fd27d64d5b0af45b5c8cb0e80b35a68cf0130303bc9a8bb095c7/torchvision-0.21.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:b578bcad8a4083b40d34f689b19ca9f7c63e511758d806510ea03c29ac568f7b", size = 7238673, upload-time = "2025-01-29T16:28:27.631Z" }, { url = "https://files.pythonhosted.org/packages/bb/ea/03541ed901cdc30b934f897060d09bbf7a98466a08ad1680320f9ce0cbe0/torchvision-0.21.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5083a5b1fec2351bf5ea9900a741d54086db75baec4b1d21e39451e00977f1b1", size = 14701186, upload-time = "2025-01-29T16:28:16.491Z" }, @@ -3515,7 +3970,7 @@ wheels = [ [[package]] name = "transformers" -version = "4.55.3" +version = "4.55.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, @@ -3529,9 +3984,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/20/a7/f8ef0b1caa022e085a31cd01bc705fb9194558f8a35a5107b3ee5cb640ed/transformers-4.55.3.tar.gz", hash = "sha256:31dca715549f56cb1b591a933d2caf534f948705191e809234a52a2df407c98f", size = 9572448, upload-time = "2025-08-21T09:23:01.46Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/43/3cb831d5f28cc723516e5bb43a8c6042aca3038bb36b6bd6016b40dfd1e8/transformers-4.55.4.tar.gz", hash = "sha256:574a30559bc273c7a4585599ff28ab6b676e96dc56ffd2025ecfce2fd0ab915d", size = 9573015, upload-time = "2025-08-22T15:18:43.192Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f5/be/79ef53e6a65795b40bfc2d11fa54a16abcb173f069e32005b8092c5d5c19/transformers-4.55.3-py3-none-any.whl", hash = "sha256:c85e7feace634541e23b3e34d28aa9492d67974b733237ade9eba7c57c0fd1bd", size = 11269669, upload-time = "2025-08-21T09:22:57.535Z" }, + { url = "https://files.pythonhosted.org/packages/fa/0a/8791a6ee0529c45f669566969e99b75e2ab20eb0bfee8794ce295c18bdad/transformers-4.55.4-py3-none-any.whl", hash = "sha256:df28f3849665faba4af5106f0db4510323277c4bb595055340544f7e59d06458", size = 11269659, upload-time = "2025-08-22T15:18:40.025Z" }, ] [[package]] @@ -3539,6 +3994,7 @@ name = "triton" version = "3.2.0" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/2e/757d2280d4fefe7d33af7615124e7e298ae7b8e3bc4446cdb8e88b0f9bab/triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220", size = 253157636, upload-time = "2025-01-22T19:12:51.322Z" }, { url = 
"https://files.pythonhosted.org/packages/06/00/59500052cb1cf8cf5316be93598946bc451f14072c6ff256904428eaf03c/triton-3.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d9b215efc1c26fa7eefb9a157915c92d52e000d2bf83e5f69704047e63f125c", size = 253159365, upload-time = "2025-01-22T19:13:24.648Z" }, { url = "https://files.pythonhosted.org/packages/c7/30/37a3384d1e2e9320331baca41e835e90a3767303642c7a80d4510152cbcf/triton-3.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5dfa23ba84541d7c0a531dfce76d8bcd19159d50a4a8b14ad01e91734a5c1b0", size = 253154278, upload-time = "2025-01-22T19:13:54.221Z" }, ] @@ -3602,6 +4058,9 @@ version = "6.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/e0/24/d9be5cd6642a6aa68352ded4b4b10fb0d7889cb7f45814fb92cecd35f101/watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c", size = 96393, upload-time = "2024-11-01T14:06:31.756Z" }, + { url = "https://files.pythonhosted.org/packages/63/7a/6013b0d8dbc56adca7fdd4f0beed381c59f6752341b12fa0886fa7afc78b/watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2", size = 88392, upload-time = "2024-11-01T14:06:32.99Z" }, + { url = "https://files.pythonhosted.org/packages/d1/40/b75381494851556de56281e053700e46bff5b37bf4c7267e858640af5a7f/watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c", size = 89019, upload-time = "2024-11-01T14:06:34.963Z" }, { url = "https://files.pythonhosted.org/packages/39/ea/3930d07dafc9e286ed356a679aa02d777c06e9bfd1164fa7c19c288a5483/watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948", size = 96471, upload-time = "2024-11-01T14:06:37.745Z" }, { url = "https://files.pythonhosted.org/packages/12/87/48361531f70b1f87928b045df868a9fd4e253d9ae087fa4cf3f7113be363/watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860", size = 88449, upload-time = "2024-11-01T14:06:39.748Z" }, { url = "https://files.pythonhosted.org/packages/5b/7e/8f322f5e600812e6f9a31b75d242631068ca8f4ef0582dd3ae6e72daecc8/watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0", size = 89054, upload-time = "2024-11-01T14:06:41.009Z" }, @@ -3656,6 +4115,17 @@ version = "1.17.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/c3/fc/e91cc220803d7bc4db93fb02facd8461c37364151b8494762cc88b0fbcef/wrapt-1.17.2.tar.gz", hash = "sha256:41388e9d4d1522446fe79d3213196bd9e3b301a336965b9e27ca2788ebd122f3", size = 55531, upload-time = "2025-01-14T10:35:45.465Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/cd/f7/a2aab2cbc7a665efab072344a8949a71081eed1d2f451f7f7d2b966594a2/wrapt-1.17.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ff04ef6eec3eee8a5efef2401495967a916feaa353643defcc03fc74fe213b58", 
size = 53308, upload-time = "2025-01-14T10:33:33.992Z" }, + { url = "https://files.pythonhosted.org/packages/50/ff/149aba8365fdacef52b31a258c4dc1c57c79759c335eff0b3316a2664a64/wrapt-1.17.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db983e7bca53819efdbd64590ee96c9213894272c776966ca6306b73e4affda", size = 38488, upload-time = "2025-01-14T10:33:35.264Z" }, + { url = "https://files.pythonhosted.org/packages/65/46/5a917ce85b5c3b490d35c02bf71aedaa9f2f63f2d15d9949cc4ba56e8ba9/wrapt-1.17.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9abc77a4ce4c6f2a3168ff34b1da9b0f311a8f1cfd694ec96b0603dff1c79438", size = 38776, upload-time = "2025-01-14T10:33:38.28Z" }, + { url = "https://files.pythonhosted.org/packages/ca/74/336c918d2915a4943501c77566db41d1bd6e9f4dbc317f356b9a244dfe83/wrapt-1.17.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b929ac182f5ace000d459c59c2c9c33047e20e935f8e39371fa6e3b85d56f4a", size = 83776, upload-time = "2025-01-14T10:33:40.678Z" }, + { url = "https://files.pythonhosted.org/packages/09/99/c0c844a5ccde0fe5761d4305485297f91d67cf2a1a824c5f282e661ec7ff/wrapt-1.17.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f09b286faeff3c750a879d336fb6d8713206fc97af3adc14def0cdd349df6000", size = 75420, upload-time = "2025-01-14T10:33:41.868Z" }, + { url = "https://files.pythonhosted.org/packages/b4/b0/9fc566b0fe08b282c850063591a756057c3247b2362b9286429ec5bf1721/wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7ed2d9d039bd41e889f6fb9364554052ca21ce823580f6a07c4ec245c1f5d6", size = 83199, upload-time = "2025-01-14T10:33:43.598Z" }, + { url = "https://files.pythonhosted.org/packages/9d/4b/71996e62d543b0a0bd95dda485219856def3347e3e9380cc0d6cf10cfb2f/wrapt-1.17.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:129a150f5c445165ff941fc02ee27df65940fcb8a22a61828b1853c98763a64b", size = 82307, upload-time = "2025-01-14T10:33:48.499Z" }, + { url = "https://files.pythonhosted.org/packages/39/35/0282c0d8789c0dc9bcc738911776c762a701f95cfe113fb8f0b40e45c2b9/wrapt-1.17.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1fb5699e4464afe5c7e65fa51d4f99e0b2eadcc176e4aa33600a3df7801d6662", size = 75025, upload-time = "2025-01-14T10:33:51.191Z" }, + { url = "https://files.pythonhosted.org/packages/4f/6d/90c9fd2c3c6fee181feecb620d95105370198b6b98a0770cba090441a828/wrapt-1.17.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9a2bce789a5ea90e51a02dfcc39e31b7f1e662bc3317979aa7e5538e3a034f72", size = 81879, upload-time = "2025-01-14T10:33:52.328Z" }, + { url = "https://files.pythonhosted.org/packages/8f/fa/9fb6e594f2ce03ef03eddbdb5f4f90acb1452221a5351116c7c4708ac865/wrapt-1.17.2-cp311-cp311-win32.whl", hash = "sha256:4afd5814270fdf6380616b321fd31435a462019d834f83c8611a0ce7484c7317", size = 36419, upload-time = "2025-01-14T10:33:53.551Z" }, + { url = "https://files.pythonhosted.org/packages/47/f8/fb1773491a253cbc123c5d5dc15c86041f746ed30416535f2a8df1f4a392/wrapt-1.17.2-cp311-cp311-win_amd64.whl", hash = "sha256:acc130bc0375999da18e3d19e5a86403667ac0c4042a094fefb7eec8ebac7cf3", size = 38773, upload-time = "2025-01-14T10:33:56.323Z" }, { url = "https://files.pythonhosted.org/packages/a1/bd/ab55f849fd1f9a58ed7ea47f5559ff09741b25f00c191231f9f059c83949/wrapt-1.17.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:d5e2439eecc762cd85e7bd37161d4714aa03a33c5ba884e26c81559817ca0925", size = 53799, upload-time = 
"2025-01-14T10:33:57.4Z" }, { url = "https://files.pythonhosted.org/packages/53/18/75ddc64c3f63988f5a1d7e10fb204ffe5762bc663f8023f18ecaf31a332e/wrapt-1.17.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fc7cb4c1c744f8c05cd5f9438a3caa6ab94ce8344e952d7c45a8ed59dd88392", size = 38821, upload-time = "2025-01-14T10:33:59.334Z" }, { url = "https://files.pythonhosted.org/packages/48/2a/97928387d6ed1c1ebbfd4efc4133a0633546bec8481a2dd5ec961313a1c7/wrapt-1.17.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8fdbdb757d5390f7c675e558fd3186d590973244fab0c5fe63d373ade3e99d40", size = 38919, upload-time = "2025-01-14T10:34:04.093Z" }, @@ -3698,6 +4168,21 @@ version = "3.5.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/00/5e/d6e5258d69df8b4ed8c83b6664f2b47d30d2dec551a29ad72a6c69eafd31/xxhash-3.5.0.tar.gz", hash = "sha256:84f2caddf951c9cbf8dc2e22a89d4ccf5d86391ac6418fe81e3c67d0cf60b45f", size = 84241, upload-time = "2024-08-17T09:20:38.972Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/c7/afed0f131fbda960ff15eee7f304fa0eeb2d58770fade99897984852ef23/xxhash-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02c2e816896dc6f85922ced60097bcf6f008dedfc5073dcba32f9c8dd786f3c1", size = 31969, upload-time = "2024-08-17T09:18:00.852Z" }, + { url = "https://files.pythonhosted.org/packages/8c/0c/7c3bc6d87e5235672fcc2fb42fd5ad79fe1033925f71bf549ee068c7d1ca/xxhash-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6027dcd885e21581e46d3c7f682cfb2b870942feeed58a21c29583512c3f09f8", size = 30800, upload-time = "2024-08-17T09:18:01.863Z" }, + { url = "https://files.pythonhosted.org/packages/04/9e/01067981d98069eec1c20201f8c145367698e9056f8bc295346e4ea32dd1/xxhash-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1308fa542bbdbf2fa85e9e66b1077eea3a88bef38ee8a06270b4298a7a62a166", size = 221566, upload-time = "2024-08-17T09:18:03.461Z" }, + { url = "https://files.pythonhosted.org/packages/d4/09/d4996de4059c3ce5342b6e1e6a77c9d6c91acce31f6ed979891872dd162b/xxhash-3.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c28b2fdcee797e1c1961cd3bcd3d545cab22ad202c846235197935e1df2f8ef7", size = 201214, upload-time = "2024-08-17T09:18:05.616Z" }, + { url = "https://files.pythonhosted.org/packages/62/f5/6d2dc9f8d55a7ce0f5e7bfef916e67536f01b85d32a9fbf137d4cadbee38/xxhash-3.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:924361811732ddad75ff23e90efd9ccfda4f664132feecb90895bade6a1b4623", size = 429433, upload-time = "2024-08-17T09:18:06.957Z" }, + { url = "https://files.pythonhosted.org/packages/d9/72/9256303f10e41ab004799a4aa74b80b3c5977d6383ae4550548b24bd1971/xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89997aa1c4b6a5b1e5b588979d1da048a3c6f15e55c11d117a56b75c84531f5a", size = 194822, upload-time = "2024-08-17T09:18:08.331Z" }, + { url = "https://files.pythonhosted.org/packages/34/92/1a3a29acd08248a34b0e6a94f4e0ed9b8379a4ff471f1668e4dce7bdbaa8/xxhash-3.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:685c4f4e8c59837de103344eb1c8a3851f670309eb5c361f746805c5471b8c88", size = 208538, upload-time = "2024-08-17T09:18:10.332Z" }, + { url = "https://files.pythonhosted.org/packages/53/ad/7fa1a109663366de42f724a1cdb8e796a260dbac45047bce153bc1e18abf/xxhash-3.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:dbd2ecfbfee70bc1a4acb7461fa6af7748ec2ab08ac0fa298f281c51518f982c", size = 216953, upload-time = "2024-08-17T09:18:11.707Z" }, + { url = "https://files.pythonhosted.org/packages/35/02/137300e24203bf2b2a49b48ce898ecce6fd01789c0fcd9c686c0a002d129/xxhash-3.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:25b5a51dc3dfb20a10833c8eee25903fd2e14059e9afcd329c9da20609a307b2", size = 203594, upload-time = "2024-08-17T09:18:13.799Z" }, + { url = "https://files.pythonhosted.org/packages/23/03/aeceb273933d7eee248c4322b98b8e971f06cc3880e5f7602c94e5578af5/xxhash-3.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a8fb786fb754ef6ff8c120cb96629fb518f8eb5a61a16aac3a979a9dbd40a084", size = 210971, upload-time = "2024-08-17T09:18:15.824Z" }, + { url = "https://files.pythonhosted.org/packages/e3/64/ed82ec09489474cbb35c716b189ddc1521d8b3de12b1b5ab41ce7f70253c/xxhash-3.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a905ad00ad1e1c34fe4e9d7c1d949ab09c6fa90c919860c1534ff479f40fd12d", size = 415050, upload-time = "2024-08-17T09:18:17.142Z" }, + { url = "https://files.pythonhosted.org/packages/71/43/6db4c02dcb488ad4e03bc86d70506c3d40a384ee73c9b5c93338eb1f3c23/xxhash-3.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:963be41bcd49f53af6d795f65c0da9b4cc518c0dd9c47145c98f61cb464f4839", size = 192216, upload-time = "2024-08-17T09:18:18.779Z" }, + { url = "https://files.pythonhosted.org/packages/22/6d/db4abec29e7a567455344433d095fdb39c97db6955bb4a2c432e486b4d28/xxhash-3.5.0-cp311-cp311-win32.whl", hash = "sha256:109b436096d0a2dd039c355fa3414160ec4d843dfecc64a14077332a00aeb7da", size = 30120, upload-time = "2024-08-17T09:18:20.009Z" }, + { url = "https://files.pythonhosted.org/packages/52/1c/fa3b61c0cf03e1da4767213672efe186b1dfa4fc901a4a694fb184a513d1/xxhash-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b702f806693201ad6c0a05ddbbe4c8f359626d0b3305f766077d51388a6bac58", size = 30003, upload-time = "2024-08-17T09:18:21.052Z" }, + { url = "https://files.pythonhosted.org/packages/6b/8e/9e6fc572acf6e1cc7ccb01973c213f895cb8668a9d4c2b58a99350da14b7/xxhash-3.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:c4dcb4120d0cc3cc448624147dba64e9021b278c63e34a38789b688fd0da9bf3", size = 26777, upload-time = "2024-08-17T09:18:22.809Z" }, { url = "https://files.pythonhosted.org/packages/07/0e/1bfce2502c57d7e2e787600b31c83535af83746885aa1a5f153d8c8059d6/xxhash-3.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:14470ace8bd3b5d51318782cd94e6f94431974f16cb3b8dc15d52f3b69df8e00", size = 31969, upload-time = "2024-08-17T09:18:24.025Z" }, { url = "https://files.pythonhosted.org/packages/3f/d6/8ca450d6fe5b71ce521b4e5db69622383d039e2b253e9b2f24f93265b52c/xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:59aa1203de1cb96dbeab595ded0ad0c0056bb2245ae11fac11c0ceea861382b9", size = 30787, upload-time = "2024-08-17T09:18:25.318Z" }, { url = "https://files.pythonhosted.org/packages/5b/84/de7c89bc6ef63d750159086a6ada6416cc4349eab23f76ab870407178b93/xxhash-3.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08424f6648526076e28fae6ea2806c0a7d504b9ef05ae61d196d571e5c879c84", size = 220959, upload-time = "2024-08-17T09:18:26.518Z" }, @@ -3741,6 +4226,23 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } wheels = [ + { 
url = "https://files.pythonhosted.org/packages/b1/18/893b50efc2350e47a874c5c2d67e55a0ea5df91186b2a6f5ac52eff887cd/yarl-1.20.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:47ee6188fea634bdfaeb2cc420f5b3b17332e6225ce88149a17c413c77ff269e", size = 133833, upload-time = "2025-06-10T00:43:07.393Z" }, + { url = "https://files.pythonhosted.org/packages/89/ed/b8773448030e6fc47fa797f099ab9eab151a43a25717f9ac043844ad5ea3/yarl-1.20.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d0f6500f69e8402d513e5eedb77a4e1818691e8f45e6b687147963514d84b44b", size = 91070, upload-time = "2025-06-10T00:43:09.538Z" }, + { url = "https://files.pythonhosted.org/packages/e3/e3/409bd17b1e42619bf69f60e4f031ce1ccb29bd7380117a55529e76933464/yarl-1.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7a8900a42fcdaad568de58887c7b2f602962356908eedb7628eaf6021a6e435b", size = 89818, upload-time = "2025-06-10T00:43:11.575Z" }, + { url = "https://files.pythonhosted.org/packages/f8/77/64d8431a4d77c856eb2d82aa3de2ad6741365245a29b3a9543cd598ed8c5/yarl-1.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bad6d131fda8ef508b36be3ece16d0902e80b88ea7200f030a0f6c11d9e508d4", size = 347003, upload-time = "2025-06-10T00:43:14.088Z" }, + { url = "https://files.pythonhosted.org/packages/8d/d2/0c7e4def093dcef0bd9fa22d4d24b023788b0a33b8d0088b51aa51e21e99/yarl-1.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:df018d92fe22aaebb679a7f89fe0c0f368ec497e3dda6cb81a567610f04501f1", size = 336537, upload-time = "2025-06-10T00:43:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/f0/f3/fc514f4b2cf02cb59d10cbfe228691d25929ce8f72a38db07d3febc3f706/yarl-1.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f969afbb0a9b63c18d0feecf0db09d164b7a44a053e78a7d05f5df163e43833", size = 362358, upload-time = "2025-06-10T00:43:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/ea/6d/a313ac8d8391381ff9006ac05f1d4331cee3b1efaa833a53d12253733255/yarl-1.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:812303eb4aa98e302886ccda58d6b099e3576b1b9276161469c25803a8db277d", size = 357362, upload-time = "2025-06-10T00:43:20.888Z" }, + { url = "https://files.pythonhosted.org/packages/00/70/8f78a95d6935a70263d46caa3dd18e1f223cf2f2ff2037baa01a22bc5b22/yarl-1.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98c4a7d166635147924aa0bf9bfe8d8abad6fffa6102de9c99ea04a1376f91e8", size = 348979, upload-time = "2025-06-10T00:43:23.169Z" }, + { url = "https://files.pythonhosted.org/packages/cb/05/42773027968968f4f15143553970ee36ead27038d627f457cc44bbbeecf3/yarl-1.20.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12e768f966538e81e6e7550f9086a6236b16e26cd964cf4df35349970f3551cf", size = 337274, upload-time = "2025-06-10T00:43:27.111Z" }, + { url = "https://files.pythonhosted.org/packages/05/be/665634aa196954156741ea591d2f946f1b78ceee8bb8f28488bf28c0dd62/yarl-1.20.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe41919b9d899661c5c28a8b4b0acf704510b88f27f0934ac7a7bebdd8938d5e", size = 363294, upload-time = "2025-06-10T00:43:28.96Z" }, + { url = "https://files.pythonhosted.org/packages/eb/90/73448401d36fa4e210ece5579895731f190d5119c4b66b43b52182e88cd5/yarl-1.20.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:8601bc010d1d7780592f3fc1bdc6c72e2b6466ea34569778422943e1a1f3c389", size = 358169, 
upload-time = "2025-06-10T00:43:30.701Z" }, + { url = "https://files.pythonhosted.org/packages/c3/b0/fce922d46dc1eb43c811f1889f7daa6001b27a4005587e94878570300881/yarl-1.20.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:daadbdc1f2a9033a2399c42646fbd46da7992e868a5fe9513860122d7fe7a73f", size = 362776, upload-time = "2025-06-10T00:43:32.51Z" }, + { url = "https://files.pythonhosted.org/packages/f1/0d/b172628fce039dae8977fd22caeff3eeebffd52e86060413f5673767c427/yarl-1.20.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:03aa1e041727cb438ca762628109ef1333498b122e4c76dd858d186a37cec845", size = 381341, upload-time = "2025-06-10T00:43:34.543Z" }, + { url = "https://files.pythonhosted.org/packages/6b/9b/5b886d7671f4580209e855974fe1cecec409aa4a89ea58b8f0560dc529b1/yarl-1.20.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:642980ef5e0fa1de5fa96d905c7e00cb2c47cb468bfcac5a18c58e27dbf8d8d1", size = 379988, upload-time = "2025-06-10T00:43:36.489Z" }, + { url = "https://files.pythonhosted.org/packages/73/be/75ef5fd0fcd8f083a5d13f78fd3f009528132a1f2a1d7c925c39fa20aa79/yarl-1.20.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:86971e2795584fe8c002356d3b97ef6c61862720eeff03db2a7c86b678d85b3e", size = 371113, upload-time = "2025-06-10T00:43:38.592Z" }, + { url = "https://files.pythonhosted.org/packages/50/4f/62faab3b479dfdcb741fe9e3f0323e2a7d5cd1ab2edc73221d57ad4834b2/yarl-1.20.1-cp311-cp311-win32.whl", hash = "sha256:597f40615b8d25812f14562699e287f0dcc035d25eb74da72cae043bb884d773", size = 81485, upload-time = "2025-06-10T00:43:41.038Z" }, + { url = "https://files.pythonhosted.org/packages/f0/09/d9c7942f8f05c32ec72cd5c8e041c8b29b5807328b68b4801ff2511d4d5e/yarl-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:26ef53a9e726e61e9cd1cda6b478f17e350fb5800b4bd1cd9fe81c4d91cfeb2e", size = 86686, upload-time = "2025-06-10T00:43:42.692Z" }, { url = "https://files.pythonhosted.org/packages/5f/9a/cb7fad7d73c69f296eda6815e4a2c7ed53fc70c2f136479a91c8e5fbdb6d/yarl-1.20.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdcc4cd244e58593a4379fe60fdee5ac0331f8eb70320a24d591a3be197b94a9", size = 133667, upload-time = "2025-06-10T00:43:44.369Z" }, { url = "https://files.pythonhosted.org/packages/67/38/688577a1cb1e656e3971fb66a3492501c5a5df56d99722e57c98249e5b8a/yarl-1.20.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b29a2c385a5f5b9c7d9347e5812b6f7ab267193c62d282a540b4fc528c8a9d2a", size = 91025, upload-time = "2025-06-10T00:43:46.295Z" }, { url = "https://files.pythonhosted.org/packages/50/ec/72991ae51febeb11a42813fc259f0d4c8e0507f2b74b5514618d8b640365/yarl-1.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1112ae8154186dfe2de4732197f59c05a83dc814849a5ced892b708033f40dc2", size = 89709, upload-time = "2025-06-10T00:43:48.22Z" }, From aff0c5f8ead97879089b8506451147efe47aeb31 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 26 Aug 2025 13:48:35 -0700 Subject: [PATCH 194/224] refactor: turn sources into a subpackage --- src/orcapod/data/sources/base.py | 71 +++- src/orcapod/data/sources/csv_source.py | 68 ++++ src/orcapod/data/sources/dict_source.py | 105 ++++++ src/orcapod/data/sources/list_source.py | 188 ++++++++++ .../manual_table_source.py} | 330 +----------------- 5 files changed, 435 insertions(+), 327 deletions(-) create mode 100644 src/orcapod/data/sources/csv_source.py create mode 100644 src/orcapod/data/sources/dict_source.py create mode 100644 src/orcapod/data/sources/list_source.py rename src/orcapod/data/{sources.py => sources/manual_table_source.py} (54%) diff --git a/src/orcapod/data/sources/base.py b/src/orcapod/data/sources/base.py index 3ef6fa9..28394f7 100644 --- a/src/orcapod/data/sources/base.py +++ b/src/orcapod/data/sources/base.py @@ -1,5 +1,6 @@ +from abc import abstractmethod from collections.abc import Collection, Iterator -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from orcapod.data.kernels import TrackedKernelBase @@ -36,6 +37,29 @@ def __init__(self, **kwargs): # Cache the KernelStream for reuse across all stream method calls self._cached_kernel_stream: KernelStream | None = None + def kernel_identity_structure( + self, streams: Collection[dp.Stream] | None = None + ) -> Any: + if streams is not None: + # when checked for invocation id, act as a source + # and just return the output packet types + # _, packet_types = self.stream.types() + # return packet_types + return None + # otherwise, return the identity structure of the stream + return self.source_identity_structure() + + def kernel_output_types( + self, *streams: dp.Stream, include_system_tags: bool = False + ) -> tuple[dict[str, type], dict[str, type]]: + return self.source_output_types(include_system_tags=include_system_tags) + + @abstractmethod + def source_identity_structure(self) -> Any: ... + + @abstractmethod + def source_output_types(self, include_system_tags: bool = False) -> Any: ... + # =========================== Kernel Methods =========================== # The following are inherited from TrackedKernelBase as abstract methods. @@ -211,4 +235,49 @@ def reset_cache(self) -> None: self._cached_kernel_stream = None +class StreamSource(SourceBase): + def __init__(self, stream: dp.Stream, label: str | None = None, **kwargs) -> None: + """ + A placeholder source based on stream + This is used to represent a kernel that has no computation. + """ + label = label or stream.label + self.stream = stream + super().__init__(label=label, **kwargs) + + def source_output_types( + self, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + return self.stream.types(include_system_tags=include_system_tags) + + @property + def kernel_id(self) -> tuple[str, ...]: + return (self.stream.__class__.__name__,) + + def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: + """ + Forward the stream through the stub kernel. + This is a no-op and simply returns the stream. + """ + return self.stream + + def source_identity_structure(self) -> Any: + return self.stream.identity_structure() + + # def __hash__(self) -> int: + # # TODO: resolve the logic around identity structure on a stream / stub kernel + # """ + # Hash the StubKernel based on its label and stream. + # This is used to uniquely identify the StubKernel in the tracker. 
+ # """ + # identity_structure = self.identity_structure() + # if identity_structure is None: + # return hash(self.stream) + # return identity_structure + + # ==================== Example Implementation ==================== diff --git a/src/orcapod/data/sources/csv_source.py b/src/orcapod/data/sources/csv_source.py new file mode 100644 index 0000000..c560879 --- /dev/null +++ b/src/orcapod/data/sources/csv_source.py @@ -0,0 +1,68 @@ +from collections.abc import Collection +from typing import TYPE_CHECKING, Any + + +from orcapod.data.streams import ( + TableStream, +) +from orcapod.protocols import data_protocols as dp +from orcapod.types import TypeSpec +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pandas as pd + import polars as pl + import pyarrow as pa +else: + pl = LazyModule("polars") + pd = LazyModule("pandas") + pa = LazyModule("pyarrow") + +from orcapod.data.sources.base import SourceBase + + +class CSVSource(SourceBase): + """Loads data from a CSV file.""" + + def __init__( + self, + file_path: str, + tag_columns: list[str] | None = None, + source_id: str | None = None, + **kwargs, + ): + super().__init__(**kwargs) + self.file_path = file_path + self.tag_columns = tag_columns or [] + if source_id is None: + source_id = self.file_path + self.source_id = source_id + + def source_identity_structure(self) -> Any: + return (self.__class__.__name__, self.source_id, tuple(self.tag_columns)) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Load data from file and return a static stream. + + This is called by forward() and creates a fresh snapshot each time. + """ + import pyarrow.csv as csv + + # Load current state of the file + table = csv.read_csv(self.file_path) + + return TableStream( + table=table, + tag_columns=self.tag_columns, + source=self, + upstreams=(), + ) + + def source_output_types( + self, include_system_tags: bool = False + ) -> tuple[TypeSpec, TypeSpec]: + """Infer types from the file (could be cached).""" + # For demonstration - in practice you might cache this + sample_stream = self.forward() + return sample_stream.types(include_system_tags=include_system_tags) diff --git a/src/orcapod/data/sources/dict_source.py b/src/orcapod/data/sources/dict_source.py new file mode 100644 index 0000000..290d7ea --- /dev/null +++ b/src/orcapod/data/sources/dict_source.py @@ -0,0 +1,105 @@ +from collections.abc import Collection, Mapping +from typing import TYPE_CHECKING, Any + + +from pyarrow.lib import Table + +from orcapod.data.streams import TableStream +from orcapod.protocols import data_protocols as dp +from orcapod.types import DataValue +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.data.system_constants import constants +from orcapod.semantic_types import infer_schema_from_pylist_data + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + +from orcapod.data.sources.base import SourceBase + + +class DictSource(SourceBase): + """Construct source from a collection of dictionaries""" + + def __init__( + self, + tags: Collection[dict[str, DataValue]], + packets: Collection[dict[str, DataValue]], + tag_schema: Mapping[str, type] | None = None, + packet_schema: Mapping[str, type] | None = None, + **kwargs, + ): + super().__init__(**kwargs) + self.tags = list(tags) + self.packets = list(packets) + if len(self.tags) != len(self.packets) or len(self.tags) == 0: + raise ValueError( + "Tags and packets must be non-empty collections of equal length" + ) + 
self.tag_schema = ( + dict(tag_schema) if tag_schema else infer_schema_from_pylist_data(self.tags) + ) + self.packet_schema = ( + dict(packet_schema) + if packet_schema + else infer_schema_from_pylist_data(self.packets) + ) + source_info = ":".join(self.kernel_id) + self.source_info = { + f"{constants.SOURCE_PREFIX}{k}": f"{source_info}:{k}" + for k in self.tag_schema + } + + def source_identity_structure(self) -> Any: + return ( + self.__class__.__name__, + tuple(self.tag_schema.items()), + tuple(self.packet_schema.items()), + ) + + def get_all_records(self, include_system_columns: bool = False) -> Table | None: + return self().as_table(include_source=include_system_columns) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Load data from file and return a static stream. + + This is called by forward() and creates a fresh snapshot each time. + """ + tag_arrow_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + self.tag_schema + ) + ) + packet_arrow_schema = ( + self._data_context.type_converter.python_schema_to_arrow_schema( + self.packet_schema + ) + ) + + joined_data = [ + {**tag, **packet} for tag, packet in zip(self.tags, self.packets) + ] + + table = pa.Table.from_pylist( + joined_data, + schema=arrow_utils.join_arrow_schemas( + tag_arrow_schema, packet_arrow_schema + ), + ) + + return TableStream( + table=table, + tag_columns=self.tag_keys, + source=self, + upstreams=(), + ) + + def source_output_types( + self, include_system_tags: bool = False + ) -> tuple[dict[str, type], dict[str, type]]: + """Return tag and packet types based on provided typespecs.""" + # TODO: add system tag + return self.tag_schema, self.packet_schema diff --git a/src/orcapod/data/sources/list_source.py b/src/orcapod/data/sources/list_source.py new file mode 100644 index 0000000..95e90ae --- /dev/null +++ b/src/orcapod/data/sources/list_source.py @@ -0,0 +1,188 @@ +from collections.abc import Callable, Collection, Iterator +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, cast + +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError +from pyarrow.lib import Table + +from orcapod.data.datagrams import DictTag +from orcapod.data.kernels import TrackedKernelBase +from orcapod.data.streams import ( + TableStream, + KernelStream, + StatefulStreamBase, +) +from orcapod.errors import DuplicateTagError +from orcapod.protocols import data_protocols as dp +from orcapod.types import DataValue, TypeSpec +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.data.system_constants import constants +from orcapod.semantic_types import infer_schema_from_pylist_data + +if TYPE_CHECKING: + import pandas as pd + import polars as pl + import pyarrow as pa +else: + pl = LazyModule("polars") + pd = LazyModule("pandas") + pa = LazyModule("pyarrow") + +from orcapod.data.sources.base import SourceBase + + +class ListSource(SourceBase): + """ + A stream source that sources data from a list of elements. 
+ For each element in the list, yields a tuple containing: + - A tag generated either by the provided tag_function or defaulting to the element index + - A packet containing the element under the provided name key + Parameters + ---------- + name : str + The key name under which each list element will be stored in the packet + data : list[Any] + The list of elements to source data from + tag_function : Callable[[Any, int], Tag] | None, default=None + Optional function to generate a tag from a list element and its index. + The function receives the element and the index as arguments. + If None, uses the element index in a dict with key 'element_index' + tag_function_hash_mode : Literal["content", "signature", "name"], default="name" + How to hash the tag function for identity purposes + expected_tag_keys : Collection[str] | None, default=None + Expected tag keys for the stream + label : str | None, default=None + Optional label for the source + Examples + -------- + >>> # Simple list of file names + >>> file_list = ['/path/to/file1.txt', '/path/to/file2.txt', '/path/to/file3.txt'] + >>> source = ListSource('file_path', file_list) + >>> + >>> # Custom tag function using filename stems + >>> from pathlib import Path + >>> source = ListSource( + ... 'file_path', + ... file_list, + ... tag_function=lambda elem, idx: {'file_name': Path(elem).stem} + ... ) + >>> + >>> # List of sample IDs + >>> samples = ['sample_001', 'sample_002', 'sample_003'] + >>> source = ListSource( + ... 'sample_id', + ... samples, + ... tag_function=lambda elem, idx: {'sample': elem} + ... ) + """ + + @staticmethod + def default_tag_function(element: Any, idx: int) -> dp.Tag: + return DictTag({"element_index": idx}) + + def __init__( + self, + name: str, + data: list[Any], + tag_function: Callable[[Any, int], dp.Tag] | None = None, + label: str | None = None, + tag_function_hash_mode: Literal["content", "signature", "name"] = "name", + expected_tag_keys: Collection[str] | None = None, + **kwargs, + ) -> None: + super().__init__(label=label, **kwargs) + self.name = name + self.elements = list(data) # Create a copy to avoid external modifications + + if tag_function is None: + tag_function = self.__class__.default_tag_function + # If using default tag function and no explicit expected_tag_keys, set to default + if expected_tag_keys is None: + expected_tag_keys = ["element_index"] + + self.expected_tag_keys = expected_tag_keys + self.tag_function = tag_function + self.tag_function_hash_mode = tag_function_hash_mode + + def forward(self, *streams: SyncStream) -> SyncStream: + if len(streams) != 0: + raise ValueError( + "ListSource does not support forwarding streams. " + "It generates its own stream from the list elements." 
+ ) + + def generator() -> Iterator[tuple[Tag, Packet]]: + for idx, element in enumerate(self.elements): + tag = self.tag_function(element, idx) + packet = {self.name: element} + yield tag, packet + + return SyncStreamFromGenerator(generator) + + def __repr__(self) -> str: + return f"ListSource({self.name}, {len(self.elements)} elements)" + + def identity_structure(self, *streams: SyncStream) -> Any: + hash_function_kwargs = {} + if self.tag_function_hash_mode == "content": + # if using content hash, exclude few + hash_function_kwargs = { + "include_name": False, + "include_module": False, + "include_declaration": False, + } + + tag_function_hash = hash_function( + self.tag_function, + function_hash_mode=self.tag_function_hash_mode, + hash_kwargs=hash_function_kwargs, + ) + + # Convert list to hashable representation + # Handle potentially unhashable elements by converting to string + try: + elements_hashable = tuple(self.elements) + except TypeError: + # If elements are not hashable, convert to string representation + elements_hashable = tuple(str(elem) for elem in self.elements) + + return ( + self.__class__.__name__, + self.name, + elements_hashable, + tag_function_hash, + ) + tuple(streams) + + def keys( + self, *streams: SyncStream, trigger_run: bool = False + ) -> tuple[Collection[str] | None, Collection[str] | None]: + """ + Returns the keys of the stream. The keys are the names of the packets + in the stream. The keys are used to identify the packets in the stream. + If expected_keys are provided, they will be used instead of the default keys. + """ + if len(streams) != 0: + raise ValueError( + "ListSource does not support forwarding streams. " + "It generates its own stream from the list elements." + ) + + if self.expected_tag_keys is not None: + return tuple(self.expected_tag_keys), (self.name,) + return super().keys(trigger_run=trigger_run) + + def claims_unique_tags( + self, *streams: "SyncStream", trigger_run: bool = True + ) -> bool | None: + if len(streams) != 0: + raise ValueError( + "ListSource does not support forwarding streams. " + "It generates its own stream from the list elements." + ) + # Claim uniqueness only if the default tag function is used + if self.tag_function == self.__class__.default_tag_function: + return True + # Otherwise, delegate to the base class + return super().claims_unique_tags(trigger_run=trigger_run) diff --git a/src/orcapod/data/sources.py b/src/orcapod/data/sources/manual_table_source.py similarity index 54% rename from src/orcapod/data/sources.py rename to src/orcapod/data/sources/manual_table_source.py index 0f25436..e393cce 100644 --- a/src/orcapod/data/sources.py +++ b/src/orcapod/data/sources/manual_table_source.py @@ -29,244 +29,7 @@ pd = LazyModule("pandas") pa = LazyModule("pyarrow") - -class SourceBase(TrackedKernelBase, StatefulStreamBase): - """ - Base class for sources that act as both Kernels and LiveStreams. - - Design Philosophy: - 1. Source is fundamentally a Kernel (data loader) - 2. forward() returns static snapshots as a stream (pure computation) - 3. __call__() returns a cached KernelStream (live, tracked) - 4. All stream methods delegate to the cached KernelStream - - This ensures that direct source iteration and source() iteration - are identical and both benefit from KernelStream's lifecycle management. 
- """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - # Cache the KernelStream for reuse across all stream method calls - self._cached_kernel_stream: KernelStream | None = None - - # =========================== Kernel Methods =========================== - - # The following are inherited from TrackedKernelBase as abstract methods. - # @abstractmethod - # def forward(self, *streams: dp.Stream) -> dp.Stream: - # """ - # Pure computation: return a static snapshot of the data. - - # This is the core method that subclasses must implement. - # Each call should return a fresh stream representing the current state of the data. - # This is what KernelStream calls when it needs to refresh its data. - # """ - # ... - - # @abstractmethod - # def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: - # """Return the tag and packet types this source produces.""" - # ... - - # @abstractmethod - # def kernel_identity_structure( - # self, streams: Collection[dp.Stream] | None = None - # ) -> dp.Any: ... - - def validate_inputs(self, *streams: dp.Stream) -> None: - """Sources take no input streams.""" - if len(streams) > 0: - raise ValueError( - f"{self.__class__.__name__} is a source and takes no input streams" - ) - - def prepare_output_stream( - self, *streams: dp.Stream, label: str | None = None - ) -> KernelStream: - if self._cached_kernel_stream is None: - self._cached_kernel_stream = super().prepare_output_stream( - *streams, label=label - ) - return self._cached_kernel_stream - - def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: - if not self._skip_tracking and self._tracker_manager is not None: - self._tracker_manager.record_source_invocation(self, label=label) - - # ==================== Stream Protocol (Delegation) ==================== - - @property - def source(self) -> dp.Kernel | None: - """Sources are their own source.""" - return self - - @property - def upstreams(self) -> tuple[dp.Stream, ...]: - """Sources have no upstream dependencies.""" - return () - - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: - """Delegate to the cached KernelStream.""" - return self().keys() - - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: - """Delegate to the cached KernelStream.""" - return self().types(include_system_tags=include_system_tags) - - @property - def last_modified(self): - """Delegate to the cached KernelStream.""" - return self().last_modified - - @property - def is_current(self) -> bool: - """Delegate to the cached KernelStream.""" - return self().is_current - - def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: - """ - Iterate over the cached KernelStream. - - This allows direct iteration over the source as if it were a stream. 
- """ - return self().iter_packets() - - def iter_packets( - self, - execution_engine: dp.ExecutionEngine | None = None, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: - """Delegate to the cached KernelStream.""" - return self().iter_packets(execution_engine=execution_engine) - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, - ) -> "pa.Table": - """Delegate to the cached KernelStream.""" - return self().as_table( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - - def flow( - self, execution_engine: dp.ExecutionEngine | None = None - ) -> Collection[tuple[dp.Tag, dp.Packet]]: - """Delegate to the cached KernelStream.""" - return self().flow(execution_engine=execution_engine) - - def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: - """ - Run the source node, executing the contained source. - - This is a no-op for sources since they are not executed like pods. - """ - self().run(execution_engine=execution_engine) - - async def run_async( - self, execution_engine: dp.ExecutionEngine | None = None - ) -> None: - """ - Run the source node asynchronously, executing the contained source. - - This is a no-op for sources since they are not executed like pods. - """ - await self().run_async(execution_engine=execution_engine) - - # ==================== LiveStream Protocol (Delegation) ==================== - - def refresh(self, force: bool = False) -> bool: - """Delegate to the cached KernelStream.""" - return self().refresh(force=force) - - def invalidate(self) -> None: - """Delegate to the cached KernelStream.""" - return self().invalidate() - - # ==================== Source Protocol ==================== - - @property - def tag_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. - """ - tag_keys, _ = self.keys() - return tag_keys - - @property - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - # TODO: consider caching this - _, packet_keys = self.keys() - return packet_keys - - def reset_cache(self) -> None: - """ - Clear the cached KernelStream, forcing a fresh one on next access. - - Useful when the underlying data source has fundamentally changed - (e.g., file path changed, database connection reset). - """ - if self._cached_kernel_stream is not None: - self._cached_kernel_stream.invalidate() - self._cached_kernel_stream = None - - -# ==================== Example Implementation ==================== - - -class CSVSource(SourceBase): - """Loads data from a CSV file.""" - - def __init__(self, file_path: str, tag_columns: list[str] | None = None, **kwargs): - super().__init__(**kwargs) - self.file_path = file_path - self.tag_columns = tag_columns or [] - - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - return (self.__class__.__name__, self.file_path, tuple(self.tag_columns)) - - def forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Load data from file and return a static stream. 
- - This is called by forward() and creates a fresh snapshot each time. - """ - import pyarrow.csv as csv - - from orcapod.data.streams import TableStream - - # Load current state of the file - table = csv.read_csv(self.file_path) - - return TableStream( - table=table, - tag_columns=self.tag_columns, - source=self, - upstreams=(), - ) - - def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: - """Infer types from the file (could be cached).""" - # For demonstration - in practice you might cache this - sample_stream = self.forward() - return sample_stream.types(include_system_tags=include_system_tags) +from orcapod.data.sources.base import SourceBase class ManualDeltaTableSource(SourceBase): @@ -369,17 +132,15 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: arrow_data, tag_columns=self.tag_columns, source=self, upstreams=() ) - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: + def source_identity_structure(self) -> Any: """ Return the identity structure of the kernel. This is a unique identifier for the kernel based on its class name and table path. """ return (self.__class__.__name__, str(self.table_path)) - def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + def source_output_types( + self, include_system_tags: bool = False ) -> tuple[TypeSpec, TypeSpec]: """Return tag and packet types based on schema and tag columns.""" # TODO: auto add system entry tag @@ -605,86 +366,3 @@ def load_delta_table(self) -> None: self._set_modified_time() self._delta_table = delta_table - - -class DictSource(SourceBase): - """Construct source from a collection of dictionaries""" - - def __init__( - self, - tags: Collection[dict[str, DataValue]], - packets: Collection[dict[str, DataValue]], - tag_typespec: dict[str, type] | None = None, - packet_typespec: dict[str, type] | None = None, - **kwargs, - ): - super().__init__(**kwargs) - self.tags = list(tags) - self.packets = list(packets) - if len(self.tags) != len(self.packets) or len(self.tags) == 0: - raise ValueError( - "Tags and packets must be non-empty collections of equal length" - ) - self.tag_typespec = tag_typespec or infer_schema_from_pylist_data(self.tags) - self.packet_typespec = packet_typespec or infer_schema_from_pylist_data( - self.packets - ) - source_info = ":".join(self.kernel_id) - self.source_info = { - f"{constants.SOURCE_PREFIX}{k}": f"{source_info}:{k}" - for k in self.tag_typespec - } - - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: - return ( - self.__class__.__name__, - tuple(self.tag_typespec.items()), - tuple(self.packet_typespec.items()), - ) - - def get_all_records(self, include_system_columns: bool = False) -> Table | None: - return self().as_table(include_source=include_system_columns) - - def forward(self, *streams: dp.Stream) -> dp.Stream: - """ - Load data from file and return a static stream. - - This is called by forward() and creates a fresh snapshot each time. 
- """ - tag_arrow_schema = ( - self._data_context.type_converter.python_schema_to_arrow_schema( - self.tag_typespec - ) - ) - packet_arrow_schema = ( - self._data_context.type_converter.python_schema_to_arrow_schema( - self.packet_typespec - ) - ) - - joined_data = [ - {**tag, **packet} for tag, packet in zip(self.tags, self.packets) - ] - - table = pa.Table.from_pylist( - joined_data, - schema=arrow_utils.join_arrow_schemas( - tag_arrow_schema, packet_arrow_schema - ), - ) - - return TableStream( - table=table, - tag_columns=self.tag_keys, - source=self, - upstreams=(), - ) - - def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: - """Return tag and packet types based on provided typespecs.""" - # TODO: add system tag - return self.tag_typespec, self.packet_typespec From 09561f192d112987f291f0f08945075d5fb62b5a Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 13:49:22 -0700 Subject: [PATCH 195/224] refactor: enhance stream protocol with data view methods --- .../protocols/data_protocols/source.py | 22 ------ .../protocols/data_protocols/streams.py | 71 ++++++++++++++++++- 2 files changed, 70 insertions(+), 23 deletions(-) diff --git a/src/orcapod/protocols/data_protocols/source.py b/src/orcapod/protocols/data_protocols/source.py index c0befd6..6af8f13 100644 --- a/src/orcapod/protocols/data_protocols/source.py +++ b/src/orcapod/protocols/data_protocols/source.py @@ -31,25 +31,3 @@ class Source(Kernel, Stream, Protocol): - Serve as the starting point for data lineage - May have their own refresh/update mechanisms """ - - @property - def tag_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. - """ - ... - - @property - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - ... - - # def as_lazy_frame(self, sort_by_tags: bool = False) -> "pl.LazyFrame | None": ... - - # def as_polars_df(self, sort_by_tags: bool = False) -> "pl.DataFrame | None": ... - - # def as_pandas_df(self, sort_by_tags: bool = False) -> "pd.DataFrame | None": ... diff --git a/src/orcapod/protocols/data_protocols/streams.py b/src/orcapod/protocols/data_protocols/streams.py index 5c4c495..afec071 100644 --- a/src/orcapod/protocols/data_protocols/streams.py +++ b/src/orcapod/protocols/data_protocols/streams.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: import polars as pl import pyarrow as pa + import pandas as pd from orcapod.protocols.data_protocols.kernel import Kernel @@ -115,7 +116,9 @@ def upstreams(self) -> tuple["Stream", ...]: """ ... - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Available keys/fields in the stream content. @@ -131,6 +134,20 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """ ... + def tag_keys(self, include_system_tags: bool = False) -> tuple[str, ...]: + """ + Return the keys used for the tag in the pipeline run records. + This is used to store the run-associated tag info. + """ + ... + + def packet_keys(self) -> tuple[str, ...]: + """ + Return the keys used for the packet in the pipeline run records. + This is used to store the run-associated packet info. + """ + ... 
+ def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """ Type specifications for the stream content. @@ -146,6 +163,23 @@ def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """ ... + def tag_types(self, include_system_tags: bool = False) -> TypeSpec: + """ + Type specifications for the stream content. + + Returns the type schema for both tags and packets in this stream. + This information is used for: + - Type checking and validation + - Schema inference and planning + - Compatibility checking between kernels + + Returns: + tuple[TypeSpec, TypeSpec]: (tag_types, packet_types) + """ + ... + + def packet_types(self) -> TypeSpec: ... + @property def last_modified(self) -> datetime | None: """ @@ -261,6 +295,41 @@ def as_df( """ ... + def as_lazy_frame( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: ExecutionEngine | None = None, + ) -> "pl.LazyFrame | None": + """ + Load the entire stream to a Polars LazyFrame. + """ + ... + + def as_polars_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: ExecutionEngine | None = None, + ) -> "pl.DataFrame | None": ... + + def as_pandas_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + index_by_tags: bool = True, + execution_engine: ExecutionEngine | None = None, + ) -> "pd.DataFrame | None": ... + def as_table( self, include_data_context: bool = False, From 81226f8cdce6de0f1ab22a562f7b1ff6e9a00fca Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 13:49:40 -0700 Subject: [PATCH 196/224] refactor: rename to delta table store --- src/orcapod/stores/__init__.py | 2 +- src/orcapod/stores/delta_lake_stores.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/stores/__init__.py index 4c5fff7..cd9ae0f 100644 --- a/src/orcapod/stores/__init__.py +++ b/src/orcapod/stores/__init__.py @@ -13,4 +13,4 @@ # "SimpleParquetDataStore", # ] -from .delta_lake_stores import BatchedDeltaTableArrowStore +from .delta_lake_stores import DeltaTableStore diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/stores/delta_lake_stores.py index ab1b8d7..8f2f48b 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/stores/delta_lake_stores.py @@ -24,9 +24,9 @@ logger = logging.getLogger(__name__) -class BatchedDeltaTableArrowStore: +class DeltaTableStore: """ - A batched Delta table store with clear insert vs update semantics. + A Delta table store with clear insert vs update semantics. - insert(): Never overwrites existing records by default. Can skip duplicates if requested. Can be batched for performance. Supports composite keys. From 29f1c5d74b868574985e43dab2014f836a8251ee Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 26 Aug 2025 13:50:29 -0700 Subject: [PATCH 197/224] refactor: update to match new protocols --- src/orcapod/data/pods.py | 5 +- src/orcapod/data/streams.py | 169 ++++++++++++++++++++++++++++------ src/orcapod/data/trackers.py | 68 ++------------ src/orcapod/pipeline/nodes.py | 28 +----- 4 files changed, 151 insertions(+), 119 deletions(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index b45b978..144836c 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -425,7 +425,7 @@ def call( output_data, source_info=source_info, python_schema=self.output_packet_types(), - data_context=self._data_context, + data_context=self.data_context, ) return tag, output_packet @@ -475,7 +475,7 @@ async def async_call( output_data, source_info=source_info, python_schema=self.output_packet_types(), - data_context=self._data_context, + data_context=self.data_context, ) return tag, output_packet @@ -525,6 +525,7 @@ def __init__( data_context: str | contexts.DataContext | None = None, **kwargs, ) -> None: + # if data_context is not explicitly given, use that of the contained pod if data_context is None: data_context = pod.data_context_key super().__init__( diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index d6375b2..7b41747 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -24,11 +24,13 @@ import pyarrow as pa import pyarrow.compute as pc import polars as pl + import pandas as pd import asyncio else: pa = LazyModule("pyarrow") pc = LazyModule("pyarrow.compute") pl = LazyModule("polars") + pd = LazyModule("pandas") asyncio = LazyModule("asyncio") @@ -135,7 +137,6 @@ class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase def __init__( self, - data_context: str | contexts.DataContext | None = None, execution_engine: dp.ExecutionEngine | None = None, **kwargs, ) -> None: @@ -144,7 +145,6 @@ def __init__( self._set_modified_time() # note that this is not necessary for Stream protocol, but is provided # for convenience to resolve semantic types and other context-specific information - self._data_context = contexts.resolve_context(data_context) self._execution_engine = execution_engine @property @@ -181,14 +181,6 @@ def get_substream(self, substream_id: str) -> dp.Stream: else: raise ValueError(f"Substream with ID {substream_id} not found.") - @property - def data_context(self) -> contexts.DataContext: - """ - Returns the data context for the stream. - This is used to resolve semantic types and other context-specific information. - """ - return self._data_context - @property @abstractmethod def source(self) -> dp.Kernel | None: @@ -214,11 +206,25 @@ def computed_label(self) -> str | None: return None @abstractmethod - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: ... + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: ... + + def tag_keys(self, include_system_tags: bool = False) -> tuple[str, ...]: + return self.keys(include_system_tags=include_system_tags)[0] + + def packet_keys(self) -> tuple[str, ...]: + return self.keys()[1] @abstractmethod def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: ... 
+ def tag_types(self, include_system_tags: bool = False) -> TypeSpec: + return self.types(include_system_tags=include_system_tags)[0] + + def packet_types(self) -> TypeSpec: + return self.types()[1] + @property def last_modified(self) -> datetime | None: """ @@ -303,7 +309,7 @@ def as_table( execution_engine: dp.ExecutionEngine | None = None, ) -> "pa.Table": ... - def as_df( + def as_polars_df( self, include_data_context: bool = False, include_source: bool = False, @@ -326,6 +332,77 @@ def as_df( ) ) + def as_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: dp.ExecutionEngine | None = None, + ) -> "pl.DataFrame | None": + """ + Convert the entire stream to a Polars DataFrame. + """ + return self.as_polars_df( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + + def as_lazy_frame( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: dp.ExecutionEngine | None = None, + ) -> "pl.LazyFrame | None": + """ + Convert the entire stream to a Polars LazyFrame. + """ + df = self.as_polars_df( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + if df is None: + return None + return df.lazy() + + def as_pandas_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + index_by_tags: bool = True, + execution_engine: dp.ExecutionEngine | None = None, + ) -> "pd.DataFrame | None": + df = self.as_polars_df( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + if df is None: + return None + tag_keys, _ = self.keys() + pdf = df.to_pandas() + if index_by_tags: + pdf = pdf.set_index(list(tag_keys)) + return pdf + def flow( self, execution_engine: dp.ExecutionEngine | None = None ) -> Collection[tuple[dp.Tag, dp.Packet]]: @@ -364,6 +441,8 @@ def __init__( super().__init__(**kwargs) self._source = source self._upstreams = upstreams + + # if data context is not provided, use that of the source kernel if data_context is None and source is not None: # if source is provided, use its data context data_context = source.data_context_key @@ -477,6 +556,7 @@ def __init__( self, table: "pa.Table", tag_columns: Collection[str] = (), + system_tag_columns: Collection[str] = (), source_info: dict[str, str | None] | None = None, source: dp.Kernel | None = None, upstreams: tuple[dp.Stream, ...] 
= (), @@ -510,6 +590,19 @@ def __init__( self._system_tag_columns = tuple( c for c in table.column_names if c.startswith(constants.SYSTEM_TAG_PREFIX) ) + if len(system_tag_columns) > 0: + # rename system_tag_columns + column_name_map = { + c: f"{constants.SYSTEM_TAG_PREFIX}{c}" for c in system_tag_columns + } + table = table.rename_columns( + [column_name_map.get(c, c) for c in table.column_names] + ) + + self._system_tag_columns += tuple( + f"{constants.SYSTEM_TAG_PREFIX}{c}" for c in system_tag_columns + ) + self._all_tag_columns = self._tag_columns + self._system_tag_columns if delta := set(tag_columns) - set(self._tag_columns): raise ValueError( @@ -567,7 +660,7 @@ def data_content_identity_structure(self) -> Any: Returns a hash of the content of the stream. This is used to identify the content of the stream. """ - table_hash = self._data_context.arrow_hasher.hash_table( + table_hash = self.data_context.arrow_hasher.hash_table( self.as_table( include_data_context=True, include_source=True, include_system_tags=True ), @@ -578,12 +671,17 @@ def data_content_identity_structure(self) -> Any: self._tag_columns, ) - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - return self._tag_columns, self._packet_columns + tag_columns = self._tag_columns + if include_system_tags: + tag_columns += self._system_tag_columns + return tag_columns, self._packet_columns def types( self, include_system_tags: bool = False @@ -593,7 +691,7 @@ def types( This is useful for accessing the types of the columns in the stream. """ # TODO: consider using MappingProxyType to avoid copying the dicts - converter = self._data_context.type_converter + converter = self.data_context.type_converter if include_system_tags: tag_schema = self._all_tag_schema else: @@ -643,9 +741,10 @@ def as_table( if sort_by_tags: # TODO: cleanup the sorting tag selection logic try: - return table.sort_by( - [(column, "ascending") for column in self._all_tag_columns] + target_tags = ( + self._all_tag_columns if include_system_tags else self._tag_columns ) + return table.sort_by([(column, "ascending") for column in target_tags]) except pa.ArrowTypeError: # If sorting fails, fall back to unsorted table return table @@ -684,7 +783,7 @@ def iter_packets( if tag_present: tag = ArrowTag( tag_batch.slice(i, 1), # type: ignore - data_context=self._data_context, + data_context=self.data_context, ) else: @@ -698,7 +797,7 @@ def iter_packets( source_info=self._source_info_table.slice( i, 1 ).to_pylist()[0], - data_context=self._data_context, + data_context=self.data_context, ), ) ) @@ -771,12 +870,16 @@ def clear_cache(self) -> None: self._cached_stream = None self._set_modified_time(invalidate=True) - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. 
""" - tag_types, packet_types = self.kernel.output_types(*self.upstreams) + tag_types, packet_types = self.kernel.output_types( + *self.upstreams, include_system_tags=include_system_tags + ) return tuple(tag_types.keys()), tuple(packet_types.keys()) def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: @@ -974,13 +1077,15 @@ def run( # Fallback to synchronous run self.flow(execution_engine=execution_engine) - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - tag_keys, _ = self.prepared_stream.keys() + tag_keys, _ = self.prepared_stream.keys(include_system_tags=include_system_tags) packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys @@ -1023,7 +1128,7 @@ def as_table( all_packets.append(dict_patcket) # TODO: re-verify the implemetation of this conversion - converter = self._data_context.type_converter + converter = self.data_context.type_converter struct_packets = converter.python_dicts_to_struct_dicts(all_packets) all_tags_as_tables: pa.Table = pa.Table.from_pylist( @@ -1311,13 +1416,15 @@ def iter_packets( if packet is not None: yield tag, packet - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. """ - tag_keys, _ = self.input_stream.keys() + tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys @@ -1359,7 +1466,7 @@ def as_table( dict_patcket[k] = str(v) all_packets.append(dict_patcket) - converter = self._data_context.type_converter + converter = self.data_context.type_converter struct_packets = converter.python_dicts_to_struct_dicts(all_packets) all_tags_as_tables: pa.Table = pa.Table.from_pylist( @@ -1438,12 +1545,14 @@ def __init__( super().__init__(source=source, upstreams=input_streams, label=label, **kwargs) self._stream = stream - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: """ Returns the keys of the tag and packet columns in the stream. This is useful for accessing the columns in the stream. 
""" - return self._stream.keys() + return self._stream.keys(include_system_tags=include_system_tags) def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """ diff --git a/src/orcapod/data/trackers.py b/src/orcapod/data/trackers.py index 6e14589..29fe622 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/data/trackers.py @@ -1,13 +1,11 @@ -from orcapod import contexts from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.protocols import data_protocols as dp from collections import defaultdict -from collections.abc import Generator, Collection +from collections.abc import Generator from abc import ABC, abstractmethod from typing import Any, TYPE_CHECKING from contextlib import contextmanager -from orcapod.types import TypeSpec if TYPE_CHECKING: import networkx as nx @@ -136,62 +134,6 @@ def __exit__(self, exc_type, exc_val, ext_tb): self.set_active(False) -# TODO: Move this to sources.py -class StubSource: - def __init__(self, stream: dp.Stream, label: str | None = None) -> None: - """ - A placeholder kernel that does nothing. - This is used to represent a kernel that has no computation. - """ - self.label = label or stream.label - self.stream = stream - - def output_types( - self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - assert len(streams) == 0, "StubKernel should not have any input streams." - return self.stream.types(include_system_tags=include_system_tags) - - @property - def kernel_id(self) -> tuple[str, ...]: - return (self.stream.__class__.__name__,) - - def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: - """ - Forward the stream through the stub kernel. - This is a no-op and simply returns the stream. - """ - return self.stream - - def __call__(self, *args: Any, **kwargs: Any) -> dp.Stream: - return self.forward(*args, **kwargs) - - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - if streams is not None: - # when checked for invocation id, act as a source - # and just return the output packet types - # _, packet_types = self.stream.types() - # return packet_types - return None - # otherwise, return the identity structure of the stream - return self.stream.identity_structure() - - def __hash__(self) -> int: - # TODO: resolve the logic around identity structure on a stream / stub kernel - """ - Hash the StubKernel based on its label and stream. - This is used to uniquely identify the StubKernel in the tracker. 
- """ - identity_structure = self.identity_structure() - if identity_structure is None: - return hash(self.stream) - return identity_structure - - class Invocation(LabeledContentIdentifiableBase): def __init__( self, @@ -213,7 +155,10 @@ def parents(self) -> tuple["Invocation", ...]: if stream.source is not None: parent_invoctions.append(Invocation(stream.source, stream.upstreams)) else: - source = StubSource(stream) + # import JIT to avoid circular imports + from orcapod.data.sources.base import StreamSource + + source = StreamSource(stream) parent_invoctions.append(Invocation(source)) return tuple(parent_invoctions) @@ -251,10 +196,9 @@ class GraphTracker(AutoRegisteringContextBasedTracker): def __init__( self, tracker_manager: dp.TrackerManager | None = None, - data_context: str | contexts.DataContext | None = None, + **kwargs, ) -> None: super().__init__(tracker_manager=tracker_manager) - self._data_context = contexts.resolve_context(data_context) # Dictionary to map kernels to the streams they have invoked # This is used to track the computational graph and the invocations of kernels diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index b6388eb..5b394da 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -76,8 +76,8 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: # super().validate_inputs(*self.input_streams) return super().forward(*self.input_streams) # type: ignore[return-value] - def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + def source_output_types( + self, include_system_tags: bool = False ) -> tuple[TypeSpec, TypeSpec]: """ Return the output types of the node. @@ -87,19 +87,12 @@ def kernel_output_types( *self.input_streams, include_system_tags=include_system_tags ) - def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None - ) -> Any: + def source_identity_structure(self) -> Any: """ Return the identity structure of the node. This is used to compute the invocation hash. """ # construct identity structure from the node's information and the - # contained kernel - if streams is not None and len(streams) > 0: - raise NotImplementedError( - "At this moment, Node does not yet support handling additional input streams." - ) return self.contained_kernel.identity_structure(self.input_streams) def get_all_records( @@ -146,21 +139,6 @@ def __repr__(self): def __str__(self): return f"KernelNode:{self.kernel!s}" - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - """ - Return the identity structure of the node. - This is used to compute the invocation hash. - """ - # construct identity structure from the node's information and the - # contained kernel - if streams is not None: - if len(streams) > 0: - raise NotImplementedError( - "At this moment, Node does not yet support handling additional input streams." - ) - return None - return self.kernel.identity_structure(self.input_streams) - def forward(self, *streams: dp.Stream) -> dp.Stream: output_stream = super().forward(*streams) From 5a57ae6ab71e1e0763a517fe7fb5ee605922d5cd Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 26 Aug 2025 13:51:00 -0700 Subject: [PATCH 198/224] refactor: remove redundant properties --- src/orcapod/data/kernels.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 07e512e..0143396 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -6,7 +6,6 @@ import logging from orcapod.data.streams import KernelStream from orcapod.data.base import LabeledContentIdentifiableBase -from orcapod import contexts from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER from orcapod.types import TypeSpec @@ -28,7 +27,6 @@ class TrackedKernelBase(LabeledContentIdentifiableBase): def __init__( self, label: str | None = None, - data_context: str | contexts.DataContext | None = None, skip_tracking: bool = False, tracker_manager: dp.TrackerManager | None = None, **kwargs, @@ -36,8 +34,6 @@ def __init__( super().__init__(**kwargs) self._label = label - self._data_context = contexts.resolve_context(data_context) - self._skip_tracking = skip_tracking self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self._last_modified = None @@ -52,15 +48,6 @@ def kernel_id(self) -> tuple[str, ...]: """ return (f"{self.__class__.__name__}", self.content_hash().to_hex()) - @property - def data_context(self) -> contexts.DataContext: - return self._data_context - - @property - def data_context_key(self) -> str: - """Return the data context key.""" - return self._data_context.context_key - @property def last_modified(self) -> datetime | None: """ From 719bca46e04abe0d7415fc4a6b22c661f2760a02 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 14:16:04 -0700 Subject: [PATCH 199/224] refactor: remove old core package --- src/orcapod/core/__init__.py | 13 - src/orcapod/core/base.py | 601 --------------------- src/orcapod/core/operators.py | 945 --------------------------------- src/orcapod/core/pod.py | 335 ------------ src/orcapod/core/pod_legacy.py | 373 ------------- src/orcapod/core/sources.py | 204 ------- src/orcapod/core/streams.py | 203 ------- src/orcapod/core/tracker.py | 85 --- 8 files changed, 2759 deletions(-) delete mode 100644 src/orcapod/core/__init__.py delete mode 100644 src/orcapod/core/base.py delete mode 100644 src/orcapod/core/operators.py delete mode 100644 src/orcapod/core/pod.py delete mode 100644 src/orcapod/core/pod_legacy.py delete mode 100644 src/orcapod/core/sources.py delete mode 100644 src/orcapod/core/streams.py delete mode 100644 src/orcapod/core/tracker.py diff --git a/src/orcapod/core/__init__.py b/src/orcapod/core/__init__.py deleted file mode 100644 index d236681..0000000 --- a/src/orcapod/core/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .base import Kernel, Invocation, Stream, SyncStream, Source -from .operators import Operator -from .pod import Pod - -__all__ = [ - "Kernel", - "Operator", - "Invocation", - "Stream", - "SyncStream", - "Source", - "Pod", -] diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py deleted file mode 100644 index 367bc72..0000000 --- a/src/orcapod/core/base.py +++ /dev/null @@ -1,601 +0,0 @@ -# Collection of base classes for operations and streams in the orcapod framework. 
-import threading -from abc import ABC, abstractmethod -from collections.abc import Callable, Collection, Iterator -from typing import Any - -from orcapod.hashing import ContentIdentifiableBase -from orcapod.types import Packet, Tag, TypeSpec -from orcapod.types.typespec_utils import get_typespec_from_dict -import logging - - -logger = logging.getLogger(__name__) - - -class Kernel(ABC, ContentIdentifiableBase): - """ - Kernel defines the fundamental unit of computation that can be performed on zero, one or more streams of data. - It is the base class for all computations and transformations that can be performed on a collection of streams - (including an empty collection). - A kernel is defined as a callable that takes a (possibly empty) collection of streams as the input - and returns a new stream as output (note that output stream is always singular). - Each "invocation" of the kernel on a collection of streams is assigned a unique ID. - The corresponding invocation information is stored as Invocation object and attached to the output stream - for computational graph tracking. - """ - - def __init__( - self, label: str | None = None, skip_tracking: bool = False, **kwargs - ) -> None: - super().__init__(**kwargs) - self._label = label - self._skip_tracking = skip_tracking - - def pre_forward_hook( - self, *streams: "SyncStream", **kwargs - ) -> tuple["SyncStream", ...]: - """ - A hook that is called before the forward method is invoked. - This can be used to perform any pre-processing or validation on the input streams. - Subclasses can override this method to provide custom behavior. - """ - return streams - - def post_forward_hook(self, output_stream: "SyncStream", **kwargs) -> "SyncStream": - """ - A hook that is called after the forward method is invoked. - This can be used to perform any post-processing on the output stream. - Subclasses can override this method to provide custom behavior. - """ - return output_stream - - def __call__( - self, *streams: "SyncStream", label: str | None = None, **kwargs - ) -> "SyncStream": - # check that inputs are stream instances and if it's source, instantiate it - verified_streams = [] - for stream in streams: - if not isinstance(stream, SyncStream): - raise TypeError( - f"Expected SyncStream, got {type(stream).__name__} for stream {stream}" - ) - if isinstance(stream, Source): - # if the stream is a Source, instantiate it - stream = stream() - verified_streams.append(stream) - - # Special handling of Source: trigger call on source if passed as stream - normalized_streams = [ - stream() if isinstance(stream, Source) else stream - for stream in verified_streams - ] - - pre_processed_streams = self.pre_forward_hook(*normalized_streams, **kwargs) - output_stream = self.forward(*pre_processed_streams, **kwargs) - post_processed_stream = self.post_forward_hook(output_stream, **kwargs) - # create an invocation instance - invocation = Invocation(self, pre_processed_streams, label=label) - # label the output_stream with the invocation that produced the stream - post_processed_stream.invocation = invocation - - if not self._skip_tracking: - # register the invocation to all active trackers - active_trackers = Tracker.get_active_trackers() - for tracker in active_trackers: - tracker.record(invocation) - - return post_processed_stream - - @abstractmethod - def forward(self, *streams: "SyncStream") -> "SyncStream": - """ - Trigger the main computation of the kernel on a collection of streams. 
- This method is called when the kernel is invoked with a collection of streams. - Subclasses should override this method to provide the kernel with its unique behavior - """ - - def __repr__(self): - return self.__class__.__name__ - - def __str__(self): - if self._label is not None: - return f"{self.__class__.__name__}({self._label})" - return self.__class__.__name__ - - def identity_structure(self, *streams: "SyncStream") -> Any: - # Default implementation of identity_structure for the kernel only - # concerns the kernel class and the streams if present. Subclasses of - # Kernels should override this method to provide a more meaningful - # representation of the kernel. Note that kernel must provide the notion - # of identity under possibly two distinct contexts: - # 1) identity of the kernel in itself when invoked without any stream - # 2) identity of the specific invocation of the kernel with a collection of streams - # While the latter technically corresponds to the identity of the invocation and not - # the kernel, only kernel can provide meaningful information as to the uniqueness of - # the invocation as only kernel would know if / how the input stream(s) alter the identity - # of the invocation. For example, if the kernel corresponds to an commutative computation - # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the - # equivalence of the two by returning the same identity structure for both invocations. - # This can be achieved, for example, by returning a set over the streams instead of a tuple. - logger.warning( - f"Identity structure not implemented for {self.__class__.__name__}" - ) - return (self.__class__.__name__,) + streams - - def keys( - self, *streams: "SyncStream", trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the keys of the kernel output. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - If trigger_run is False (default), the keys are returned only if it is feasible to do so without triggering - the chain of computations. If trigger_run is True, underlying computation may get triggered if doing so - would allow for the keys to be determined. Returns None for either part of the keys cannot be inferred. - - This should be overridden by the subclass if subclass can provide smarter inference based on the specific - implementation of the subclass and input streams. - """ - if not trigger_run: - return None, None - - # resolve to actually executing the stream to fetch the first element - tag, packet = next(iter(self(*streams))) - return tuple(tag.keys()), tuple(packet.keys()) - - def types( - self, *streams: "SyncStream", trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - Returns the tag and packet typespec of the kernel output. - Each typespec consists of mapping from field name to Python type. - If trigger_run is False (default), the typespec info is returned only if it is feasible to do so without triggering - the chain of computation. If trigger_run is True, underlying computation may get triggered if doing so - would allow for the typespec to be determined. Returns None for either part of the typespec cannot be inferred. 
- """ - if not trigger_run: - return None, None - - tag, packet = next(iter(self(*streams))) - return get_typespec_from_dict(tag), get_typespec_from_dict(packet) - - def claims_unique_tags( - self, *streams: "SyncStream", trigger_run: bool = False - ) -> bool | None: - """ - Returns True if the kernel claims that it has unique tags, False otherwise. - False indicates that it can be inferred that the kernel does not have unique tags - based on the input streams and the kernel's implementation. None indicates that - whether it is unique or not cannot be determined with certainty. - If trigger_run is True, the kernel may trigger the computation to verify - the uniqueness of tags. If trigger_run is False, the kernel will return - None if it cannot determine the uniqueness of tags without triggering the computation. - This method is useful for checking if the kernel can be used as a source - for other kernels that require unique tags. - Subclasses should override this method if it can provide reasonable check/guarantee - of unique tags. The default implementation returns False, meaning that the kernel - does not claim to have unique tags, even if turns out to be unique. - """ - return None - - -class Tracker(ABC): - """ - A tracker is a class that can track the invocations of kernels. Only "active" trackers - participate in tracking and its `record` method gets called on each invocation of a kernel. - Multiple trackers can be active at any time. - """ - - _local = threading.local() - - @classmethod - def get_active_trackers(cls) -> list["Tracker"]: - if hasattr(cls._local, "active_trackers"): - return cls._local.active_trackers - return [] - - def __init__(self): - self.active = False - - def activate(self) -> None: - """ - Activate the tracker. This is a no-op if the tracker is already active. - """ - if not self.active: - if not hasattr(self._local, "active_trackers"): - self._local.active_trackers = [] - self._local.active_trackers.append(self) - self.active = True - - def deactivate(self) -> None: - # Remove this tracker from active trackers - if hasattr(self._local, "active_trackers") and self.active: - if self in self._local.active_trackers: - self._local.active_trackers.remove(self) - self.active = False - - def __enter__(self): - self.activate() - return self - - def __exit__(self, exc_type, exc_val, ext_tb): - self.deactivate() - - @abstractmethod - def record(self, invocation: "Invocation") -> None: ... - - -# This is NOT an abstract class, but rather a concrete class that -# represents an invocation of a kernel on a collection of streams. -class Invocation(ContentIdentifiableBase): - """ - This class represents an invocation of a kernel on a collection of streams. - It contains the kernel and the streams that were used in the invocation. - Note that the collection of streams may be empty, in which case the invocation - likely corresponds to a source kernel. - """ - - def __init__( - self, - kernel: Kernel, - # TODO: technically this should be Stream to stay consistent with Stream interface. Update to Stream when AsyncStream is implemented - streams: Collection["SyncStream"], - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.kernel = kernel - self.streams = streams - - def computed_label(self) -> str | None: - """ - Returns the computed label for this invocation. - This is used to provide a default label if no label is set. 
- """ - return self.kernel.label - - def __repr__(self) -> str: - return f"Invocation(kernel={self.kernel}, streams={self.streams})" - - def __str__(self) -> str: - return f"Invocation[ID:{self.__hash__()}]({self.kernel}, {self.streams})" - - def __lt__(self, other: Any) -> bool: - if not isinstance(other, Invocation): - return NotImplemented - - if self.kernel == other.kernel: - return hash(self) < hash(other) - # otherwise, order by the kernel - return hash(self.kernel) < hash(other.kernel) - - # Pass-through implementations: these methods are implemented by "passing-through" the methods logic, - # simply invoking the corresopnding methods on the underlying kernel with the input streams - - def claims_unique_tags(self, trigger_run: bool = True) -> bool | None: - """ - Returns True if the invocation has unique tags, False otherwise. - This method is useful for checking if the invocation can be used as a source - for other kernels that require unique tags. None is returned if the - uniqueness of tags cannot be determined. - Note that uniqueness is best thought of as a "claim" by the kernel - that it has unique tags. The actual uniqueness can only be verified - by iterating over the streams and checking the tags. - """ - return self.kernel.claims_unique_tags(*self.streams, trigger_run=trigger_run) - - def keys(self) -> tuple[Collection[str] | None, Collection[str] | None]: - return self.kernel.keys(*self.streams) - - def types(self) -> tuple[TypeSpec | None, TypeSpec | None]: - return self.kernel.types(*self.streams) - - def identity_structure(self) -> int: - # Identity of an invocation is entirely determined by the - # the kernel's identity structure upon invocation - return self.kernel.identity_structure(*self.streams) - - -class Stream(ABC, ContentIdentifiableBase): - """ - A stream is a collection of tagged-packets that are generated by an operation. - The stream is iterable and can be used to access the packets in the stream. - - A stream has property `invocation` that is an instance of Invocation that generated the stream. - This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self._invocation: Invocation | None = None - - def computed_label(self) -> str | None: - if self.invocation is not None: - # use the invocation operation label - return self.invocation.kernel.label - return None - - @property - def invocation(self) -> Invocation | None: - return self._invocation - - @invocation.setter - def invocation(self, value: Invocation) -> None: - if not isinstance(value, Invocation): - raise TypeError("invocation field must be an instance of Invocation") - self._invocation = value - - @abstractmethod - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - raise NotImplementedError("Subclasses must implement __iter__ method") - - def flow(self) -> Collection[tuple[Tag, Packet]]: - """ - Flow everything through the stream, returning the entire collection of - (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. - """ - return [e for e in self] - - # --------------------- Recursive methods --------------------------- - # These methods form a step in the multi-class recursive invocation that follows the pattern of - # Stream -> Invocation -> Kernel -> Stream ... 
-> Invocation -> Kernel - # Most of the method logic would be found in Kernel's implementation of the method with - # Stream and Invocation simply serving as recursive steps - - def identity_structure(self) -> Any: - """ - Identity structure of a stream is deferred to the identity structure - of the associated invocation, if present. - A bare stream without invocation has no well-defined identity structure. - Specialized stream subclasses should override this method to provide more meaningful identity structure - """ - if self.invocation is not None: - return self.invocation.identity_structure() - return super().identity_structure() - - def keys( - self, *, trigger_run=False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the keys of the stream. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are returned on based-effort basis, and this invocation may trigger the - upstream computation of the stream. - Furthermore, the keys are not guaranteed to be identical across all packets in the stream. - This method is useful for inferring the keys of the stream without having to iterate - over the entire stream. - """ - if self.invocation is not None: - # if the stream is generated by an operation, use the keys from the invocation - tag_keys, packet_keys = self.invocation.keys() - if tag_keys is not None and packet_keys is not None: - return tag_keys, packet_keys - if not trigger_run: - return None, None - # otherwise, use the keys from the first packet in the stream - # note that this may be computationally expensive - tag, packet = next(iter(self)) - return list(tag.keys()), list(packet.keys()) - - def types(self, *, trigger_run=False) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - Returns the keys of the stream. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are returned on based-effort basis, and this invocation may trigger the - upstream computation of the stream. - Furthermore, the keys are not guaranteed to be identical across all packets in the stream. - This method is useful for inferring the keys of the stream without having to iterate - over the entire stream. - """ - tag_types, packet_types = None, None - if self.invocation is not None: - # if the stream is generated by an operation, use the keys from the invocation - tag_types, packet_types = self.invocation.types() - if not trigger_run or (tag_types is not None and packet_types is not None): - return tag_types, packet_types - if not trigger_run: - return None, None - # otherwise, use the keys from the first packet in the stream - # note that this may be computationally expensive - tag, packet = next(iter(self)) - return tag_types or get_typespec_from_dict( - tag - ), packet_types or get_typespec_from_dict(packet) - - def claims_unique_tags(self, *, trigger_run=False) -> bool | None: - """ - Returns True if the stream has unique tags, False otherwise. - This method is useful for checking if the stream can be used as a source - for other operations that require unique tags. None is returned if the - uniqueness of tags cannot be determined. - If the stream is generated by an operation, the invocation is consulted for - the information about unique tags. - """ - if self.invocation is not None: - return self.invocation.claims_unique_tags(trigger_run=trigger_run) - return None - - -class SyncStream(Stream): - """ - A stream that will complete in a fixed amount of time. 
- It is suitable for synchronous operations that - will have to wait for the stream to finish before proceeding. - """ - - def head(self, n: int = 5) -> None: - """ - Print the first n elements of the stream. - This method is useful for previewing the stream - without having to iterate over the entire stream. - If n is <= 0, the entire stream is printed. - """ - for idx, (tag, packet) in enumerate(self): - if n > 0 and idx >= n: - break - print(f"Tag: {tag}, Packet: {packet}") - - def __len__(self) -> int: - """ - Returns the number of packets in the stream. - Note that this method may trigger the upstream computation of the stream. - This method is not guaranteed to be efficient and should be used with caution. - """ - return sum(1 for _ in self) - - def join(self, other: "SyncStream", label: str | None = None) -> "SyncStream": - """ - Returns a new stream that is the result of joining with the other stream. - The join is performed on the tags of the packets in the streams. - """ - from .operators import Join - - if not isinstance(other, SyncStream): - raise TypeError("other must be a SyncStream") - return Join(label=label)(self, other) - - def semijoin(self, other: "SyncStream", label: str | None = None) -> "SyncStream": - """ - Returns a new stream that is the result of semijoining with the other stream. - The semijoin is performed on the tags of the packets in the streams. - """ - from .operators import SemiJoin - - if not isinstance(other, SyncStream): - raise TypeError("other must be a SyncStream") - return SemiJoin(label=label)(self, other) - - def map( - self, - packet_map: dict | None = None, - tag_map: dict | None = None, - drop_unmapped: bool = True, - label: str | None = None, - ) -> "SyncStream": - """ - Returns a new stream that is the result of mapping the packets and tags in the stream. - The mapping is applied to each packet in the stream and the resulting packets - are returned in a new stream. - If packet_map is None, no mapping is applied to the packets. - If tag_map is None, no mapping is applied to the tags. - """ - from .operators import MapTags, MapPackets - - output = self - if packet_map is not None: - output = MapPackets(packet_map, drop_unmapped=drop_unmapped, label=label)( - output - ) - if tag_map is not None: - output = MapTags(tag_map, drop_unmapped=drop_unmapped, label=label)(output) - - return output - - def apply(self, transformer: "dict | Operator") -> "SyncStream": - """ - Returns a new stream that is the result of applying the mapping to the stream. - The mapping is applied to each packet in the stream and the resulting packets - are returned in a new stream. - """ - from .operators import MapPackets - - if isinstance(transformer, dict): - return MapPackets(transformer)(self) - elif isinstance(transformer, Operator): - # If the transformer is an Operator, we can apply it directly - return transformer(self) - - # Otherwise, do not know how to handle the transformer - raise TypeError("transformer must be a dictionary or an operator") - - def __rshift__( - self, transformer: dict | Callable[["SyncStream"], "SyncStream"] - ) -> "SyncStream": - """ - Returns a new stream that is the result of applying the mapping to the stream. - The mapping is applied to each packet in the stream and the resulting packets - are returned in a new stream. 
- """ - from .operators import MapPackets - - if isinstance(transformer, dict): - return MapPackets(transformer)(self) - elif isinstance(transformer, Callable): - return transformer(self) - - # Otherwise, do not know how to handle the transformer - raise TypeError( - "transformer must be a dictionary or a callable that takes a SyncStream" - ) - - def __mul__(self, other: "SyncStream") -> "SyncStream": - """ - Returns a new stream that is the result joining with the other stream - """ - from .operators import Join - - if not isinstance(other, SyncStream): - raise TypeError("other must be a SyncStream") - return Join()(self, other) - - def claims_unique_tags(self, *, trigger_run=False) -> bool | None: - """ - For synchronous streams, if the stream is generated by an operation, the invocation - is consulted first to see if the uniqueness of tags can be determined without iterating over the stream. - If uniqueness cannot be determined from the invocation and if trigger_run is True, uniqueness is checked - by iterating over all elements and verifying uniqueness. - Consequently, this may trigger upstream computations and can be expensive. - If trigger_run is False, the method will return None if the uniqueness cannot be determined. - Since this consults the invocation, the resulting value is ultimately a claim and not a guarantee - of uniqueness. If guarantee of uniquess is required, then use has_unique_tags method - """ - result = super().claims_unique_tags(trigger_run=trigger_run) - if not trigger_run or result is not None: - return result - - # If the uniqueness cannot be determined from the invocation, iterate over the stream - unique_tags = set() - for tag, _ in self: - if tag in unique_tags: - return False - unique_tags.add(tag) - return True - - -class Operator(Kernel): - """ - A Mapper is an operation that does NOT generate new file content. - It is used to control the flow of data in the pipeline without modifying or creating data content. - """ - - -class Source(Kernel, SyncStream): - """ - A base class for all sources in the system. A source can be seen as a special - type of kernel that takes no input and produces a stream of packets. - For convenience, the source itself can act as a stream and thus can be used - as an input to other kernels directly. - However, note that a source is still best thought of as a kernel that - produces a stream of packets, rather than a stream itself. On almost all occasions, - a source acts as a kernel. - """ - - def __init__(self, label: str | None = None, **kwargs) -> None: - super().__init__(label=label, **kwargs) - self._invocation = None - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - """ - Simple iter method that allows for Source object to act as a stream. 
- """ - yield from self() - - # TODO: consider adding stream-like behavior for determining keys and types - def keys( - self, *streams: "SyncStream", trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - return Kernel.keys(self, *streams, trigger_run=trigger_run) - - def types( - self, *streams: "SyncStream", trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - return Kernel.types(self, *streams, trigger_run=trigger_run) diff --git a/src/orcapod/core/operators.py b/src/orcapod/core/operators.py deleted file mode 100644 index 5049e8e..0000000 --- a/src/orcapod/core/operators.py +++ /dev/null @@ -1,945 +0,0 @@ -from collections import defaultdict -from collections.abc import Callable, Collection, Iterator -from itertools import chain -from typing import Any - -from orcapod.types import Packet, Tag, TypeSpec -from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs -from orcapod.core.base import Kernel, SyncStream, Operator -from orcapod.core.streams import SyncStreamFromGenerator -from orcapod.utils.stream_utils import ( - batch_packet, - batch_tags, - check_packet_compatibility, - join_tags, - semijoin_tags, - fill_missing, -) - - -class Repeat(Operator): - """ - A Mapper that repeats the packets in the stream a specified number of times. - The repeat count is the number of times to repeat each packet. - """ - - def __init__(self, repeat_count: int, **kwargs) -> None: - super().__init__(**kwargs) - if not isinstance(repeat_count, int): - raise TypeError("repeat_count must be an integer") - if repeat_count < 0: - raise ValueError("repeat_count must be non-negative") - self.repeat_count = repeat_count - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("Repeat operation requires exactly one stream") - - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - for tag, packet in stream: - for _ in range(self.repeat_count): - yield tag, packet - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return f"Repeat(count={self.repeat_count})" - - def identity_structure(self, *streams) -> tuple[str, int, set[SyncStream]]: - # Join does not depend on the order of the streams -- convert it onto a set - return (self.__class__.__name__, self.repeat_count, set(streams)) - - def keys( - self, *streams: SyncStream, trigger_run=False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Repeat does not alter the keys of the stream. - """ - if len(streams) != 1: - raise ValueError("Repeat operation requires exactly one stream") - - stream = streams[0] - return stream.keys(trigger_run=trigger_run) - - def types( - self, *streams: SyncStream, trigger_run=False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - Repeat does not alter the types of the stream. 
- """ - if len(streams) != 1: - raise ValueError("Repeat operation requires exactly one stream") - - stream = streams[0] - return stream.types(trigger_run=trigger_run) - - def claims_unique_tags( - self, *streams: SyncStream, trigger_run: bool = False - ) -> bool | None: - if len(streams) != 1: - raise ValueError( - "Repeat operation only supports operating on a single input stream" - ) - - # Repeat's uniquness is true only if (1) input stream has unique tags and (2) repeat count is 1 - return self.repeat_count == 1 and streams[0].claims_unique_tags( - trigger_run=trigger_run - ) - - -class Merge(Operator): - def forward(self, *streams: SyncStream) -> SyncStream: - tag_keys, packet_keys = self.keys(*streams) - - def generator() -> Iterator[tuple[Tag, Packet]]: - for tag, packet in chain(*streams): - # fill missing keys with None - tag = fill_missing(tag, tag_keys) - packet = fill_missing(packet, packet_keys) - yield tag, packet - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return "Merge()" - - def identity_structure(self, *streams): - # Merge does not depend on the order of the streams -- convert it onto a set - return (self.__class__.__name__, set(streams)) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Merge does not alter the keys of the stream. - """ - if len(streams) < 2: - raise ValueError("Merge operation requires at least two streams") - - merged_tag_keys = set() - merged_packet_keys = set() - - for stream in streams: - tag_keys, packet_keys = stream.keys(trigger_run=trigger_run) - if tag_keys is not None: - merged_tag_keys.update(set(tag_keys)) - if packet_keys is not None: - merged_packet_keys.update(set(packet_keys)) - - return list(merged_tag_keys), list(merged_packet_keys) - - def types( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - Merge does not alter the types of the stream. - """ - if len(streams) < 2: - raise ValueError("Merge operation requires at least two streams") - - merged_tag_types: TypeSpec | None = {} - merged_packet_types: TypeSpec | None = {} - - for stream in streams: - if merged_tag_types is None and merged_packet_types is None: - break - tag_types, packet_types = stream.types(trigger_run=trigger_run) - if merged_tag_types is not None and tag_types is not None: - merged_tag_types.update(tag_types) - else: - merged_tag_types = None - if merged_tag_types is not None and packet_types is not None: - merged_packet_types.update(packet_types) - else: - merged_tag_types = None - - return merged_tag_types, merged_packet_types - - def claims_unique_tags( - self, *streams: SyncStream, trigger_run: bool = True - ) -> bool | None: - """ - Merge operation can only claim unique tags if all input streams have unique tags AND - the tag keys are not identical across all streams. 
- """ - # TODO: update implementation - if len(streams) < 2: - raise ValueError("Merge operation requires at least two streams") - # Check if all streams have unique tags - unique_tags = all( - stream.claims_unique_tags(trigger_run=trigger_run) for stream in streams - ) - if not unique_tags: - return False - # check that all streams' tag keys are not identical - tag_key_pool = set() - for stream in streams: - tag_keys, packet_keys = stream.keys() - # TODO: re-evaluate the implication of having empty tag keys in uniqueness guarantee - if tag_keys is None or set(tag_keys) in tag_key_pool: - return False - tag_key_pool.add(frozenset(tag_keys)) - - return True - - -def union_lists(left, right): - if left is None or right is None: - return None - output = list(left) - for item in right: - if item not in output: - output.append(item) - return output - - -class Join(Operator): - def identity_structure(self, *streams): - # Join does not depend on the order of the streams -- convert it onto a set - return (self.__class__.__name__, set(streams)) - - def keys( - self, *streams: SyncStream, trigger_run=False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the types of the operation. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are returned if it is feasible to do so, otherwise a tuple - (None, None) is returned to signify that the keys are not known. - """ - if len(streams) != 2: - raise ValueError("Join operation requires exactly two streams") - - left_stream, right_stream = streams - left_tag_keys, left_packet_keys = left_stream.keys(trigger_run=trigger_run) - right_tag_keys, right_packet_keys = right_stream.keys(trigger_run=trigger_run) - - # TODO: do error handling when merge fails - joined_tag_keys = union_lists(left_tag_keys, right_tag_keys) - joined_packet_keys = union_lists(left_packet_keys, right_packet_keys) - - return joined_tag_keys, joined_packet_keys - - def types( - self, *streams: SyncStream, trigger_run=False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - Returns the types of the operation. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are returned if it is feasible to do so, otherwise a tuple - (None, None) is returned to signify that the keys are not known. - """ - if len(streams) != 2: - raise ValueError("Join operation requires exactly two streams") - - left_stream, right_stream = streams - left_tag_types, left_packet_types = left_stream.types(trigger_run=False) - right_tag_types, right_packet_types = right_stream.types(trigger_run=False) - - # TODO: do error handling when merge fails - joined_tag_types = union_typespecs(left_tag_types, right_tag_types) - joined_packet_types = union_typespecs(left_packet_types, right_packet_types) - - return joined_tag_types, joined_packet_types - - def forward(self, *streams: SyncStream) -> SyncStream: - """ - Joins two streams together based on their tags. - The resulting stream will contain all the tags from both streams. 
- """ - if len(streams) != 2: - raise ValueError("Join operation requires exactly two streams") - - left_stream, right_stream = streams - - def generator() -> Iterator[tuple[Tag, Packet]]: - # using list comprehension rather than list() to avoid call to __len__ which is expensive - left_stream_buffered = [e for e in left_stream] - right_stream_buffered = [e for e in right_stream] - for left_tag, left_packet in left_stream_buffered: - for right_tag, right_packet in right_stream_buffered: - if (joined_tag := join_tags(left_tag, right_tag)) is not None: - yield joined_tag, left_packet.join(right_packet) - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return "Join()" - - -class FirstMatch(Operator): - def forward(self, *streams: SyncStream) -> SyncStream: - """ - Joins two streams together based on their tags, returning at most one match for each tag. - The resulting stream will contain all the tags from both streams. - """ - if len(streams) != 2: - raise ValueError("MatchUpToN operation requires exactly two streams") - - left_stream, right_stream = streams - - # get all elements from both streams - outer_stream = list(left_stream) - inner_stream = list(right_stream) - - # take the longer one as the outer stream - if len(outer_stream) < len(inner_stream): - # swap the stream - outer_stream, inner_stream = inner_stream, outer_stream - - # only finds up to one possible match for each packet - def generator(): - for outer_tag, outer_packet in outer_stream: - for idx, (inner_tag, inner_packet) in enumerate(inner_stream): - if (joined_tag := join_tags(outer_tag, inner_tag)) is not None: - if not check_packet_compatibility(outer_packet, inner_packet): - raise ValueError( - f"Packets are not compatible: {outer_packet} and {inner_packet}" - ) - # match is found - remove the packet from the inner stream - inner_stream.pop(idx) - yield joined_tag, Packet({**outer_packet, **inner_packet}) - # if enough matches found, move onto the next outer stream packet - break - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return "MatchUpToN()" - - def identity_structure(self, *streams: SyncStream) -> tuple[str, set[SyncStream]]: - # Join does not depend on the order of the streams -- convert it onto a set - return (self.__class__.__name__, set(streams)) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the keys of the operation. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are returned if it is feasible to do so, otherwise a tuple - (None, None) is returned to signify that the keys are not known. 
- """ - if len(streams) != 2: - raise ValueError("FirstMatch operation requires exactly two streams") - - left_stream, right_stream = streams - left_tag_keys, left_packet_keys = left_stream.keys(trigger_run=trigger_run) - right_tag_keys, right_packet_keys = right_stream.keys(trigger_run=trigger_run) - - # if any of the components return None -> resolve to default operation - if ( - left_tag_keys is None - or right_tag_keys is None - or left_packet_keys is None - or right_packet_keys is None - ): - return super().keys(*streams, trigger_run=trigger_run) - - joined_tag_keys = list(set(left_tag_keys) | set(right_tag_keys)) - joined_packet_keys = list(set(left_packet_keys) | set(right_packet_keys)) - - return joined_tag_keys, joined_packet_keys - - def types( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - Returns the typespecs of tag and packet. - """ - if len(streams) != 2: - raise ValueError("FirstMatch operation requires exactly two streams") - - left_stream, right_stream = streams - left_tag_types, left_packet_types = left_stream.types(trigger_run=trigger_run) - right_tag_types, right_packet_types = right_stream.types( - trigger_run=trigger_run - ) - - # if any of the components return None -> resolve to default operation - if ( - left_tag_types is None - or right_tag_types is None - or left_packet_types is None - or right_packet_types is None - ): - return super().types(*streams, trigger_run=trigger_run) - - joined_tag_types = union_typespecs(left_tag_types, right_tag_types) - joined_packet_types = union_typespecs(left_packet_types, right_packet_types) - - return joined_tag_types, joined_packet_types - - -class MapPackets(Operator): - """ - A Mapper that maps the keys of the packet in the stream to new keys. - The mapping is done using a dictionary that maps old keys to new keys. - If a key is not in the mapping, it will be dropped from the element unless - drop_unmapped=False, in which case unmapped keys will be retained. - """ - - def __init__( - self, key_map: dict[str, str], drop_unmapped: bool = True, **kwargs - ) -> None: - super().__init__(**kwargs) - self.key_map = key_map - self.drop_unmapped = drop_unmapped - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("MapPackets operation requires exactly one stream") - - stream = streams[0] - - def generator(): - for tag, packet in stream: - yield tag, packet.map_keys(self.key_map, self.drop_unmapped) - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - map_repr = ", ".join([f"{k} ⇒ {v}" for k, v in self.key_map.items()]) - return f"packets({map_repr})" - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - self.key_map, - self.drop_unmapped, - ) + tuple(streams) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the keys of the operation. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are inferred based on the first (tag, packet) pair in the stream. 
- """ - if len(streams) != 1: - raise ValueError("MapPackets operation requires exactly one stream") - - stream = streams[0] - tag_keys, packet_keys = stream.keys(trigger_run=trigger_run) - if tag_keys is None or packet_keys is None: - super_tag_keys, super_packet_keys = super().keys(trigger_run=trigger_run) - tag_keys = tag_keys or super_tag_keys - packet_keys = packet_keys or super_packet_keys - - if packet_keys is None: - return tag_keys, packet_keys - - if self.drop_unmapped: - # If drop_unmapped is True, we only keep the keys that are in the mapping - mapped_packet_keys = [ - self.key_map[k] for k in packet_keys if k in self.key_map - ] - else: - mapped_packet_keys = [self.key_map.get(k, k) for k in packet_keys] - - return tag_keys, mapped_packet_keys - - def types( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - Returns the types of the operation. - The first list contains the types of the tags, and the second list contains the types of the packets. - The types are inferred based on the first (tag, packet) pair in the stream. - """ - if len(streams) != 1: - raise ValueError("MapPackets operation requires exactly one stream") - - stream = streams[0] - tag_types, packet_types = stream.types(trigger_run=trigger_run) - if tag_types is None or packet_types is None: - super_tag_types, super_packet_types = super().types(trigger_run=trigger_run) - tag_types = tag_types or super_tag_types - packet_types = packet_types or super_packet_types - - if packet_types is None: - return tag_types, packet_types - - if self.drop_unmapped: - # If drop_unmapped is True, we only keep the keys that are in the mapping - mapped_packet_types = { - self.key_map[k]: v for k, v in packet_types.items() if k in self.key_map - } - else: - mapped_packet_types = { - self.key_map.get(k, k): v for k, v in packet_types.items() - } - - return tag_types, mapped_packet_types - - -class DefaultTag(Operator): - """ - A Mapper that adds a default tag to the packets in the stream. - The default tag is added to all packets in the stream. If the - tag already contains the same key, it will not be overwritten. - """ - - def __init__(self, default_tag: Tag, **kwargs) -> None: - super().__init__(**kwargs) - self.default_tag = default_tag - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("DefaultTag operation requires exactly one stream") - - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - for tag, packet in stream: - yield {**self.default_tag, **tag}, packet - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return f"DefaultTag({self.default_tag})" - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the keys of the operation. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are inferred based on the first (tag, packet) pair in the stream. 
- """ - if len(streams) != 1: - raise ValueError("DefaultTag operation requires exactly one stream") - - stream = streams[0] - tag_keys, packet_keys = stream.keys(trigger_run=trigger_run) - if tag_keys is None or packet_keys is None: - return super().keys(trigger_run=trigger_run) - tag_keys = list(set(tag_keys) | set(self.default_tag.keys())) - return tag_keys, packet_keys - - -class MapTags(Operator): - """ - A Mapper that maps the tags of the packet in the stream to new tags. Packet remains unchanged. - The mapping is done using a dictionary that maps old tags to new tags. - If a tag is not in the mapping, it will be dropped from the element unless - drop_unmapped=False, in which case unmapped tags will be retained. - """ - - def __init__( - self, key_map: dict[str, str], drop_unmapped: bool = True, **kwargs - ) -> None: - super().__init__(**kwargs) - self.key_map = key_map - self.drop_unmapped = drop_unmapped - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("MapTags operation requires exactly one stream") - - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - for tag, packet in stream: - if self.drop_unmapped: - tag = {v: tag[k] for k, v in self.key_map.items() if k in tag} - else: - tag = {self.key_map.get(k, k): v for k, v in tag.items()} - yield tag, packet - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - map_repr = ", ".join([f"{k} ⇒ {v}" for k, v in self.key_map.items()]) - return f"tags({map_repr})" - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - self.key_map, - self.drop_unmapped, - ) + tuple(streams) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the keys of the operation. - The first list contains the keys of the tags, and the second list contains the keys of the packets. - The keys are inferred based on the first (tag, packet) pair in the stream. 
- """ - if len(streams) != 1: - raise ValueError("MapTags operation requires exactly one stream") - - stream = streams[0] - tag_keys, packet_keys = stream.keys(trigger_run=trigger_run) - if tag_keys is None or packet_keys is None: - return super().keys(trigger_run=trigger_run) - - if self.drop_unmapped: - # If drop_unmapped is True, we only keep the keys that are in the mapping - mapped_tag_keys = [self.key_map[k] for k in tag_keys if k in self.key_map] - else: - mapped_tag_keys = [self.key_map.get(k, k) for k in tag_keys] - - return mapped_tag_keys, packet_keys - - -class SemiJoin(Operator): - """ - Perform semi-join on the left stream tags with the tags of the right stream - """ - - def identity_structure(self, *streams): - # Restrict DOES depend on the order of the streams -- maintain as a tuple - return (self.__class__.__name__,) + streams - - def keys( - self, *streams: SyncStream, trigger_run=False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - For semijoin, output keys and types are identical to left stream - """ - if len(streams) != 2: - raise ValueError("Join operation requires exactly two streams") - - return streams[0].keys(trigger_run=trigger_run) - - def types( - self, *streams: SyncStream, trigger_run=False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - """ - For semijoin, output keys and types are identical to left stream - """ - if len(streams) != 2: - raise ValueError("Join operation requires exactly two streams") - - return streams[0].types(trigger_run=trigger_run) - - def forward(self, *streams: SyncStream) -> SyncStream: - """ - Joins two streams together based on their tags. - The resulting stream will contain all the tags from both streams. - """ - if len(streams) != 2: - raise ValueError("Join operation requires exactly two streams") - - left_stream, right_stream = streams - left_tag_typespec, left_packet_typespec = left_stream.types() - right_tag_typespec, right_packet_typespec = right_stream.types() - - common_tag_typespec = intersection_typespecs( - left_tag_typespec, right_tag_typespec - ) - common_tag_keys = None - if common_tag_typespec is not None: - common_tag_keys = list(common_tag_typespec.keys()) - - def generator() -> Iterator[tuple[Tag, Packet]]: - # using list comprehension rather than list() to avoid call to __len__ which is expensive - left_stream_buffered = [e for e in left_stream] - right_stream_buffered = [e for e in right_stream] - for left_tag, left_packet in left_stream_buffered: - for right_tag, _ in right_stream_buffered: - if semijoin_tags(left_tag, right_tag, common_tag_keys) is not None: - yield left_tag, left_packet - # move onto next entry - break - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return "SemiJoin()" - - -class Filter(Operator): - """ - A Mapper that filters the packets in the stream based on a predicate function. - Predicate function should take two arguments: the tag and the packet, both as dictionaries. - The predicate function should return True for packets that should be kept and False for packets that should be dropped. 
- """ - - def __init__(self, predicate: Callable[[Tag, Packet], bool], **kwargs): - super().__init__(**kwargs) - self.predicate = predicate - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("Filter operation requires exactly one stream") - - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - for tag, packet in stream: - if self.predicate(tag, packet): - yield tag, packet - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return f"Filter({self.predicate})" - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - function_content_hash(self.predicate), - ) + tuple(streams) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Filter does not alter the keys of the stream. - """ - if len(streams) != 1: - raise ValueError("Filter operation requires exactly one stream") - - stream = streams[0] - return stream.keys(trigger_run=trigger_run) - - -class Transform(Operator): - """ - A Mapper that transforms the packets in the stream based on a transformation function. - The transformation function should take two arguments: the tag and the packet, both as dictionaries. - The transformation function should return a tuple of (new_tag, new_packet). - """ - - def __init__( - self, transform: Callable[[Tag, Packet], tuple[Tag, Packet]], **kwargs - ): - super().__init__(**kwargs) - self.transform = transform - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("Transform operation requires exactly one stream") - - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - for tag, packet in stream: - yield self.transform(tag, packet) - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return f"Transform({self.transform})" - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - hash_function(self.transform), - ) + tuple(streams) - - -class Batch(Operator): - """ - A Mapper that batches the packets in the stream based on a batch size. - The batch size is the number of packets to include in each batch. - If the final batch is smaller than the batch size, it will be dropped unless drop_last=False. 
- """ - - def __init__( - self, - batch_size: int, - tag_processor: None | Callable[[Collection[Tag]], Tag] = None, - drop_last: bool = True, - **kwargs, - ): - super().__init__(**kwargs) - self.batch_size = batch_size - if tag_processor is None: - tag_processor = batch_tags # noqa: E731 - - self.tag_processor = tag_processor - self.drop_last = drop_last - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("Batch operation requires exactly one stream") - - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - batch_tags: list[Tag] = [] - batch_packets: list[Packet] = [] - for tag, packet in stream: - batch_tags.append(tag) - batch_packets.append(packet) - if len(batch_tags) == self.batch_size: - yield self.tag_processor(batch_tags), batch_packet(batch_packets) - batch_tags = [] - batch_packets = [] - if batch_tags and not self.drop_last: - yield self.tag_processor(batch_tags), batch_packet(batch_packets) - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return f"Batch(size={self.batch_size}, drop_last={self.drop_last})" - - def identity_structure(self, *streams): - return ( - self.__class__.__name__, - self.batch_size, - hash_function( - self.tag_processor, - function_hash_mode="name", - ), - self.drop_last, - ) + tuple(streams) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Batch does not alter the keys of the stream. - """ - if len(streams) != 1: - raise ValueError("Batch operation requires exactly one stream") - - stream = streams[0] - return stream.keys(trigger_run=trigger_run) - - -class GroupBy(Operator): - def __init__( - self, - group_keys: Collection[str] | None = None, - reduce_keys: bool = False, - selection_function: Callable[[Collection[tuple[Tag, Packet]]], Collection[bool]] - | None = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.group_keys = group_keys - self.reduce_keys = reduce_keys - self.selection_function = selection_function - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 1: - raise ValueError("GroupBy operation requires exactly one stream") - - stream = streams[0] - stream_keys, packet_keys = stream.keys() - stream_keys = stream_keys or [] - packet_keys = packet_keys or [] - group_keys = self.group_keys if self.group_keys is not None else stream_keys - - def generator() -> Iterator[tuple[Tag, Packet]]: - # step through all packets in the stream and group them by the specified keys - grouped_packets: dict[tuple, list[tuple[Tag, Packet]]] = defaultdict(list) - for tag, packet in stream: - key = tuple(tag.get(key, None) for key in group_keys) - grouped_packets[key].append((tag, packet)) - - for key, packets in grouped_packets.items(): - if self.selection_function is not None: - # apply the selection function to the grouped packets - selected_packets = self.selection_function(packets) - packets = [ - p for p, selected in zip(packets, selected_packets) if selected - ] - - if not packets: - continue - - # create a new tag that combines the group keys - # if reduce_keys is True, we only keep the group keys as a singular value - new_tag: Tag = {} - if self.reduce_keys: - new_tag = {k: key[i] for i, k in enumerate(group_keys)} - remaining_keys = set(stream_keys) - set(group_keys) - else: - remaining_keys = set(stream_keys) | set(group_keys) - # for remaining keys return list of tag values - for k in remaining_keys: - if k not in 
new_tag: - new_tag[k] = [t.get(k, None) for t, _ in packets] - # combine all packets into a single packet - combined_packet: Packet = Packet( - {k: [p.get(k, None) for _, p in packets] for k in packet_keys} - ) - yield new_tag, combined_packet - - return SyncStreamFromGenerator(generator) - - def identity_structure(self, *streams: SyncStream) -> Any: - struct = (self.__class__.__name__, self.group_keys, self.reduce_keys) - if self.selection_function is not None: - struct += (hash_function(self.selection_function),) - return struct + tuple(streams) - - -class CacheStream(Operator): - """ - A Mapper that caches the packets in the stream, thus avoiding upstream recomputation. - The cache is filled the first time the stream is iterated over. - For the next iterations, the cached packets are returned. - Call `clear_cache()` to clear the cache. - """ - - def __init__(self, **kwargs) -> None: - super().__init__(**kwargs) - self.cache: list[tuple[Tag, Packet]] = [] - self.is_cached = False - - def forward(self, *streams: SyncStream) -> SyncStream: - if not self.is_cached and len(streams) != 1: - raise ValueError("CacheStream operation requires exactly one stream") - - def generator() -> Iterator[tuple[Tag, Packet]]: - if not self.is_cached: - for tag, packet in streams[0]: - self.cache.append((tag, packet)) - yield tag, packet - self.is_cached = True - else: - for tag, packet in self.cache: - yield tag, packet - - return SyncStreamFromGenerator(generator) - - def clear_cache(self) -> None: - """ - Clear the cache. - """ - self.cache = [] - self.is_cached = False - - def __repr__(self) -> str: - return f"CacheStream(active:{self.is_cached})" - - def identity_structure(self, *streams): - # treat every CacheStream as a different stream - return None - - -def tag( - mapping: dict[str, str], drop_unmapped: bool = True -) -> Callable[[SyncStream], SyncStream]: - def transformer(stream: SyncStream) -> SyncStream: - """ - Transform the stream by renaming the keys in the tag. - The mapping is a dictionary that maps the old keys to the new keys. - """ - return MapTags(mapping, drop_unmapped=drop_unmapped)(stream) - - return transformer - - -def packet( - mapping: dict[str, str], drop_unmapped: bool = True -) -> Callable[[SyncStream], SyncStream]: - def transformer(stream: SyncStream) -> SyncStream: - """ - Transform the stream by renaming the keys in the packet. - The mapping is a dictionary that maps the old keys to the new keys. - """ - return MapPackets(mapping, drop_unmapped=drop_unmapped)(stream) - - return transformer diff --git a/src/orcapod/core/pod.py b/src/orcapod/core/pod.py deleted file mode 100644 index 3ca7d6b..0000000 --- a/src/orcapod/core/pod.py +++ /dev/null @@ -1,335 +0,0 @@ -import logging -import warnings -import sys -from collections.abc import Callable, Collection, Iterable, Iterator, Sequence -from typing import ( - Any, - Literal, -) - -from orcapod.types import Packet, Tag, TypeSpec, default_registry -from orcapod.types.typespec_utils import ( - extract_function_typespecs, - check_typespec_compatibility, -) -from orcapod.types.legacy.packets import PacketConverter - -from orcapod.hashing import ( - FunctionInfoExtractor, -) -from orcapod.hashing.legacy_core import get_function_signature -from orcapod.core import Kernel -from orcapod.core.operators import Join -from orcapod.core.streams import ( - SyncStream, - SyncStreamFromGenerator, -) - -logger = logging.getLogger(__name__) - - -class Pod(Kernel): - """ - An (abstract) base class for all pods. 
A pod can be seen as a special type of operation that - only operates on the packet content without reading tags. Consequently, no operation - of Pod can dependent on the tags of the packets. This is a design choice to ensure that - the pods act as pure functions which is a necessary condition to guarantee reproducibility. - """ - - def __init__( - self, error_handling: Literal["raise", "ignore", "warn"] = "raise", **kwargs - ): - super().__init__(**kwargs) - self._active = True - self.error_handling = error_handling - - def is_active(self) -> bool: - """ - Check if the pod is active. If not, it will not process any packets. - """ - return self._active - - def set_active(self, active: bool) -> None: - """ - Set the active state of the pod. If set to False, the pod will not process any packets. - """ - self._active = active - - def process_stream(self, *streams: SyncStream) -> tuple[SyncStream, ...]: - """ - Prepare the incoming streams for execution in the pod. This default implementation - joins all the input streams together. - """ - # if multiple streams are provided, join them - # otherwise, return as is - combined_streams = list(streams) - if len(streams) > 1: - stream = streams[0] - for next_stream in streams[1:]: - stream = Join()(stream, next_stream) - combined_streams = [stream] - return tuple(combined_streams) - - def pre_forward_hook( - self, *streams: SyncStream, **kwargs - ) -> tuple[SyncStream, ...]: - return self.process_stream(*streams) - - def generator_completion_hook(self, n_computed: int) -> None: - """ - Hook that is called when the generator is completed. This can be used to - perform any finalization steps, such as closing resources or logging. - """ - logger.debug(f"Generator completed with {n_computed} items processed.") - - def forward(self, *streams: SyncStream) -> SyncStream: - # at this point, streams should have been joined into one - assert len(streams) == 1, "Only one stream is supported in forward() of Pod" - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - n_computed = 0 - for tag, packet in stream: - try: - tag, output_packet = self.call(tag, packet) - if output_packet is None: - logger.debug( - f"Call returned None as output for tag {tag}. Skipping..." - ) - continue - n_computed += 1 - logger.debug(f"Computed item {n_computed}") - yield tag, output_packet - - except Exception as e: - logger.error(f"Error processing packet {packet}: {e}") - if self.error_handling == "raise": - raise e - elif self.error_handling == "warn": - warnings.warn(f"Error processing packet {packet}: {e}") - continue - elif self.error_handling == "ignore": - continue - else: - raise ValueError( - f"Unknown error handling mode: {self.error_handling} encountered while handling error:" - ) from e - self.generator_completion_hook(n_computed) - - return SyncStreamFromGenerator(generator) - - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: ... - - -def function_pod( - output_keys: str | Collection[str] | None = None, - function_name: str | None = None, - label: str | None = None, - **kwargs, -) -> Callable[..., "FunctionPod"]: - """ - Decorator that wraps a function in a FunctionPod instance. - - Args: - output_keys: Keys for the function output(s) - function_name: Name of the function pod; if None, defaults to the function name - **kwargs: Additional keyword arguments to pass to the FunctionPod constructor. Please refer to the FunctionPod documentation for details. 
-
-    Returns:
-        FunctionPod instance wrapping the decorated function
-    """
-
-    def decorator(func) -> FunctionPod:
-        if func.__name__ == "<lambda>":
-            raise ValueError("Lambda functions cannot be used with function_pod")
-
-        if not hasattr(func, "__module__") or func.__module__ is None:
-            raise ValueError(
-                f"Function {func.__name__} must be defined at module level"
-            )
-
-        # Store the original function in the module for pickling purposes
-        # and make sure to change the name of the function
-        module = sys.modules[func.__module__]
-        base_function_name = func.__name__
-        new_function_name = f"_original_{func.__name__}"
-        setattr(module, new_function_name, func)
-        # rename the function to be consistent and make it pickleable
-        setattr(func, "__name__", new_function_name)
-        setattr(func, "__qualname__", new_function_name)
-
-        # Create a simple typed function pod
-        pod = FunctionPod(
-            function=func,
-            output_keys=output_keys,
-            function_name=function_name or base_function_name,
-            label=label,
-            **kwargs,
-        )
-        return pod
-
-    return decorator
-
-
-class FunctionPod(Pod):
-    def __init__(
-        self,
-        function: Callable[..., Any],
-        output_keys: str | Collection[str] | None = None,
-        function_name=None,
-        input_types: TypeSpec | None = None,
-        output_types: TypeSpec | Sequence[type] | None = None,
-        label: str | None = None,
-        packet_type_registry=None,
-        function_info_extractor: FunctionInfoExtractor | None = None,
-        **kwargs,
-    ) -> None:
-        self.function = function
-        if output_keys is None:
-            output_keys = []
-        if isinstance(output_keys, str):
-            output_keys = [output_keys]
-        self.output_keys = output_keys
-        if function_name is None:
-            if hasattr(self.function, "__name__"):
-                function_name = getattr(self.function, "__name__")
-            else:
-                raise ValueError(
-                    "function_name must be provided if function has no __name__ attribute"
-                )
-        self.function_name = function_name
-        super().__init__(label=label or self.function_name, **kwargs)
-
-        if packet_type_registry is None:
-            # TODO: reconsider the use of default registry here
-            packet_type_registry = default_registry
-
-        self.registry = packet_type_registry
-        self.function_info_extractor = function_info_extractor
-
-        # extract input and output types from the function signature
-        self.function_input_typespec, self.function_output_typespec = (
-            extract_function_typespecs(
-                self.function,
-                self.output_keys,
-                input_types=input_types,
-                output_types=output_types,
-            )
-        )
-
-        self.input_converter = PacketConverter(
-            self.function_input_typespec, self.registry
-        )
-        self.output_converter = PacketConverter(
-            self.function_output_typespec, self.registry
-        )
-
-    def forward(self, *streams: SyncStream, **kwargs) -> SyncStream:
-        assert len(streams) == 1, (
-            "Only one stream is supported in forward() of FunctionPod"
-        )
-        stream = streams[0]
-        _, packet_typespec = stream.types(trigger_run=False)
-        if packet_typespec is not None and not check_typespec_compatibility(
-            packet_typespec, self.function_input_typespec
-        ):
-            raise TypeError(
-                f"Input packet types {packet_typespec} is not compatible with the function's expected input types {self.function_input_typespec}"
-            )
-        return super().forward(*streams, **kwargs)
-
-    def get_function_typespecs(self) -> tuple[TypeSpec, TypeSpec]:
-        return self.function_input_typespec, self.function_output_typespec
-
-    def __repr__(self) -> str:
-        return f"FunctionPod:{self.function!r}"
-
-    def __str__(self) -> str:
-        include_module = self.function.__module__ != "__main__"
-        func_sig = get_function_signature(
-            self.function,
-
name_override=self.function_name, - include_module=include_module, - ) - return f"FunctionPod:{func_sig}" - - def call(self, tag, packet) -> tuple[Tag, Packet | None]: - if not self.is_active(): - logger.info( - f"Pod is not active: skipping computation on input packet {packet}" - ) - return tag, None - output_values = [] - - values = self.function(**packet) - - if len(self.output_keys) == 0: - output_values = [] - elif len(self.output_keys) == 1: - output_values = [values] # type: ignore - elif isinstance(values, Iterable): - output_values = list(values) # type: ignore - elif len(self.output_keys) > 1: - raise ValueError( - "Values returned by function must be a pathlike or a sequence of pathlikes" - ) - - if len(output_values) != len(self.output_keys): - raise ValueError( - f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" - ) - - output_packet: Packet = Packet( - {k: v for k, v in zip(self.output_keys, output_values)} - ) - return tag, output_packet - - def identity_structure(self, *streams) -> Any: - # construct identity structure for the function - # if function_info_extractor is available, use that but substitute the function_name - if self.function_info_extractor is not None: - function_info = self.function_info_extractor.extract_function_info( - self.function, - function_name=self.function_name, - input_typespec=self.function_input_typespec, - output_typespec=self.function_output_typespec, - ) - else: - # use basic information only - function_info = { - "name": self.function_name, - "input_typespec": self.function_input_typespec, - "output_typespec": self.function_output_typespec, - } - function_info["output_keys"] = tuple(self.output_keys) - - return ( - self.__class__.__name__, - function_info, - ) + streams - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - stream = self.process_stream(*streams) - if len(stream) < 1: - tag_keys = None - else: - tag_keys, _ = stream[0].keys(trigger_run=trigger_run) - return tag_keys, tuple(self.output_keys) - - def types( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - stream = self.process_stream(*streams) - if len(stream) < 1: - tag_typespec = None - else: - tag_typespec, _ = stream[0].types(trigger_run=trigger_run) - return tag_typespec, self.function_output_typespec - - def claims_unique_tags( - self, *streams: SyncStream, trigger_run: bool = False - ) -> bool | None: - stream = self.process_stream(*streams) - return stream[0].claims_unique_tags(trigger_run=trigger_run) diff --git a/src/orcapod/core/pod_legacy.py b/src/orcapod/core/pod_legacy.py deleted file mode 100644 index 18099c6..0000000 --- a/src/orcapod/core/pod_legacy.py +++ /dev/null @@ -1,373 +0,0 @@ -import logging -import warnings -import sys -from collections.abc import Callable, Collection, Iterable, Iterator -from typing import ( - Any, - Literal, -) - -from orcapod.types import Packet, PathSet, PodFunction, Tag - -from orcapod.hashing import ( - get_function_signature, - hash_function, -) -from orcapod.core.base import Kernel -from orcapod.core.operators import Join -from orcapod.core.streams import SyncStream, SyncStreamFromGenerator -from orcapod.stores import DataStore, NoOpDataStore - - -logger = logging.getLogger(__name__) - - -class Pod(Kernel): - """ - An (abstract) base class for all pods. 
A pod can be seen as a special type of operation that - only operates on the packet content without reading tags. Consequently, no operation - of Pod can dependent on the tags of the packets. This is a design choice to ensure that - the pods act as pure functions which is a necessary condition to guarantee reproducibility. - """ - - def __init__( - self, error_handling: Literal["raise", "ignore", "warn"] = "raise", **kwargs - ): - super().__init__(**kwargs) - self.error_handling = error_handling - self._active = True - - def set_active(self, active=True): - self._active = active - - def is_active(self) -> bool: - return self._active - - def process_stream(self, *streams: SyncStream) -> tuple[SyncStream, ...]: - """ - Prepare the incoming streams for execution in the pod. This default implementation - joins all the streams together and raises and error if no streams are provided. - """ - # if multiple streams are provided, join them - # otherwise, return as is - combined_streams = list(streams) - if len(streams) > 1: - stream = streams[0] - for next_stream in streams[1:]: - stream = Join()(stream, next_stream) - combined_streams = [stream] - return tuple(combined_streams) - - def pre_forward_hook( - self, *streams: SyncStream, **kwargs - ) -> tuple[SyncStream, ...]: - return self.process_stream(*streams) - - def forward(self, *streams: SyncStream) -> SyncStream: - # if multiple streams are provided, join them - if len(streams) > 1: - raise ValueError("Multiple streams should be joined before calling forward") - if len(streams) == 0: - raise ValueError("No streams provided to forward") - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - n_computed = 0 - for tag, packet in stream: - try: - tag, output_packet = self.call(tag, packet) - if output_packet is None: - logger.info( - f"Call returned None as output for tag {tag}. Skipping..." - ) - continue - n_computed += 1 - logger.info(f"Computed item {n_computed}") - yield tag, output_packet - - except Exception as e: - logger.error(f"Error processing packet {packet}: {e}") - if self.error_handling == "raise": - raise e - elif self.error_handling == "warn": - warnings.warn(f"Error processing packet {packet}: {e}") - continue - elif self.error_handling == "ignore": - continue - else: - raise ValueError( - f"Unknown error handling mode: {self.error_handling} encountered while handling error:" - ) from e - - return SyncStreamFromGenerator(generator) - - def call(self, tag: Tag, packet: Packet) -> tuple[Tag, Packet | None]: ... - - -def function_pod( - output_keys: Collection[str] | None = None, - function_name: str | None = None, - data_store: DataStore | None = None, - store_name: str | None = None, - function_hash_mode: Literal["signature", "content", "name", "custom"] = "name", - custom_hash: int | None = None, - force_computation: bool = False, - skip_memoization: bool = False, - error_handling: Literal["raise", "ignore", "warn"] = "raise", - **kwargs, -) -> Callable[..., "FunctionPod"]: - """ - Decorator that wraps a function in a FunctionPod instance. 
-
-    Args:
-        output_keys: Keys for the function output
-        force_computation: Whether to force computation
-        skip_memoization: Whether to skip memoization
-
-    Returns:
-        FunctionPod instance wrapping the decorated function
-    """
-
-    def decorator(func) -> FunctionPod:
-        if func.__name__ == "<lambda>":
-            raise ValueError("Lambda functions cannot be used with function_pod")
-
-        if not hasattr(func, "__module__") or func.__module__ is None:
-            raise ValueError(
-                f"Function {func.__name__} must be defined at module level"
-            )
-
-        # Store the original function in the module for pickling purposes
-        # and make sure to change the name of the function
-        module = sys.modules[func.__module__]
-        base_function_name = func.__name__
-        new_function_name = f"_original_{func.__name__}"
-        setattr(module, new_function_name, func)
-        # rename the function to be consistent and make it pickleable
-        setattr(func, "__name__", new_function_name)
-        setattr(func, "__qualname__", new_function_name)
-
-        # Create the FunctionPod
-        pod = FunctionPod(
-            function=func,
-            output_keys=output_keys,
-            function_name=function_name or base_function_name,
-            data_store=data_store,
-            store_name=store_name,
-            function_hash_mode=function_hash_mode,
-            custom_hash=custom_hash,
-            force_computation=force_computation,
-            skip_memoization=skip_memoization,
-            error_handling=error_handling,
-            **kwargs,
-        )
-
-        return pod
-
-    return decorator
-
-
-class FunctionPod(Pod):
-    """
-    A pod that wraps a function and allows it to be used as an operation in a stream.
-    This pod can be used to apply a function to the packets in a stream, with optional memoization
-    and caching of results. It can also handle multiple output keys and error handling.
-    The function should accept keyword arguments that correspond to the keys in the packets.
- The output of the function should be a path or a collection of paths that correspond to the output keys.""" - - def __init__( - self, - function: PodFunction, - output_keys: Collection[str] | None = None, - function_name=None, - data_store: DataStore | None = None, - store_name: str | None = None, - function_hash_mode: Literal["signature", "content", "name", "custom"] = "name", - custom_hash: int | None = None, - label: str | None = None, - force_computation: bool = False, - skip_memoization_lookup: bool = False, - skip_memoization: bool = False, - error_handling: Literal["raise", "ignore", "warn"] = "raise", - _hash_function_kwargs: dict | None = None, - **kwargs, - ) -> None: - super().__init__(label=label, **kwargs) - self.function = function - self.output_keys = output_keys or [] - if function_name is None: - if hasattr(self.function, "__name__"): - function_name = getattr(self.function, "__name__") - else: - raise ValueError( - "function_name must be provided if function has no __name__ attribute" - ) - - self.function_name = function_name - self.data_store = data_store if data_store is not None else NoOpDataStore() - self.store_name = store_name or function_name - self.function_hash_mode = function_hash_mode - self.custom_hash = custom_hash - self.force_computation = force_computation - self.skip_memoization_lookup = skip_memoization_lookup - self.skip_memoization = skip_memoization - self.error_handling = error_handling - self._hash_function_kwargs = _hash_function_kwargs - - def __repr__(self) -> str: - func_sig = get_function_signature(self.function) - return f"FunctionPod:{func_sig} ⇒ {self.output_keys}" - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - stream = self.process_stream(*streams) - tag_keys, _ = stream[0].keys(trigger_run=trigger_run) - return tag_keys, tuple(self.output_keys) - - def is_memoized(self, packet: Packet) -> bool: - return self.retrieve_memoized(packet) is not None - - def retrieve_memoized(self, packet: Packet) -> Packet | None: - """ - Retrieve a memoized packet from the data store. - Returns None if no memoized packet is found. - """ - return self.data_store.retrieve_memoized( - self.store_name, - self.content_hash(char_count=16), - packet, - ) - - def memoize( - self, - packet: Packet, - output_packet: Packet, - ) -> Packet: - """ - Memoize the output packet in the data store. - Returns the memoized packet. 
- """ - return self.data_store.memoize( - self.store_name, - self.content_hash(char_count=16), # identity of this function pod - packet, - output_packet, - ) - - def forward(self, *streams: SyncStream) -> SyncStream: - # if multiple streams are provided, join them - if len(streams) > 1: - raise ValueError("Multiple streams should be joined before calling forward") - if len(streams) == 0: - raise ValueError("No streams provided to forward") - stream = streams[0] - - def generator() -> Iterator[tuple[Tag, Packet]]: - n_computed = 0 - for tag, packet in stream: - output_values: list["PathSet"] = [] - try: - if not self.skip_memoization_lookup: - memoized_packet = self.retrieve_memoized(packet) - else: - memoized_packet = None - if not self.force_computation and memoized_packet is not None: - logger.info("Memoized packet found, skipping computation") - yield tag, memoized_packet - continue - if not self.is_active(): - logger.info( - "Pod is not active: skipping computation of a new entry" - ) - continue - values = self.function(**packet) - - if len(self.output_keys) == 0: - output_values = [] - elif len(self.output_keys) == 1: - output_values = [values] # type: ignore - elif isinstance(values, Iterable): - output_values = list(values) # type: ignore - elif len(self.output_keys) > 1: - raise ValueError( - "Values returned by function must be a pathlike or a sequence of pathlikes" - ) - - if len(output_values) != len(self.output_keys): - raise ValueError( - f"Number of output keys {len(self.output_keys)}:{self.output_keys} does not match number of values returned by function {len(output_values)}" - ) - except Exception as e: - logger.error(f"Error processing packet {packet}: {e}") - if self.error_handling == "raise": - raise e - elif self.error_handling == "ignore": - continue - elif self.error_handling == "warn": - warnings.warn(f"Error processing packet {packet}: {e}") - continue - else: - raise ValueError( - f"Unknown error handling mode: {self.error_handling} encountered while handling error:" - ) from e - - output_packet: Packet = { - k: v for k, v in zip(self.output_keys, output_values) - } - - if not self.skip_memoization: - # output packet may be modified by the memoization process - # e.g. 
if the output is a file, the path may be changed - output_packet = self.memoize(packet, output_packet) # type: ignore - - n_computed += 1 - logger.info(f"Computed item {n_computed}") - yield tag, output_packet - - return SyncStreamFromGenerator(generator) - - def identity_structure(self, *streams) -> Any: - content_kwargs = self._hash_function_kwargs - if self.function_hash_mode == "content": - if content_kwargs is None: - content_kwargs = { - "include_name": False, - "include_module": False, - "include_declaration": False, - } - function_hash_value = hash_function( - self.function, - name_override=self.function_name, - function_hash_mode="content", - content_kwargs=content_kwargs, - ) - elif self.function_hash_mode == "signature": - function_hash_value = hash_function( - self.function, - name_override=self.function_name, - function_hash_mode="signature", - content_kwargs=content_kwargs, - ) - elif self.function_hash_mode == "name": - function_hash_value = hash_function( - self.function, - name_override=self.function_name, - function_hash_mode="name", - content_kwargs=content_kwargs, - ) - elif self.function_hash_mode == "custom": - if self.custom_hash is None: - raise ValueError("Custom hash function not provided") - function_hash_value = self.custom_hash - else: - raise ValueError( - f"Unknown function hash mode: {self.function_hash_mode}. " - "Must be one of 'content', 'signature', 'name', or 'custom'." - ) - - return ( - self.__class__.__name__, - function_hash_value, - tuple(self.output_keys), - ) + tuple(streams) diff --git a/src/orcapod/core/sources.py b/src/orcapod/core/sources.py deleted file mode 100644 index b1dca7d..0000000 --- a/src/orcapod/core/sources.py +++ /dev/null @@ -1,204 +0,0 @@ -from collections.abc import Callable, Collection, Iterator -from os import PathLike -from pathlib import Path -from typing import Any, Literal - -import polars as pl - -from orcapod.core.base import Source -from orcapod.hashing.legacy_core import hash_function -from orcapod.core.streams import ( - PolarsStream, - SyncStream, - SyncStreamFromGenerator, - StreamWrapper, -) -from orcapod.types import Packet, Tag, TypeSpec - - -class GlobSource(Source): - """ - A stream source that sources files from a directory matching a glob pattern. - - For each matching file, yields a tuple containing: - - A tag generated either by the provided tag_function or defaulting to the file's stem name - - A packet containing the file path under the provided name key - - Parameters - ---------- - name : str - The key name under which the file path will be stored in the packet - file_path : PathLike - The directory path to search for files - pattern : str, default='*' - The glob pattern to match files against - tag_key : Optional[Union[str, Callable[[PathLike], Tag]]], default=None - Optional function to generate a tag from a file path. If None, uses the file's - stem name (without extension) in a dict with key 'file_name'. If only string is - provided, it will be used as the key for the tag. If a callable is provided, it - should accept a file path and return a dictionary of tags. - - Examples - -------- - >>> # Match all .txt files in data_dir, using filename as tag - >>> glob_source = GlobSource('txt_file', 'data_dir', '*.txt') - >>> # Match all files but use custom tag function - >>> glob_source = GlobSource('file', 'data_dir', '*', - ... 
lambda f: {'date': Path(f).stem[:8]}) - """ - - @staticmethod - def default_tag_function(f: PathLike) -> Tag: - return {"file_name": Path(f).stem} # noqa: E731 - - def __init__( - self, - name: str, - file_path: PathLike, - pattern: str = "*", - absolute_path: bool = False, - label: str | None = None, - tag_function: Callable[[PathLike], Tag] | None = None, - tag_function_hash_mode: Literal["content", "signature", "name"] = "name", - expected_tag_keys: Collection[str] | None = None, - **kwargs, - ) -> None: - super().__init__(label=label, **kwargs) - self.name = name - file_path = Path(file_path) - if absolute_path: - file_path = file_path.resolve() - self.file_path = file_path - self.pattern = pattern - self.expected_tag_keys = expected_tag_keys - if tag_function is None: - tag_function = self.__class__.default_tag_function - self.tag_function: Callable[[PathLike], Tag] = tag_function - self.tag_function_hash_mode = tag_function_hash_mode - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 0: - raise ValueError( - "GlobSource does not support forwarding streams. " - "It generates its own stream from the file system." - ) - - def generator() -> Iterator[tuple[Tag, Packet]]: - for file in Path(self.file_path).glob(self.pattern): - yield self.tag_function(file), Packet({self.name: str(file)}) - - return SyncStreamFromGenerator(generator) - - def __repr__(self) -> str: - return f"GlobSource({str(Path(self.file_path) / self.pattern)}) ⇒ {self.name}" - - def identity_structure(self, *streams) -> Any: - hash_function_kwargs = {} - if self.tag_function_hash_mode == "content": - # if using content hash, exclude few - hash_function_kwargs = { - "include_name": False, - "include_module": False, - "include_declaration": False, - } - - tag_function_hash = hash_function( - self.tag_function, - function_hash_mode=self.tag_function_hash_mode, # type: ignore - hash_kwargs=hash_function_kwargs, - ) - return ( - self.__class__.__name__, - self.name, - str(self.file_path), - self.pattern, - tag_function_hash, - ) + tuple(streams) - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - """ - Returns the keys of the stream. The keys are the names of the packets - in the stream. The keys are used to identify the packets in the stream. - If expected_keys are provided, they will be used instead of the default keys. - """ - if len(streams) != 0: - raise ValueError( - "GlobSource does not support forwarding streams. " - "It generates its own stream from the file system." - ) - - if self.expected_tag_keys is not None: - return tuple(self.expected_tag_keys), (self.name,) - return super().keys(trigger_run=trigger_run) - - def claims_unique_tags( - self, *streams: "SyncStream", trigger_run: bool = True - ) -> bool | None: - if len(streams) != 0: - raise ValueError( - "GlobSource does not support forwarding streams. " - "It generates its own stream from the file system." 
- ) - # Claim uniqueness only if the default tag function is used - if self.tag_function == self.__class__.default_tag_function: - return True - # Otherwise, delegate to the base class - return super().claims_unique_tags(trigger_run=trigger_run) - - -class PolarsSource(Source): - def __init__( - self, - df: pl.DataFrame, - tag_keys: Collection[str], - packet_keys: Collection[str] | None = None, - ): - self.df = df - self.tag_keys = tag_keys - self.packet_keys = packet_keys - - def forward(self, *streams: SyncStream, **kwargs) -> SyncStream: - if len(streams) != 0: - raise ValueError( - "PolarsSource does not support forwarding streams. " - "It generates its own stream from the DataFrame." - ) - return PolarsStream(self.df, self.tag_keys, self.packet_keys) - - -class StreamSource(Source): - def __init__(self, stream: SyncStream, **kwargs): - super().__init__(skip_tracking=True, **kwargs) - self.stream = stream - - def forward(self, *streams: SyncStream) -> SyncStream: - if len(streams) != 0: - raise ValueError( - "StreamSource does not support forwarding streams. " - "It generates its own stream from the file system." - ) - return StreamWrapper(self.stream) - - def identity_structure(self, *streams) -> Any: - if len(streams) != 0: - raise ValueError( - "StreamSource does not support forwarding streams. " - "It generates its own stream from the file system." - ) - - return (self.__class__.__name__, self.stream) - - def types( - self, *streams: SyncStream, **kwargs - ) -> tuple[TypeSpec | None, TypeSpec | None]: - return self.stream.types() - - def keys( - self, *streams: SyncStream, **kwargs - ) -> tuple[Collection[str] | None, Collection[str] | None]: - return self.stream.keys() - - def computed_label(self) -> str | None: - return self.stream.label diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py deleted file mode 100644 index 170c80d..0000000 --- a/src/orcapod/core/streams.py +++ /dev/null @@ -1,203 +0,0 @@ -from collections.abc import Callable, Collection, Iterator - -import polars as pl - -from orcapod.core.base import SyncStream -from orcapod.types import Packet, PacketLike, Tag, TypeSpec -from copy import copy - - -class SyncStreamFromLists(SyncStream): - def __init__( - self, - tags: Collection[Tag] | None = None, - packets: Collection[PacketLike] | None = None, - paired: Collection[tuple[Tag, PacketLike]] | None = None, - tag_keys: list[str] | None = None, - packet_keys: list[str] | None = None, - tag_typespec: TypeSpec | None = None, - packet_typespec: TypeSpec | None = None, - strict: bool = True, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.tag_typespec = tag_typespec - self.packet_typespec = packet_typespec - if tag_keys is None and tag_typespec is not None: - tag_keys = list(tag_typespec.keys()) - if packet_keys is None and packet_typespec is not None: - packet_keys = list(packet_typespec.keys()) - self.tag_keys = tag_keys - self.packet_keys = packet_keys - - if tags is not None and packets is not None: - if strict and len(tags) != len(packets): - raise ValueError( - "tags and packets must have the same length if both are provided" - ) - self.paired = list((t, Packet(v)) for t, v in zip(tags, packets)) - elif paired is not None: - self.paired = list((t, Packet(v)) for t, v in paired) - else: - raise ValueError( - "Either tags and packets or paired must be provided to SyncStreamFromLists" - ) - - def keys( - self, *, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - tag_keys, packet_keys = 
copy(self.tag_keys), copy(self.packet_keys) - if tag_keys is None or packet_keys is None: - super_tag_keys, super_packet_keys = super().keys(trigger_run=trigger_run) - tag_keys = tag_keys or super_tag_keys - packet_keys = packet_keys or super_packet_keys - - # If the keys are already set, return them - return tag_keys, packet_keys - - def types( - self, *, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - tag_typespec, packet_typespec = ( - copy(self.tag_typespec), - copy(self.packet_typespec), - ) - if tag_typespec is None or packet_typespec is None: - super_tag_typespec, super_packet_typespec = super().types( - trigger_run=trigger_run - ) - tag_typespec = tag_typespec or super_tag_typespec - packet_typespec = packet_typespec or super_packet_typespec - - # If the types are already set, return them - return tag_typespec, packet_typespec - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - yield from self.paired - - -class SyncStreamFromGenerator(SyncStream): - """ - A synchronous stream that is backed by a generator function. - """ - - def __init__( - self, - generator_factory: Callable[[], Iterator[tuple[Tag, Packet]]], - tag_keys: list[str] | None = None, - packet_keys: list[str] | None = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.tag_keys = tag_keys - self.packet_keys = packet_keys - self.generator_factory = generator_factory - self.check_consistency = False - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - if not self.check_consistency: - yield from self.generator_factory() - - # TODO: add typespec handling - def keys( - self, *, trigger_run: bool = False - ) -> tuple[Collection[str] | None, Collection[str] | None]: - if self.tag_keys is None or self.packet_keys is None: - return super().keys(trigger_run=trigger_run) - # If the keys are already set, return them - return self.tag_keys.copy(), self.packet_keys.copy() - - -class PolarsStream(SyncStream): - def __init__( - self, - df: pl.DataFrame, - tag_keys: Collection[str], - packet_keys: Collection[str] | None = None, - ): - self.df = df - self.tag_keys = tuple(tag_keys) - self.packet_keys = tuple(packet_keys) if packet_keys is not None else None - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - df = self.df - # if self.packet_keys is not None: - # df = df.select(self.tag_keys + self.packet_keys) - for row in df.iter_rows(named=True): - tag = {key: row[key] for key in self.tag_keys} - packet = { - key: val - for key, val in row.items() - if key not in self.tag_keys and not key.startswith("_source_info_") - } - # TODO: revisit and fix this rather hacky implementation - source_info = { - key.removeprefix("_source_info_"): val - for key, val in row.items() - if key.startswith("_source_info_") - } - yield tag, Packet(packet, source_info=source_info) - - -class EmptyStream(SyncStream): - def __init__( - self, - tag_keys: Collection[str] | None = None, - packet_keys: Collection[str] | None = None, - tag_typespec: TypeSpec | None = None, - packet_typespec: TypeSpec | None = None, - ): - if tag_keys is None and tag_typespec is not None: - tag_keys = tag_typespec.keys() - self.tag_keys = list(tag_keys) if tag_keys else [] - - if packet_keys is None and packet_typespec is not None: - packet_keys = packet_typespec.keys() - self.packet_keys = list(packet_keys) if packet_keys else [] - - self.tag_typespec = tag_typespec - self.packet_typespec = packet_typespec - - def keys( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[Collection[str] | None, 
Collection[str] | None]: - return self.tag_keys, self.packet_keys - - def types( - self, *streams: SyncStream, trigger_run: bool = False - ) -> tuple[TypeSpec | None, TypeSpec | None]: - return self.tag_typespec, self.packet_typespec - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - # Empty stream, no data to yield - return iter([]) - - -class StreamWrapper(SyncStream): - """ - A wrapper for a SyncStream that allows the stream to be labeled and - associated with an invocation without modifying the original stream. - """ - - def __init__(self, stream: SyncStream, **kwargs): - super().__init__(**kwargs) - self.stream = stream - - def keys( - self, *streams: SyncStream, **kwargs - ) -> tuple[Collection[str] | None, Collection[str] | None]: - return self.stream.keys(*streams, **kwargs) - - def types( - self, *streams: SyncStream, **kwargs - ) -> tuple[TypeSpec | None, TypeSpec | None]: - return self.stream.types(*streams, **kwargs) - - def computed_label(self) -> str | None: - return self.stream.label - - def __iter__(self) -> Iterator[tuple[Tag, Packet]]: - """ - Iterate over the stream, yielding tuples of (tags, packets). - """ - yield from self.stream diff --git a/src/orcapod/core/tracker.py b/src/orcapod/core/tracker.py deleted file mode 100644 index e0a2bd7..0000000 --- a/src/orcapod/core/tracker.py +++ /dev/null @@ -1,85 +0,0 @@ -from orcapod.core.base import Invocation, Kernel, Tracker -from orcapod.core.sources import StreamSource - - -class GraphTracker(Tracker): - """ - A tracker that records the invocations of operations and generates a graph - of the invocations and their dependencies. - """ - - # Thread-local storage to track active trackers - - def __init__(self) -> None: - super().__init__() - self.invocation_lut: dict[Kernel, list[Invocation]] = {} - - def record(self, invocation: Invocation) -> None: - invocation_list = self.invocation_lut.setdefault(invocation.kernel, []) - if invocation not in invocation_list: - invocation_list.append(invocation) - - def reset(self) -> dict[Kernel, list[Invocation]]: - """ - Reset the tracker and return the recorded invocations. 
-        """
-        recorded_invocations = self.invocation_lut
-        self.invocation_lut = {}
-        return recorded_invocations
-
-    def generate_namemap(self) -> dict[Invocation, str]:
-        namemap = {}
-        for kernel, invocations in self.invocation_lut.items():
-            # if only one entry present, use the kernel name alone
-            if kernel.label is not None:
-                node_label = kernel.label
-            else:
-                node_label = str(kernel)
-            if len(invocations) == 1:
-                namemap[invocations[0]] = node_label
-                continue
-            # if multiple entries, use the kernel name and index
-            for idx, invocation in enumerate(invocations):
-                namemap[invocation] = f"{node_label}_{idx}"
-        return namemap
-
-    def generate_graph(self):
-        import networkx as nx
-
-        G = nx.DiGraph()
-
-        # Add edges for each invocation
-        for kernel, invocations in self.invocation_lut.items():
-            for invocation in invocations:
-                for upstream in invocation.streams:
-                    # if upstream.invocation is not in the graph, add it
-                    upstream_invocation = upstream.invocation
-                    if upstream_invocation is None:
-                        # If upstream is None, create a stub kernel
-                        upstream_invocation = Invocation(StreamSource(upstream), [])
-                    if upstream_invocation not in G:
-                        G.add_node(upstream_invocation)
-                    G.add_edge(upstream_invocation, invocation, stream=upstream)
-
-        return G
-
-    def draw_graph(self):
-        import networkx as nx
-        import matplotlib.pyplot as plt
-
-        G = self.generate_graph()
-        labels = self.generate_namemap()
-
-        pos = nx.drawing.nx_agraph.graphviz_layout(G, prog="dot")
-        nx.draw(
-            G,
-            pos,
-            labels=labels,
-            node_size=2000,
-            node_color="lightblue",
-            with_labels=True,
-            font_size=10,
-            font_weight="bold",
-            arrowsize=20,
-        )
-        plt.tight_layout()

From 2561b55b733444b7ae365f974617bd725aa263b1 Mon Sep 17 00:00:00 2001
From: "Edgar Y. Walker"
Date: Tue, 26 Aug 2025 14:16:27 -0700
Subject: [PATCH 200/224] refactor: remove deprecated modules

---
 src/orcapod/hashing/versioned_hashers.py   |  88 ----
 src/orcapod/types/__init__.py              |  15 -
 src/orcapod/types/core.py                  |  34 --
 src/orcapod/types/legacy/packets.py        | 349 -------
 .../types/legacy/semantic_type_handlers.py |  94 ----
 .../types/legacy/semantic_type_registry.py | 461 ------------------
 src/orcapod/types/typespec_utils.py        | 301 ------------
 7 files changed, 1342 deletions(-)
 delete mode 100644 src/orcapod/hashing/versioned_hashers.py
 delete mode 100644 src/orcapod/types/__init__.py
 delete mode 100644 src/orcapod/types/core.py
 delete mode 100644 src/orcapod/types/legacy/packets.py
 delete mode 100644 src/orcapod/types/legacy/semantic_type_handlers.py
 delete mode 100644 src/orcapod/types/legacy/semantic_type_registry.py
 delete mode 100644 src/orcapod/types/typespec_utils.py

diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py
deleted file mode 100644
index 91b7931..0000000
--- a/src/orcapod/hashing/versioned_hashers.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# A collection of versioned hashers that provide a "default" implementation of hashers.
-from .arrow_hashers import SemanticArrowHasher -from orcapod.utils.object_spec import parse_objectspec -from orcapod.protocols.hashing_protocols import ObjectHasher - -CURRENT_VERSION = "v0.1" - -versioned_semantic_arrow_hashers = { - "v0.1": { - "_class": "orcapod.hashing.arrow_hashers.SemanticArrowHasher", - "_config": { - "hasher_id": "arrow_v0.1", - "hash_algorithm": "sha256", - "chunk_size": 8192, - "serialization_method": "logical", - "semantic_type_hashers": { - "path": { - "_class": "orcapod.hashing.semantic_type_hashers.PathHasher", - "_config": { - "file_hasher": { - "_class": "orcapod.hashing.file_hashers.BasicFileHasher", - "_config": { - "algorithm": "sha256", - }, - } - }, - } - }, - }, - } -} - -versioned_object_hashers = { - "v0.1": { - "_class": "orcapod.hashing.object_hashers.BasicObjectHasher", - "_config": { - "hasher_id": "object_v0.1", - "function_info_extractor": { - "_class": "orcapod.hashing.function_info_extractors.FunctionSignatureExtractor", - "_config": {"include_module": True, "include_defaults": True}, - }, - }, - } -} - - -def get_versioned_semantic_arrow_hasher( - version: str | None = None, -) -> SemanticArrowHasher: - """ - Get the versioned hasher for the specified version. - - Args: - version (str): The version of the hasher to retrieve. - - Returns: - ArrowHasher: An instance of the arrow hasher of the specified version. - """ - if version is None: - version = CURRENT_VERSION - - if version not in versioned_semantic_arrow_hashers: - raise ValueError(f"Unsupported hasher version: {version}") - - hasher_spec = versioned_semantic_arrow_hashers[version] - return parse_objectspec(hasher_spec) - - -def get_versioned_object_hasher( - version: str | None = None, -) -> ObjectHasher: - """ - Get an object hasher for the specified version. - - Args: - version (str): The version of the hasher to retrieve. - - Returns: - Object: An instance of the object hasher of the specified version. - """ - if version is None: - version = CURRENT_VERSION - - if version not in versioned_object_hashers: - raise ValueError(f"Unsupported hasher version: {version}") - - hasher_spec = versioned_object_hashers[version] - return parse_objectspec(hasher_spec) diff --git a/src/orcapod/types/__init__.py b/src/orcapod/types/__init__.py deleted file mode 100644 index c08aa6a..0000000 --- a/src/orcapod/types/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from .core import PathLike, PathSet, TypeSpec, DataValue -from . 
import typespec_utils - -Packet = dict[str, str] -PacketLike = Packet - - -__all__ = [ - "TypeSpec", - "PathLike", - "PathSet", - "typespec_utils", - "DataValue", - "default_registry", -] diff --git a/src/orcapod/types/core.py b/src/orcapod/types/core.py deleted file mode 100644 index b43d21a..0000000 --- a/src/orcapod/types/core.py +++ /dev/null @@ -1,34 +0,0 @@ -from typing import Protocol, Any, TypeAlias -import os -from collections.abc import Collection, Mapping - -import logging - -logger = logging.getLogger(__name__) - -DataType: TypeAlias = type - -TypeSpec: TypeAlias = Mapping[ - str, DataType -] # Mapping of parameter names to their types - -# Convenience alias for anything pathlike -PathLike = str | os.PathLike - -# an (optional) string or a collection of (optional) string values -# Note that TagValue can be nested, allowing for an arbitrary depth of nested lists -TagValue: TypeAlias = int | str | None | Collection["TagValue"] - -# a pathset is a path or an arbitrary depth of nested list of paths -PathSet: TypeAlias = PathLike | Collection[PathLike | None] - -# Simple data types that we support (with clear Polars correspondence) -SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes - -ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathSet - -# Extended data values that can be stored in packets -# Either the original PathSet or one of our supported simple data types -DataValue: TypeAlias = ExtendedSupportedPythonData | Collection["DataValue"] | None - -PacketLike: TypeAlias = Mapping[str, DataValue] diff --git a/src/orcapod/types/legacy/packets.py b/src/orcapod/types/legacy/packets.py deleted file mode 100644 index 7950d5b..0000000 --- a/src/orcapod/types/legacy/packets.py +++ /dev/null @@ -1,349 +0,0 @@ -from orcapod.types.core import DataValue -from typing import TypeAlias, Any -from collections.abc import Mapping, Collection -from orcapod.types.core import TypeSpec, Tag, TypeHandler -from orcapod.types.legacy.semantic_type_registry import SemanticTypeRegistry -from orcapod.types import schemas -from orcapod.types.typespec_utils import get_typespec_from_dict -import pyarrow as pa - -# A conveniece packet-like type that defines a value that can be -# converted to a packet. It's broader than Packet and a simple mapping -# from string keys to DataValue (e.g., int, float, str) can be regarded -# as PacketLike, allowing for more flexible interfaces. -# Anything that requires Packet-like data but without the strict features -# of a Packet should accept PacketLike. -# One should be careful when using PacketLike as a return type as it does not -# enforce the typespec or source_info, which are important for packet integrity. 
-PacketLike: TypeAlias = Mapping[str, DataValue] - - -class Packet(dict[str, DataValue]): - def __init__( - self, - obj: PacketLike | None = None, - typespec: TypeSpec | None = None, - source_info: dict[str, str | None] | None = None, - ): - if obj is None: - obj = {} - super().__init__(obj) - if typespec is None: - typespec = get_typespec_from_dict(self) - self._typespec = typespec - if source_info is None: - source_info = {} - self._source_info = source_info - - @property - def typespec(self) -> TypeSpec: - # consider returning a copy for immutability - return self._typespec - - @property - def source_info(self) -> dict[str, str | None]: - return {key: self._source_info.get(key, None) for key in self.keys()} - - @source_info.setter - def source_info(self, source_info: Mapping[str, str | None]): - self._source_info = { - key: value for key, value in source_info.items() if value is not None - } - - def get_composite(self) -> PacketLike: - composite = self.copy() - for k, v in self.source_info.items(): - composite[f"_source_info_{k}"] = v - return composite - - def map_keys( - self, mapping: Mapping[str, str], drop_unmapped: bool = False - ) -> "Packet": - """ - Map the keys of the packet using the provided mapping. - - Args: - mapping: A dictionary mapping old keys to new keys. - - Returns: - A new Packet with keys mapped according to the provided mapping. - """ - if drop_unmapped: - new_content = {v: self[k] for k, v in mapping.items() if k in self} - new_typespec = { - v: self.typespec[k] for k, v in mapping.items() if k in self.typespec - } - new_source_info = { - v: self.source_info[k] - for k, v in mapping.items() - if k in self.source_info - } - else: - new_content = {mapping.get(k, k): v for k, v in self.items()} - new_typespec = {mapping.get(k, k): v for k, v in self.typespec.items()} - new_source_info = { - mapping.get(k, k): v for k, v in self.source_info.items() - } - - return Packet(new_content, typespec=new_typespec, source_info=new_source_info) - - def join(self, other: "Packet") -> "Packet": - """ - Join another packet to this one, merging their keys and values. - - Args: - other: Another Packet to join with this one. - - Returns: - A new Packet with keys and values from both packets. - """ - # make sure there is no key collision - if not set(self.keys()).isdisjoint(other.keys()): - raise ValueError( - f"Key collision detected: packets {self} and {other} have overlapping keys" - " and cannot be joined without losing information." - ) - - new_content = {**self, **other} - new_typespec = {**self.typespec, **other.typespec} - new_source_info = {**self.source_info, **other.source_info} - - return Packet(new_content, typespec=new_typespec, source_info=new_source_info) - - -# a batch is a tuple of a tag and a list of packets -Batch: TypeAlias = tuple[Tag, Collection[Packet]] - - -class SemanticPacket(dict[str, Any]): - """ - A packet that conforms to a semantic schema, mapping string keys to values. - - This is used to represent data packets in OrcaPod with semantic types. - - Attributes - ---------- - keys : str - The keys of the packet. - values : Any - The values corresponding to each key. 
- - Examples - -------- - >>> packet = SemanticPacket(name='Alice', age=30) - >>> print(packet) - {'name': 'Alice', 'age': 30} - """ - - def __init__( - self, - *args, - semantic_schema: schemas.SemanticSchema | None = None, - source_info: dict[str, str | None] | None = None, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.schema = semantic_schema - if source_info is None: - source_info = {} - self.source_info = source_info - - def get_composite(self) -> dict[str, Any]: - composite = self.copy() - for k, v in self.source_info.items(): - composite[f"_source_info_{k}"] = v - return composite - - -class PacketConverter: - def __init__( - self, - typespec: TypeSpec, - registry: SemanticTypeRegistry, - include_source_info: bool = True, - ): - self.typespec = typespec - self.registry = registry - - self.semantic_schema = schemas.from_typespec_to_semantic_schema( - typespec, registry - ) - - self.include_source_info = include_source_info - - self.arrow_schema = schemas.from_semantic_schema_to_arrow_schema( - self.semantic_schema, include_source_info=self.include_source_info - ) - - self.key_handlers: dict[str, TypeHandler] = {} - - self.expected_key_set = set(self.typespec.keys()) - - for key, (_, semantic_type) in self.semantic_schema.items(): - if semantic_type is None: - continue - handler = registry.get_handler_by_semantic_type(semantic_type) - if handler is None: - raise ValueError( - f"No handler found for semantic type '{semantic_type}' in key '{key}'" - ) - self.key_handlers[key] = handler - - def _check_key_consistency(self, keys): - """Check if the provided keys match the expected keys.""" - keys_set = set(keys) - if keys_set != self.expected_key_set: - missing_keys = self.expected_key_set - keys_set - extra_keys = keys_set - self.expected_key_set - error_parts = [] - if missing_keys: - error_parts.append(f"Missing keys: {missing_keys}") - if extra_keys: - error_parts.append(f"Extra keys: {extra_keys}") - - raise KeyError(f"Keys don't match expected keys. {'; '.join(error_parts)}") - - def from_python_packet_to_semantic_packet( - self, python_packet: PacketLike - ) -> SemanticPacket: - """Convert a Python packet to a semantic packet. - - Args: - python_packet: Dictionary mapping parameter names to Python values - - Returns: - Packet with values converted to semantic types - - Raises: - KeyError: If packet keys don't match the expected type_info keys - TypeError: If value type doesn't match expected type - ValueError: If conversion fails - """ - # Validate packet keys - semantic_packet = SemanticPacket( - python_packet, - semantic_schema=self.semantic_schema, - source_info=getattr(python_packet, "source_info", None), - ) - self._check_key_consistency(set(semantic_packet.keys())) - - # convert from storage to Python types for semantic types - for key, handler in self.key_handlers.items(): - try: - semantic_packet[key] = handler.python_to_storage(semantic_packet[key]) - except Exception as e: - raise ValueError(f"Failed to convert value for '{key}': {e}") from e - - return semantic_packet - - def from_python_packet_to_arrow_table(self, python_packet: PacketLike) -> pa.Table: - """Convert a Python packet to an Arrow table. 
- - Args: - python_packet: Dictionary mapping parameter names to Python values - - Returns: - Arrow table representation of the packet - """ - semantic_packet = self.from_python_packet_to_semantic_packet(python_packet) - return self.from_semantic_packet_to_arrow_table(semantic_packet) - - def from_semantic_packet_to_arrow_table( - self, semantic_packet: SemanticPacket - ) -> pa.Table: - """Convert a semantic packet to an Arrow table. - - Args: - semantic_packet: SemanticPacket with values to convert - - Returns: - Arrow table representation of the packet - """ - if self.include_source_info: - return pa.Table.from_pylist( - [semantic_packet.get_composite()], schema=self.arrow_schema - ) - else: - return pa.Table.from_pylist([semantic_packet], schema=self.arrow_schema) - - def from_arrow_table_to_semantic_packets( - self, arrow_table: pa.Table - ) -> Collection[SemanticPacket]: - """Convert an Arrow table to a semantic packet. - - Args: - arrow_table: Arrow table representation of the packet - - Returns: - SemanticPacket with values converted from Arrow types - """ - # TODO: this is a crude check, implement more robust one to check that - # schema matches what's expected - if not arrow_table.schema.equals(self.arrow_schema): - raise ValueError("Arrow table schema does not match expected schema") - - semantic_packets_contents = arrow_table.to_pylist() - - semantic_packets = [] - for all_packet_content in semantic_packets_contents: - packet_content = { - k: v - for k, v in all_packet_content.items() - if k in self.expected_key_set - } - source_info = { - k.removeprefix("_source_info_"): v - for k, v in all_packet_content.items() - if k.startswith("_source_info_") - } - semantic_packets.append( - SemanticPacket( - packet_content, - semantic_schema=self.semantic_schema, - source_info=source_info, - ) - ) - - return semantic_packets - - def from_semantic_packet_to_python_packet( - self, semantic_packet: SemanticPacket - ) -> Packet: - """Convert a semantic packet to a Python packet. - - Args: - semantic_packet: SemanticPacket with values to convert - - Returns: - Python packet representation of the semantic packet - """ - # Validate packet keys - python_packet = Packet( - semantic_packet, - typespec=self.typespec, - source_info=semantic_packet.source_info, - ) - packet_keys = set(python_packet.keys()) - self._check_key_consistency(packet_keys) - - for key, handler in self.key_handlers.items(): - try: - python_packet[key] = handler.storage_to_python(python_packet[key]) - except Exception as e: - raise ValueError(f"Failed to convert value for '{key}': {e}") from e - - return python_packet - - def from_arrow_table_to_python_packets(self, arrow_table: pa.Table) -> list[Packet]: - """Convert an Arrow table to a list of Python packets. 
- - Args: - arrow_table: Arrow table representation of the packets - - Returns: - List of Python packets converted from the Arrow table - """ - semantic_packets = self.from_arrow_table_to_semantic_packets(arrow_table) - return [ - self.from_semantic_packet_to_python_packet(sp) for sp in semantic_packets - ] diff --git a/src/orcapod/types/legacy/semantic_type_handlers.py b/src/orcapod/types/legacy/semantic_type_handlers.py deleted file mode 100644 index b3bc70c..0000000 --- a/src/orcapod/types/legacy/semantic_type_handlers.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Any -import pyarrow as pa -from pathlib import Path -from uuid import UUID -from decimal import Decimal -from datetime import datetime, date, time - - -class PathHandler: - """Handler for pathlib.Path objects, stored as strings.""" - - def python_type(self) -> type: - return Path - - def storage_type(self) -> type: - return str - - def python_to_storage(self, value: Path) -> str: - return str(value) - - def storage_to_python(self, value: str) -> Path | None: - return Path(value) if value else None - - -class UUIDHandler: - """Handler for UUID objects, stored as strings.""" - - def python_type(self) -> type: - return UUID - - def storage_type(self) -> type: - return str - - def python_to_storage(self, value: UUID) -> str: - return str(value) - - def storage_to_python(self, value: str) -> UUID | None: - return UUID(value) if value else None - - -class DecimalHandler: - """Handler for Decimal objects, stored as strings.""" - - def python_type(self) -> type: - return Decimal - - def storage_type(self) -> type: - return str - - def python_to_storage(self, value: Decimal) -> str: - return str(value) - - def storage_to_python(self, value: str) -> Decimal | None: - return Decimal(value) if value else None - - -class SimpleMappingHandler: - """Handler for basic types that map directly to Arrow.""" - - def __init__(self, python_type: type): - self._python_type = python_type - - def python_type(self) -> type: - return self._python_type - - def storage_type(self) -> type: - return self._python_type - - def python_to_storage(self, value: Any) -> Any: - return value # Direct mapping - - def storage_to_python(self, value: Any) -> Any: - return value # Direct mapping - - -class DateTimeHandler: - """Handler for datetime objects.""" - - def python_type(self) -> type: - return datetime - - def storage_type(self) -> type: - return datetime - - def python_to_storage(self, value: datetime | date | time) -> Any: - if isinstance(value, datetime): - return value - elif isinstance(value, date): - return datetime.combine(value, time.min) - elif isinstance(value, time): - return datetime.combine(date.today(), value) - - def storage_to_python(self, value: datetime) -> datetime: - return value # Could add logic to restore original type if needed diff --git a/src/orcapod/types/legacy/semantic_type_registry.py b/src/orcapod/types/legacy/semantic_type_registry.py deleted file mode 100644 index 6934bae..0000000 --- a/src/orcapod/types/legacy/semantic_type_registry.py +++ /dev/null @@ -1,461 +0,0 @@ -import logging -import pyarrow as pa -from ..core import TypeHandler -from dataclasses import dataclass - -# This mapping is expected to be stable -# Be sure to test this assumption holds true -DEFAULT_ARROW_TYPE_LUT = { - int: pa.int64(), - float: pa.float64(), - str: pa.string(), - bool: pa.bool_(), -} - -logger = logging.getLogger(__name__) - - -# TODO: reconsider the need for this dataclass as its information is superfluous -# to the registration of 
the handler into the registry. -@dataclass -class TypeInfo: - python_type: type - storage_type: type - semantic_type: str | None # name under which the type is registered - handler: "TypeHandler" - - -class SemanticTypeRegistry: - """Registry that manages type handlers with semantic type names.""" - - def __init__(self): - self._handlers: dict[ - type, tuple[TypeHandler, str] - ] = {} # PythonType -> (Handler, semantic_name) - self._semantic_handlers: dict[str, TypeHandler] = {} # semantic_name -> Handler - self._semantic_to_python_lut: dict[ - str, type - ] = {} # semantic_name -> Python type - - def register( - self, - semantic_type: str, - handler: TypeHandler, - ): - """Register a handler with a semantic type name. - - Args: - semantic_name: Identifier for this semantic type (e.g., 'path', 'uuid') - handler: The type handler instance - explicit_types: Optional override of types to register for (if different from handler's supported_types) - override: If True, allow overriding existing registration for the same semantic name and Python type(s) - """ - # Determine which types to register for - - python_type = handler.python_type() - - # Register handler for each type - if python_type in self._handlers: - existing_semantic = self._handlers[python_type][1] - # TODO: handle overlapping registration more gracefully - raise ValueError( - f"Type {python_type} already registered with semantic type '{existing_semantic}'" - ) - - # Register by semantic name - if semantic_type in self._semantic_handlers: - raise ValueError(f"Semantic type '{semantic_type}' already registered") - - self._handlers[python_type] = (handler, semantic_type) - self._semantic_handlers[semantic_type] = handler - self._semantic_to_python_lut[semantic_type] = python_type - - def get_python_type(self, semantic_type: str) -> type | None: - """Get Python type for a semantic type.""" - return self._semantic_to_python_lut.get(semantic_type) - - def lookup_handler_info(self, python_type: type) -> tuple[TypeHandler, str] | None: - """Lookup handler info for a Python type.""" - for registered_type, (handler, semantic_type) in self._handlers.items(): - if issubclass(python_type, registered_type): - return (handler, semantic_type) - return None - - def get_semantic_type(self, python_type: type) -> str | None: - """Get semantic type for a Python type.""" - handler_info = self.lookup_handler_info(python_type) - return handler_info[1] if handler_info else None - - def get_handler(self, python_type: type) -> TypeHandler | None: - """Get handler for a Python type.""" - handler_info = self.lookup_handler_info(python_type) - return handler_info[0] if handler_info else None - - def get_handler_by_semantic_type(self, semantic_type: str) -> TypeHandler | None: - """Get handler by semantic type.""" - return self._semantic_handlers.get(semantic_type) - - def get_type_info(self, python_type: type) -> TypeInfo | None: - """Get TypeInfo for a Python type.""" - handler = self.get_handler(python_type) - if handler is None: - return None - semantic_type = self.get_semantic_type(python_type) - return TypeInfo( - python_type=python_type, - storage_type=handler.storage_type(), - semantic_type=semantic_type, - handler=handler, - ) - - def __contains__(self, python_type: type) -> bool: - """Check if a Python type is registered.""" - for registered_type in self._handlers: - if issubclass(python_type, registered_type): - return True - return False - - -# Below is a collection of functions that handles converting between various aspects of Python packets and 
Arrow tables. -# Here for convenience, any Python dictionary with str keys and supported Python values are referred to as a packet. - - -# Conversions are: -# python packet <-> storage packet <-> arrow table -# python typespec <-> storage typespec <-> arrow schema -# -# python packet <-> storage packet requires the use of SemanticTypeRegistry -# conversion between storage packet <-> arrow table requires info about semantic_type - - -# # Storage packet <-> Arrow table - -# def stroage_typespec_to_arrow_schema(storage_typespec:TypeSpec, semantic_type_info: dict[str, str]|None = None) -> pa.Schema: -# """Convert storage typespec to Arrow Schema with semantic_type metadata.""" -# """Convert storage typespec to PyArrow Schema with semantic_type metadata.""" -# if semantic_type_info is None: -# semantic_type_info = {} - -# fields = [] -# for field_name, field_type in storage_typespec.items(): -# arrow_type = python_to_pyarrow_type(field_type) -# semantic_type = semantic_type_info.get(field_name, None) -# field_metadata = {"semantic_type": semantic_type} if semantic_type else {} -# fields.append(pa.field(field_name, arrow_type, metadata=field_metadata)) -# return pa.schema(fields) - -# def arrow_schema_to_storage_typespec(schema: pa.Schema) -> tuple[TypeSpec, dict[str, str]|None]: -# """Convert Arrow Schema to storage typespec and semantic type metadata.""" -# typespec = {} -# semantic_type_info = {} - -# for field in schema: -# field_type = field.type -# typespec[field.name] = field_type.to_pandas_dtype() # Convert Arrow type to Pandas dtype -# if field.metadata and b"semantic_type" in field.metadata: -# semantic_type_info[field.name] = field.metadata[b"semantic_type"].decode("utf-8") - -# return typespec, semantic_type_info - - -# def storage_packet_to_arrow_table( -# storage_packet: PacketLike, -# typespec: TypeSpec | None = None, -# semantic_type_info: dict[str, str] | None = None, - - -# # TypeSpec + TypeRegistry + ArrowLUT -> Arrow Schema (annotated with semantic_type) - -# # - - -# # TypeSpec <-> Arrow Schema - -# def schema_from_typespec(typespec: TypeSpec, registry: SemanticTypeRegistry, metadata_info: dict | None = None) -> pa.Schema: -# """Convert TypeSpec to PyArrow Schema.""" -# if metadata_info is None: -# metadata_info = {} - -# fields = [] -# for field_name, field_type in typespec.items(): -# type_info = registry.get_type_info(field_type) -# if type_info is None: -# raise ValueError(f"No type info registered for {field_type}") -# fields.append(pa.field(field_name, type_info.arrow_type, metadata={ -# "semantic_type": type_info.semantic_type -# })) -# return pa.schema(fields) - -# def create_schema_from_typespec( -# typespec: TypeSpec, -# registry: SemanticTypeRegistry, -# metadata_info: dict | None = None, -# arrow_type_lut: dict[type, pa.DataType] | None = None, -# ) -> tuple[list[tuple[str, TypeHandler]], pa.Schema]: -# if metadata_info is None: -# metadata_info = {} -# if arrow_type_lut is None: -# arrow_type_lut = DEFAULT_ARROW_TYPE_LUT - -# keys_with_handlers: list[tuple[str, TypeHandler]] = [] -# schema_fields = [] -# for key, python_type in typespec.items(): -# type_info = registry.get_type_info(python_type) - -# field_metadata = {} -# if type_info and type_info.semantic_type: -# field_metadata["semantic_type"] = type_info.semantic_type -# keys_with_handlers.append((key, type_info.handler)) -# arrow_type = type_info.arrow_type -# else: -# arrow_type = arrow_type_lut.get(python_type) -# if arrow_type is None: -# raise ValueError( -# f"Direct support for Python type 
{python_type} is not provided. Register a handler to work with {python_type}" -# ) - -# schema_fields.append(pa.field(key, arrow_type, metadata=field_metadata)) -# return keys_with_handlers, pa.schema(schema_fields) - - -# def arrow_table_to_packets( -# table: pa.Table, -# registry: SemanticTypeRegistry, -# ) -> list[Packet]: -# """Convert Arrow table to packet with field metadata. - -# Args: -# packet: Dictionary mapping parameter names to Python values - -# Returns: -# PyArrow Table with the packet data as a single row -# """ -# packets: list[Packet] = [] - -# # prepare converter for each field - -# def no_op(x) -> Any: -# return x - -# converter_lut = {} -# for field in table.schema: -# if field.metadata and b"semantic_type" in field.metadata: -# semantic_type = field.metadata[b"semantic_type"].decode("utf-8") -# if semantic_type: -# handler = registry.get_handler_by_semantic_name(semantic_type) -# if handler is None: -# raise ValueError( -# f"No handler registered for semantic type '{semantic_type}'" -# ) -# converter_lut[field.name] = handler.storage_to_python - -# # Create packets from the Arrow table -# # TODO: make this more efficient -# for row in range(table.num_rows): -# packet: Packet = Packet() -# for field in table.schema: -# value = table.column(field.name)[row].as_py() -# packet[field.name] = converter_lut.get(field.name, no_op)(value) -# packets.append(packet) - -# return packets - - -# def create_arrow_table_with_meta( -# storage_packet: dict[str, Any], type_info: dict[str, TypeInfo] -# ): -# """Create an Arrow table with metadata from a storage packet. - -# Args: -# storage_packet: Dictionary with values in storage format -# type_info: Dictionary mapping parameter names to TypeInfo objects - -# Returns: -# PyArrow Table with metadata -# """ -# schema_fields = [] -# for key, type_info_obj in type_info.items(): -# field_metadata = {} -# if type_info_obj.semantic_type: -# field_metadata["semantic_type"] = type_info_obj.semantic_type - -# field = pa.field(key, type_info_obj.arrow_type, metadata=field_metadata) -# schema_fields.append(field) - -# schema = pa.schema(schema_fields) - -# arrays = [] -# for field in schema: -# value = storage_packet[field.name] -# array = pa.array([value], type=field.type) -# arrays.append(array) - -# return pa.Table.from_arrays(arrays, schema=schema) - - -# def retrieve_storage_packet_from_arrow_with_meta( -# arrow_table: pa.Table, -# ) -> dict[str, Any]: -# """Retrieve storage packet from Arrow table with metadata. 
- -# Args: -# arrow_table: PyArrow Table with metadata - -# Returns: -# Dictionary representing the storage packet -# """ -# storage_packet = {} -# for field in arrow_table.schema: -# # Extract value from Arrow array -# array = arrow_table.column(field.name) -# if array.num_chunks > 0: -# value = array.chunk(0).as_py()[0] # Get first value -# else: -# value = None # Handle empty arrays - -# storage_packet[field.name] = value - -# return storage_packet - -# def typespec_to_schema_with_metadata(typespec: TypeSpec, field_metadata: dict|None = None) -> pa.Schema: -# """Convert TypeSpec to PyArrow Schema""" -# fields = [] -# for field_name, field_type in typespec.items(): -# arrow_type = python_to_pyarrow_type(field_type) -# fields.append(pa.field(field_name, arrow_type)) -# return pa.schema(fields) - -# def python_to_pyarrow_type(python_type: type, strict:bool=True) -> pa.DataType: -# """Convert Python type (including generics) to PyArrow type""" -# # For anywhere we need to store str value, we use large_string as is done in Polars - -# # Handle basic types first -# basic_mapping = { -# int: pa.int64(), -# float: pa.float64(), -# str: pa.large_string(), -# bool: pa.bool_(), -# bytes: pa.binary(), -# } - -# if python_type in basic_mapping: -# return basic_mapping[python_type] - -# # Handle generic types -# origin = get_origin(python_type) -# args = get_args(python_type) - -# if origin is list: -# # Handle list[T] -# if args: -# element_type = python_to_pyarrow_type(args[0]) -# return pa.list_(element_type) -# else: -# return pa.list_(pa.large_string()) # default to list of strings - -# elif origin is dict: -# # Handle dict[K, V] - PyArrow uses map type -# if len(args) == 2: -# key_type = python_to_pyarrow_type(args[0]) -# value_type = python_to_pyarrow_type(args[1]) -# return pa.map_(key_type, value_type) -# else: -# # Otherwise default to using long string -# return pa.map_(pa.large_string(), pa.large_string()) - -# elif origin is UnionType: -# # Handle Optional[T] (Union[T, None]) -# if len(args) == 2 and type(None) in args: -# non_none_type = args[0] if args[1] is type(None) else args[1] -# return python_to_pyarrow_type(non_none_type) - -# # Default fallback -# if not strict: -# logger.warning(f"Unsupported type {python_type}, defaulting to large_string") -# return pa.large_string() -# else: -# raise TypeError(f"Unsupported type {python_type} for PyArrow conversion. " -# "Set strict=False to allow fallback to large_string.") - -# def arrow_to_dicts(table: pa.Table) -> list[dict[str, Any]]: -# """ -# Convert Arrow table to dictionary or list of dictionaries. -# Returns a list of dictionaries (one per row) with column names as keys. -# Args: -# table: PyArrow Table to convert -# Returns: -# A list of dictionaries for multi-row tables. -# """ -# if len(table) == 0: -# return [] - -# # Multiple rows: return list of dicts (one per row) -# return [ -# {col_name: table.column(col_name)[i].as_py() for col_name in table.column_names} -# for i in range(len(table)) -# ] - -# def get_metadata_from_schema( -# schema: pa.Schema, metadata_field: bytes -# ) -> dict[str, str]: -# """ -# Extract metadata from Arrow schema fields. Metadata value will be utf-8 decoded. 
-# Args: -# schema: PyArrow Schema to extract metadata from -# metadata_field: Metadata field to extract (e.g., b'semantic_type') -# Returns: -# Dictionary mapping field names to their metadata values -# """ -# metadata = {} -# for field in schema: -# if field.metadata and metadata_field in field.metadata: -# metadata[field.name] = field.metadata[metadata_field].decode("utf-8") -# return metadata - -# def dict_to_arrow_table_with_metadata(data: dict, data_type_info: TypeSpec | None = None, metadata: dict | None = None): -# """ -# Convert a tag dictionary to PyArrow table with metadata on each column. - -# Args: -# tag: Dictionary with string keys and any Python data type values -# metadata_key: The metadata key to add to each column -# metadata_value: The metadata value to indicate this column came from tag -# """ -# if metadata is None: -# metadata = {} - -# if field_types is None: -# # First create the table to infer types -# temp_table = pa.Table.from_pylist([data]) - -# # Create new fields with metadata -# fields_with_metadata = [] -# for field in temp_table.schema: -# # Add metadata to each field -# field_metadata = metadata -# new_field = pa.field( -# field.name, field.type, nullable=field.nullable, metadata=field_metadata -# ) -# fields_with_metadata.append(new_field) - -# # Create schema with metadata -# schema_with_metadata = pa.schema(fields_with_metadata) - -# # Create the final table with the metadata-enriched schema -# table = pa.Table.from_pylist([tag], schema=schema_with_metadata) - -# return table - - -# # def get_columns_with_metadata( -# # df: pl.DataFrame, key: str, value: str | None = None -# # ) -> list[str]: -# # """Get column names with specific metadata using list comprehension. If value is given, only -# # columns matching that specific value for the desginated metadata key will be returned. -# # Otherwise, all columns that contains the key as metadata will be returned regardless of the value""" -# # return [ -# # col_name -# # for col_name, dtype in df.schema.items() -# # if hasattr(dtype, "metadata") -# # and (value is None or getattr(dtype, "metadata") == value) -# # ] diff --git a/src/orcapod/types/typespec_utils.py b/src/orcapod/types/typespec_utils.py deleted file mode 100644 index 609a6a0..0000000 --- a/src/orcapod/types/typespec_utils.py +++ /dev/null @@ -1,301 +0,0 @@ -# Library of functions for working with TypeSpecs and for extracting TypeSpecs from a function's signature - -from collections.abc import Callable, Collection, Sequence, Mapping -from typing import get_origin, get_args, Any -from orcapod.types.core import TypeSpec -import inspect -import logging - - -logger = logging.getLogger(__name__) - - -def verify_against_typespec(packet: dict, typespec: TypeSpec) -> bool: - """Verify that the dictionary's types match the expected types in the typespec.""" - from beartype.door import is_bearable - - # verify that packet contains no keys not in typespec - if set(packet.keys()) - set(typespec.keys()): - logger.warning( - f"Packet contains keys not in typespec: {set(packet.keys()) - set(typespec.keys())}. " - ) - return False - for key, type_info in typespec.items(): - if key not in packet: - logger.warning( - f"Key '{key}' not found in packet. Assuming None but this behavior may change in the future" - ) - if not is_bearable(packet.get(key), type_info): - logger.warning( - f"Type mismatch for key '{key}': expected {type_info}, got {packet.get(key)}." 
-            )
-            return False
-    return True
-
-
-# TODO: is_subhint does not handle invariance properly
-# so when working with mutable types, we have to make sure to perform deep copy
-def check_typespec_compatibility(
-    incoming_types: TypeSpec, receiving_types: TypeSpec
-) -> bool:
-    from beartype.door import is_subhint
-
-    for key, type_info in incoming_types.items():
-        if key not in receiving_types:
-            logger.warning(f"Key '{key}' not found in parameter types.")
-            return False
-        if not is_subhint(type_info, receiving_types[key]):
-            logger.warning(
-                f"Type mismatch for key '{key}': expected {receiving_types[key]}, got {type_info}."
-            )
-            return False
-    return True
-
-
-def extract_function_typespecs(
-    func: Callable,
-    output_keys: Collection[str],
-    input_typespec: TypeSpec | None = None,
-    output_typespec: TypeSpec | Sequence[type] | None = None,
-) -> tuple[TypeSpec, TypeSpec]:
-    """
-    Extract input and output data types from a function signature.
-
-    This function analyzes a function's signature to determine the types of its parameters
-    and return values. It combines information from type annotations, user-provided type
-    specifications, and return key mappings to produce complete type specifications.
-
-    Args:
-        func: The function to analyze for type information.
-        output_keys: Collection of string keys that will be used to map the function's
-            return values. For functions returning a single value, provide a single key.
-            For functions returning multiple values (tuple/list), provide keys matching
-            the number of return items.
-        input_types: Optional mapping of parameter names to their types. If provided,
-            these types override any type annotations in the function signature for the
-            specified parameters. If a parameter is not in this mapping and has no
-            annotation, an error is raised.
-        output_types: Optional type specification for return values. Can be either:
-            - A dict mapping output keys to types (TypeSpec)
-            - A sequence of types that will be mapped to output_keys in order
-            These types override any inferred types from the function's return annotation.
-
-    Returns:
-        A tuple containing:
-        - input_types_dict: Mapping of parameter names to their inferred/specified types
-        - output_types_dict: Mapping of output keys to their inferred/specified types
-
-    Raises:
-        ValueError: In various scenarios:
-        - Parameter has no type annotation and is not in input_types
-        - Function has return annotation but no output_keys specified
-        - Function has explicit None return but non-empty output_keys provided
-        - Multiple output_keys specified but return annotation is not a sequence type
-        - Return annotation is a sequence type but doesn't specify item types
-        - Number of types in return annotation doesn't match number of output_keys
-        - Output types sequence length doesn't match output_keys length
-        - Output key not specified in output_types and has no type annotation
-
-    Examples:
-        >>> def add(x: int, y: int) -> int:
-        ...     return x + y
-        >>> input_types, output_types = extract_function_data_types(add, ['result'])
-        >>> input_types
-        {'x': <class 'int'>, 'y': <class 'int'>}
-        >>> output_types
-        {'result': <class 'int'>}
-
-        >>> def process(data: str) -> tuple[int, str]:
-        ...     return len(data), data.upper()
-        >>> input_types, output_types = extract_function_data_types(
-        ...     process, ['length', 'upper_data']
-        ... )
-        >>> input_types
-        {'data': <class 'str'>}
-        >>> output_types
-        {'length': <class 'int'>, 'upper_data': <class 'str'>}
-
-        >>> def legacy_func(x, y): # No annotations
-        ...     return x + y
-        >>> input_types, output_types = extract_function_data_types(
-        ...     legacy_func, ['sum'],
-        ...     input_types={'x': int, 'y': int},
-        ...     output_types={'sum': int}
-        ... )
-        >>> input_types
-        {'x': <class 'int'>, 'y': <class 'int'>}
-        >>> output_types
-        {'sum': <class 'int'>}
-
-        >>> def multi_return(data: list) -> tuple[int, float, str]:
-        ...     return len(data), sum(data), str(data)
-        >>> input_types, output_types = extract_function_data_types(
-        ...     multi_return, ['count', 'total', 'repr'],
-        ...     output_types=[int, float, str] # Override with sequence
-        ... )
-        >>> output_types
-        {'count': <class 'int'>, 'total': <class 'float'>, 'repr': <class 'str'>}
-    """
-    verified_output_types: TypeSpec = {}
-    if output_typespec is not None:
-        if isinstance(output_typespec, dict):
-            verified_output_types = output_typespec
-        elif isinstance(output_typespec, Sequence):
-            # If output_types is a collection, convert it to a dict with keys from return_keys
-            if len(output_typespec) != len(output_keys):
-                raise ValueError(
-                    f"Output types collection length {len(output_typespec)} does not match return keys length {len(output_keys)}."
-                )
-            verified_output_types = {k: v for k, v in zip(output_keys, output_typespec)}
-
-    signature = inspect.signature(func)
-
-    param_info: TypeSpec = {}
-    for name, param in signature.parameters.items():
-        if input_typespec and name in input_typespec:
-            param_info[name] = input_typespec[name]
-        else:
-            # check if the parameter has annotation
-            if param.annotation is not inspect.Signature.empty:
-                param_info[name] = param.annotation
-            else:
-                raise ValueError(
-                    f"Parameter '{name}' has no type annotation and is not specified in input_types."
-                )
-
-    return_annot = signature.return_annotation
-    inferred_output_types: TypeSpec = {}
-    if return_annot is not inspect.Signature.empty and return_annot is not None:
-        output_item_types = []
-        if len(output_keys) == 0:
-            raise ValueError(
-                "Function has a return type annotation, but no return keys were specified."
-            )
-        elif len(output_keys) == 1:
-            # if only one return key, the entire annotation is inferred as the return type
-            output_item_types = [return_annot]
-        elif (get_origin(return_annot) or return_annot) in (tuple, list, Sequence):
-            if get_origin(return_annot) is None:
-                # right type was specified but did not specified the type of items
-                raise ValueError(
-                    f"Function return type annotation {return_annot} is a Sequence type but does not specify item types."
-                )
-            output_item_types = get_args(return_annot)
-            if len(output_item_types) != len(output_keys):
-                raise ValueError(
-                    f"Function return type annotation {return_annot} has {len(output_item_types)} items, "
-                    f"but output_keys has {len(output_keys)} items."
-                )
-        else:
-            raise ValueError(
-                f"Multiple return keys were specified but return type annotation {return_annot} is not a sequence type (list, tuple, Collection)."
-            )
-        for key, type_annot in zip(output_keys, output_item_types):
-            inferred_output_types[key] = type_annot
-    elif return_annot is None:
-        if len(output_keys) != 0:
-            raise ValueError(
-                f"Function provides explicit return type annotation as None, but return keys of length {len(output_keys)} were specified."
- ) - else: - inferred_output_types = {k: inspect.Signature.empty for k in output_keys} - - # TODO: simplify the handling here -- technically all keys should already be in return_types - for key in output_keys: - if key in verified_output_types: - inferred_output_types[key] = verified_output_types[key] - elif ( - key not in inferred_output_types - or inferred_output_types[key] is inspect.Signature.empty - ): - raise ValueError( - f"Type for return item '{key}' is not specified in output_types and has no type annotation in function signature." - ) - return param_info, inferred_output_types - - -def get_typespec_from_dict( - data: Mapping, typespec: TypeSpec | None = None, default=str -) -> TypeSpec: - """ - Returns a TypeSpec for the given dictionary. - The TypeSpec is a mapping from field name to Python type. If typespec is provided, then - it is used as a base when inferring types for the fields in dict - """ - if typespec is None: - typespec = {} - return { - key: typespec.get(key, type(value) if value is not None else default) - for key, value in data.items() - } - - -def get_compatible_type(type1: Any, type2: Any) -> Any: - if type1 is type2: - return type1 - if issubclass(type1, type2): - return type2 - if issubclass(type2, type1): - return type1 - raise TypeError(f"Types {type1} and {type2} are not compatible") - - -def union_typespecs(*typespecs: TypeSpec) -> TypeSpec: - # Merge the two TypeSpecs but raise an error if conflicts in types are found - merged = dict(typespecs[0]) - for typespec in typespecs[1:]: - for key, right_type in typespec.items(): - merged[key] = ( - get_compatible_type(merged[key], right_type) - if key in merged - else right_type - ) - return merged - - -def intersection_typespecs(*typespecs: TypeSpec) -> TypeSpec: - """ - Returns the intersection of all TypeSpecs, only returning keys that are present in all typespecs. - If a key is present in both TypeSpecs, the type must be the same. - """ - - # Find common keys and ensure types match - - common_keys = set(typespecs[0].keys()) - for typespec in typespecs[1:]: - common_keys.intersection_update(typespec.keys()) - - intersection = {k: typespecs[0][k] for k in common_keys} - for typespec in typespecs[1:]: - for key in common_keys: - try: - intersection[key] = get_compatible_type( - intersection[key], typespec[key] - ) - except TypeError: - # If types are not compatible, raise an error - raise TypeError( - f"Type conflict for key '{key}': {intersection[key]} vs {typespec[key]}" - ) - return intersection - - -# def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: -# """ -# Returns the intersection of two TypeSpecs, only returning keys that are present in both. -# If a key is present in both TypeSpecs, the type must be the same. -# """ - -# # Find common keys and ensure types match -# common_keys = set(left.keys()).intersection(set(right.keys())) -# intersection = {} -# for key in common_keys: -# try: -# intersection[key] = get_compatible_type(left[key], right[key]) -# except TypeError: -# # If types are not compatible, raise an error -# raise TypeError( -# f"Type conflict for key '{key}': {left[key]} vs {right[key]}" -# ) - -# return intersection From 468266dabc97fd899b5ee84813ad187bdecb4644 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 26 Aug 2025 14:17:27 -0700 Subject: [PATCH 201/224] refactor: use updated types module path --- src/orcapod/data/datagrams/arrow_datagram.py | 3 +- .../data/datagrams/arrow_tag_packet.py | 3 +- src/orcapod/data/datagrams/base.py | 3 +- src/orcapod/data/datagrams/dict_datagram.py | 2 +- src/orcapod/data/datagrams/dict_tag_packet.py | 2 +- src/orcapod/data/operators/join.py | 10 +- src/orcapod/data/operators/semijoin.py | 29 +- src/orcapod/data/pods.py | 16 +- src/orcapod/types.py | 34 ++ src/orcapod/utils/types_utils.py | 301 ++++++++++++++++++ 10 files changed, 375 insertions(+), 28 deletions(-) create mode 100644 src/orcapod/types.py create mode 100644 src/orcapod/utils/types_utils.py diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index a7d0cb8..15c2655 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -7,8 +7,7 @@ from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.data.system_constants import constants -from orcapod.types import TypeSpec -from orcapod.types.core import DataValue +from orcapod.types import DataValue, TypeSpec from orcapod.protocols.hashing_protocols import ContentHash from orcapod.utils import arrow_utils diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index 88e848e..e2b0aec 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -8,9 +8,8 @@ from orcapod.data.system_constants import constants from orcapod import contexts from orcapod.semantic_types import infer_schema_from_pylist_data -from orcapod.types import TypeSpec -from orcapod.types.core import DataValue +from orcapod.types import DataValue, TypeSpec from orcapod.utils import arrow_utils from orcapod.data.datagrams.arrow_datagram import ArrowDatagram diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index f476cd9..844685a 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -26,8 +26,7 @@ import pyarrow as pa -from orcapod.types import TypeSpec -from orcapod.types.core import DataValue +from orcapod.types import DataValue, TypeSpec logger = logging.getLogger(__name__) diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 9088537..77da019 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -7,7 +7,7 @@ from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.semantic_types import infer_schema_from_pylist_data -from orcapod.types.core import DataValue +from orcapod.types import DataValue from orcapod.utils import arrow_utils from orcapod.protocols.hashing_protocols import ContentHash diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index e71dbac..11b738c 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -7,9 +7,9 @@ from orcapod.data.system_constants import constants from orcapod import contexts from orcapod.data.datagrams.dict_datagram import DictDatagram -from orcapod.types.core import DataValue from orcapod.utils import arrow_utils from orcapod.semantic_types import infer_schema_from_pylist_data +from orcapod.types import DataValue logger = logging.getLogger(__name__) diff 
--git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py index ee4b652..1090d25 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/data/operators/join.py @@ -1,7 +1,7 @@ from orcapod.protocols import data_protocols as dp from orcapod.data.streams import TableStream from orcapod.types import TypeSpec -from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs +from orcapod.utils import types_utils from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from collections.abc import Collection @@ -48,11 +48,13 @@ def op_output_types( other_tag_typespec, other_packet_typespec = other_stream.types( include_system_tags=include_system_tags ) - tag_typespec = union_typespecs(tag_typespec, other_tag_typespec) - intersection_packet_typespec = intersection_typespecs( + tag_typespec = types_utils.union_typespecs(tag_typespec, other_tag_typespec) + intersection_packet_typespec = types_utils.intersection_typespecs( + packet_typespec, other_packet_typespec + ) + packet_typespec = types_utils.union_typespecs( packet_typespec, other_packet_typespec ) - packet_typespec = union_typespecs(packet_typespec, other_packet_typespec) if intersection_packet_typespec: raise InputValidationError( f"Packets should not have overlapping keys, but {packet_typespec.keys()} found in {stream} and {other_stream}." diff --git a/src/orcapod/data/operators/semijoin.py b/src/orcapod/data/operators/semijoin.py index 9a10eec..604969d 100644 --- a/src/orcapod/data/operators/semijoin.py +++ b/src/orcapod/data/operators/semijoin.py @@ -1,7 +1,7 @@ from orcapod.protocols import data_protocols as dp from orcapod.data.streams import TableStream +from orcapod.utils import types_utils from orcapod.types import TypeSpec -from orcapod.types.typespec_utils import union_typespecs, intersection_typespecs from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from orcapod.errors import InputValidationError @@ -59,11 +59,17 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre right_tag_typespec, right_packet_typespec = right_stream.types() # Find overlapping columns across all columns (tags + packets) - left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) - right_all_typespec = union_typespecs(right_tag_typespec, right_packet_typespec) + left_all_typespec = types_utils.union_typespecs( + left_tag_typespec, left_packet_typespec + ) + right_all_typespec = types_utils.union_typespecs( + right_tag_typespec, right_packet_typespec + ) common_keys = tuple( - intersection_typespecs(left_all_typespec, right_all_typespec).keys() + types_utils.intersection_typespecs( + left_all_typespec, right_all_typespec + ).keys() ) # If no overlapping columns, return the left stream unmodified @@ -91,14 +97,17 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre ) def op_output_types( - self, left_stream: dp.Stream, right_stream: dp.Stream + self, + left_stream: dp.Stream, + right_stream: dp.Stream, + include_system_tags: bool = False, ) -> tuple[TypeSpec, TypeSpec]: """ Returns the output types for the semi-join operation. The output preserves the exact schema of the left stream. 
""" # Semi-join preserves the left stream's schema exactly - return left_stream.types() + return left_stream.types(include_system_tags=include_system_tags) def op_validate_inputs( self, left_stream: dp.Stream, right_stream: dp.Stream @@ -112,13 +121,15 @@ def op_validate_inputs( right_tag_typespec, right_packet_typespec = right_stream.types() # Check that overlapping columns have compatible types across all columns - left_all_typespec = union_typespecs(left_tag_typespec, left_packet_typespec) - right_all_typespec = union_typespecs( + left_all_typespec = types_utils.union_typespecs( + left_tag_typespec, left_packet_typespec + ) + right_all_typespec = types_utils.union_typespecs( right_tag_typespec, right_packet_typespec ) # intersection_typespecs will raise an error if types are incompatible - intersection_typespecs(left_all_typespec, right_all_typespec) + types_utils.intersection_typespecs(left_all_typespec, right_all_typespec) except Exception as e: raise InputValidationError( diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 144836c..0c9e7d5 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -18,7 +18,7 @@ from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore from orcapod.types import DataValue, TypeSpec -from orcapod.types import typespec_utils as tsutils +from orcapod.utils import types_utils from orcapod.utils.lazy_module import LazyModule from orcapod.hashing.hash_utils import get_function_signature, get_function_components import hashlib @@ -175,7 +175,7 @@ def validate_inputs(self, *streams: dp.Stream) -> None: ) input_stream = streams[0] _, incoming_packet_types = input_stream.types() - if not tsutils.check_typespec_compatibility( + if not types_utils.check_typespec_compatibility( incoming_packet_types, self.input_packet_types() ): # TODO: use custom exception type for better error handling @@ -302,11 +302,13 @@ def __init__( super().__init__(label=label or self.function_name, version=version, **kwargs) # extract input and output types from the function signature - input_packet_types, output_packet_types = tsutils.extract_function_typespecs( - self.function, - self.output_keys, - input_typespec=input_python_schema, - output_typespec=output_python_schema, + input_packet_types, output_packet_types = ( + types_utils.extract_function_typespecs( + self.function, + self.output_keys, + input_typespec=input_python_schema, + output_typespec=output_python_schema, + ) ) self._input_packet_schema = dict(input_packet_types) self._output_packet_schema = dict(output_packet_types) diff --git a/src/orcapod/types.py b/src/orcapod/types.py new file mode 100644 index 0000000..22043de --- /dev/null +++ b/src/orcapod/types.py @@ -0,0 +1,34 @@ +from typing import TypeAlias +import os +from collections.abc import Collection, Mapping + +import logging + +logger = logging.getLogger(__name__) + +DataType: TypeAlias = type + +TypeSpec: TypeAlias = Mapping[ + str, DataType +] # Mapping of parameter names to their types + +# Convenience alias for anything pathlike +PathLike = str | os.PathLike + +# an (optional) string or a collection of (optional) string values +# Note that TagValue can be nested, allowing for an arbitrary depth of nested lists +TagValue: TypeAlias = int | str | None | Collection["TagValue"] + +# a pathset is a path or an arbitrary depth of nested list of paths +PathSet: TypeAlias = PathLike | Collection[PathLike | None] + +# Simple data types that we support (with clear Polars 
correspondence) +SupportedNativePythonData: TypeAlias = str | int | float | bool | bytes + +ExtendedSupportedPythonData: TypeAlias = SupportedNativePythonData | PathSet + +# Extended data values that can be stored in packets +# Either the original PathSet or one of our supported simple data types +DataValue: TypeAlias = ExtendedSupportedPythonData | Collection["DataValue"] | None + +PacketLike: TypeAlias = Mapping[str, DataValue] diff --git a/src/orcapod/utils/types_utils.py b/src/orcapod/utils/types_utils.py new file mode 100644 index 0000000..3f5df59 --- /dev/null +++ b/src/orcapod/utils/types_utils.py @@ -0,0 +1,301 @@ +# Library of functions for working with TypeSpecs and for extracting TypeSpecs from a function's signature + +from collections.abc import Callable, Collection, Sequence, Mapping +from typing import get_origin, get_args, Any +from orcapod.types import TypeSpec +import inspect +import logging + + +logger = logging.getLogger(__name__) + + +def verify_against_typespec(packet: dict, typespec: TypeSpec) -> bool: + """Verify that the dictionary's types match the expected types in the typespec.""" + from beartype.door import is_bearable + + # verify that packet contains no keys not in typespec + if set(packet.keys()) - set(typespec.keys()): + logger.warning( + f"Packet contains keys not in typespec: {set(packet.keys()) - set(typespec.keys())}. " + ) + return False + for key, type_info in typespec.items(): + if key not in packet: + logger.warning( + f"Key '{key}' not found in packet. Assuming None but this behavior may change in the future" + ) + if not is_bearable(packet.get(key), type_info): + logger.warning( + f"Type mismatch for key '{key}': expected {type_info}, got {packet.get(key)}." + ) + return False + return True + + +# TODO: is_subhint does not handle invariance properly +# so when working with mutable types, we have to make sure to perform deep copy +def check_typespec_compatibility( + incoming_types: TypeSpec, receiving_types: TypeSpec +) -> bool: + from beartype.door import is_subhint + + for key, type_info in incoming_types.items(): + if key not in receiving_types: + logger.warning(f"Key '{key}' not found in parameter types.") + return False + if not is_subhint(type_info, receiving_types[key]): + logger.warning( + f"Type mismatch for key '{key}': expected {receiving_types[key]}, got {type_info}." + ) + return False + return True + + +def extract_function_typespecs( + func: Callable, + output_keys: Collection[str], + input_typespec: TypeSpec | None = None, + output_typespec: TypeSpec | Sequence[type] | None = None, +) -> tuple[TypeSpec, TypeSpec]: + """ + Extract input and output data types from a function signature. + + This function analyzes a function's signature to determine the types of its parameters + and return values. It combines information from type annotations, user-provided type + specifications, and return key mappings to produce complete type specifications. + + Args: + func: The function to analyze for type information. + output_keys: Collection of string keys that will be used to map the function's + return values. For functions returning a single value, provide a single key. + For functions returning multiple values (tuple/list), provide keys matching + the number of return items. + input_types: Optional mapping of parameter names to their types. If provided, + these types override any type annotations in the function signature for the + specified parameters. If a parameter is not in this mapping and has no + annotation, an error is raised. 
+        output_types: Optional type specification for return values. Can be either:
+            - A dict mapping output keys to types (TypeSpec)
+            - A sequence of types that will be mapped to output_keys in order
+            These types override any inferred types from the function's return annotation.
+
+    Returns:
+        A tuple containing:
+        - input_types_dict: Mapping of parameter names to their inferred/specified types
+        - output_types_dict: Mapping of output keys to their inferred/specified types
+
+    Raises:
+        ValueError: In various scenarios:
+        - Parameter has no type annotation and is not in input_types
+        - Function has return annotation but no output_keys specified
+        - Function has explicit None return but non-empty output_keys provided
+        - Multiple output_keys specified but return annotation is not a sequence type
+        - Return annotation is a sequence type but doesn't specify item types
+        - Number of types in return annotation doesn't match number of output_keys
+        - Output types sequence length doesn't match output_keys length
+        - Output key not specified in output_types and has no type annotation
+
+    Examples:
+        >>> def add(x: int, y: int) -> int:
+        ...     return x + y
+        >>> input_types, output_types = extract_function_data_types(add, ['result'])
+        >>> input_types
+        {'x': <class 'int'>, 'y': <class 'int'>}
+        >>> output_types
+        {'result': <class 'int'>}
+
+        >>> def process(data: str) -> tuple[int, str]:
+        ...     return len(data), data.upper()
+        >>> input_types, output_types = extract_function_data_types(
+        ...     process, ['length', 'upper_data']
+        ... )
+        >>> input_types
+        {'data': <class 'str'>}
+        >>> output_types
+        {'length': <class 'int'>, 'upper_data': <class 'str'>}
+
+        >>> def legacy_func(x, y): # No annotations
+        ...     return x + y
+        >>> input_types, output_types = extract_function_data_types(
+        ...     legacy_func, ['sum'],
+        ...     input_types={'x': int, 'y': int},
+        ...     output_types={'sum': int}
+        ... )
+        >>> input_types
+        {'x': <class 'int'>, 'y': <class 'int'>}
+        >>> output_types
+        {'sum': <class 'int'>}
+
+        >>> def multi_return(data: list) -> tuple[int, float, str]:
+        ...     return len(data), sum(data), str(data)
+        >>> input_types, output_types = extract_function_data_types(
+        ...     multi_return, ['count', 'total', 'repr'],
+        ...     output_types=[int, float, str] # Override with sequence
+        ... )
+        >>> output_types
+        {'count': <class 'int'>, 'total': <class 'float'>, 'repr': <class 'str'>}
+    """
+    verified_output_types: TypeSpec = {}
+    if output_typespec is not None:
+        if isinstance(output_typespec, dict):
+            verified_output_types = output_typespec
+        elif isinstance(output_typespec, Sequence):
+            # If output_types is a collection, convert it to a dict with keys from return_keys
+            if len(output_typespec) != len(output_keys):
+                raise ValueError(
+                    f"Output types collection length {len(output_typespec)} does not match return keys length {len(output_keys)}."
+                )
+            verified_output_types = {k: v for k, v in zip(output_keys, output_typespec)}
+
+    signature = inspect.signature(func)
+
+    param_info: TypeSpec = {}
+    for name, param in signature.parameters.items():
+        if input_typespec and name in input_typespec:
+            param_info[name] = input_typespec[name]
+        else:
+            # check if the parameter has annotation
+            if param.annotation is not inspect.Signature.empty:
+                param_info[name] = param.annotation
+            else:
+                raise ValueError(
+                    f"Parameter '{name}' has no type annotation and is not specified in input_types."
+ ) + + return_annot = signature.return_annotation + inferred_output_types: TypeSpec = {} + if return_annot is not inspect.Signature.empty and return_annot is not None: + output_item_types = [] + if len(output_keys) == 0: + raise ValueError( + "Function has a return type annotation, but no return keys were specified." + ) + elif len(output_keys) == 1: + # if only one return key, the entire annotation is inferred as the return type + output_item_types = [return_annot] + elif (get_origin(return_annot) or return_annot) in (tuple, list, Sequence): + if get_origin(return_annot) is None: + # right type was specified but did not specified the type of items + raise ValueError( + f"Function return type annotation {return_annot} is a Sequence type but does not specify item types." + ) + output_item_types = get_args(return_annot) + if len(output_item_types) != len(output_keys): + raise ValueError( + f"Function return type annotation {return_annot} has {len(output_item_types)} items, " + f"but output_keys has {len(output_keys)} items." + ) + else: + raise ValueError( + f"Multiple return keys were specified but return type annotation {return_annot} is not a sequence type (list, tuple, Collection)." + ) + for key, type_annot in zip(output_keys, output_item_types): + inferred_output_types[key] = type_annot + elif return_annot is None: + if len(output_keys) != 0: + raise ValueError( + f"Function provides explicit return type annotation as None, but return keys of length {len(output_keys)} were specified." + ) + else: + inferred_output_types = {k: inspect.Signature.empty for k in output_keys} + + # TODO: simplify the handling here -- technically all keys should already be in return_types + for key in output_keys: + if key in verified_output_types: + inferred_output_types[key] = verified_output_types[key] + elif ( + key not in inferred_output_types + or inferred_output_types[key] is inspect.Signature.empty + ): + raise ValueError( + f"Type for return item '{key}' is not specified in output_types and has no type annotation in function signature." + ) + return param_info, inferred_output_types + + +def get_typespec_from_dict( + data: Mapping, typespec: TypeSpec | None = None, default=str +) -> TypeSpec: + """ + Returns a TypeSpec for the given dictionary. + The TypeSpec is a mapping from field name to Python type. If typespec is provided, then + it is used as a base when inferring types for the fields in dict + """ + if typespec is None: + typespec = {} + return { + key: typespec.get(key, type(value) if value is not None else default) + for key, value in data.items() + } + + +def get_compatible_type(type1: Any, type2: Any) -> Any: + if type1 is type2: + return type1 + if issubclass(type1, type2): + return type2 + if issubclass(type2, type1): + return type1 + raise TypeError(f"Types {type1} and {type2} are not compatible") + + +def union_typespecs(*typespecs: TypeSpec) -> TypeSpec: + # Merge the two TypeSpecs but raise an error if conflicts in types are found + merged = dict(typespecs[0]) + for typespec in typespecs[1:]: + for key, right_type in typespec.items(): + merged[key] = ( + get_compatible_type(merged[key], right_type) + if key in merged + else right_type + ) + return merged + + +def intersection_typespecs(*typespecs: TypeSpec) -> TypeSpec: + """ + Returns the intersection of all TypeSpecs, only returning keys that are present in all typespecs. + If a key is present in both TypeSpecs, the type must be the same. 
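# [Illustrative sketch -- not part of the patch] Usage of extract_function_typespecs
# as documented above. The import path follows this diff; exact availability may differ.
from orcapod.utils.types_utils import extract_function_typespecs

def process(data: str) -> tuple[int, str]:
    return len(data), data.upper()

# Annotated function: types are read from the signature.
input_types, output_types = extract_function_typespecs(process, ["length", "upper"])
# input_types  == {"data": str}
# output_types == {"length": int, "upper": str}

def legacy(x, y):  # no annotations
    return x + y

# Unannotated function: typespecs must be supplied explicitly.
input_types, output_types = extract_function_typespecs(
    legacy,
    ["total"],
    input_typespec={"x": int, "y": int},
    output_typespec={"total": int},
)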
+ """ + + # Find common keys and ensure types match + + common_keys = set(typespecs[0].keys()) + for typespec in typespecs[1:]: + common_keys.intersection_update(typespec.keys()) + + intersection = {k: typespecs[0][k] for k in common_keys} + for typespec in typespecs[1:]: + for key in common_keys: + try: + intersection[key] = get_compatible_type( + intersection[key], typespec[key] + ) + except TypeError: + # If types are not compatible, raise an error + raise TypeError( + f"Type conflict for key '{key}': {intersection[key]} vs {typespec[key]}" + ) + return intersection + + +# def intersection_typespecs(left: TypeSpec, right: TypeSpec) -> TypeSpec: +# """ +# Returns the intersection of two TypeSpecs, only returning keys that are present in both. +# If a key is present in both TypeSpecs, the type must be the same. +# """ + +# # Find common keys and ensure types match +# common_keys = set(left.keys()).intersection(set(right.keys())) +# intersection = {} +# for key in common_keys: +# try: +# intersection[key] = get_compatible_type(left[key], right[key]) +# except TypeError: +# # If types are not compatible, raise an error +# raise TypeError( +# f"Type conflict for key '{key}': {left[key]} vs {right[key]}" +# ) + +# return intersection From 8c6721a0a8a8d39ed2228f4042e418bb14ce8953 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 19:25:38 -0700 Subject: [PATCH 202/224] refactor: change protocol name and use PythonSchema in place of TypeSpec --- src/orcapod/contexts/__init__.py | 2 +- src/orcapod/contexts/core.py | 2 +- src/orcapod/contexts/data/v0.1.json | 6 +- src/orcapod/data/datagrams/arrow_datagram.py | 6 +- .../data/datagrams/arrow_tag_packet.py | 8 +- src/orcapod/data/datagrams/base.py | 4 +- src/orcapod/data/datagrams/dict_datagram.py | 10 ++- src/orcapod/data/datagrams/dict_tag_packet.py | 4 +- src/orcapod/data/kernels.py | 8 +- src/orcapod/data/operators/batch.py | 27 +++--- src/orcapod/data/operators/join.py | 4 +- src/orcapod/data/operators/mappers.py | 6 +- src/orcapod/data/operators/semijoin.py | 4 +- src/orcapod/data/pods.py | 12 +-- src/orcapod/data/sources/__init__.py | 4 +- .../data/sources/arrow_table_source.py | 66 ++++++++++++++ src/orcapod/data/sources/base.py | 8 +- src/orcapod/data/sources/csv_source.py | 4 +- src/orcapod/data/sources/dict_source.py | 85 ++++++++++++++----- src/orcapod/data/sources/list_source.py | 5 +- .../data/sources/manual_table_source.py | 10 +-- src/orcapod/data/streams.py | 26 ++++-- .../hashing/function_info_extractors.py | 10 +-- src/orcapod/pipeline/nodes.py | 4 +- .../protocols/data_protocols/datagrams.py | 8 +- .../protocols/data_protocols/kernel.py | 4 +- src/orcapod/protocols/data_protocols/pods.py | 6 +- src/orcapod/protocols/hashing_protocols.py | 6 +- ...otocols.py => semantic_types_protocols.py} | 0 src/orcapod/semantic_types/__init__.py | 4 +- src/orcapod/semantic_types/type_inference.py | 80 ++++++++++++++--- .../semantic_types/universal_converter.py | 6 +- .../unused/semantic_converters.py | 6 +- src/orcapod/types.py | 5 +- src/orcapod/utils/arrow_utils.py | 5 +- src/orcapod/utils/types_utils.py | 26 +++--- 36 files changed, 334 insertions(+), 147 deletions(-) create mode 100644 src/orcapod/data/sources/arrow_table_source.py rename src/orcapod/protocols/{semantic_protocols.py => semantic_types_protocols.py} (100%) diff --git a/src/orcapod/contexts/__init__.py b/src/orcapod/contexts/__init__.py index a47d847..116dbbb 100644 --- a/src/orcapod/contexts/__init__.py +++ b/src/orcapod/contexts/__init__.py @@ -28,7 
+28,7 @@ from .core import DataContext, ContextValidationError, ContextResolutionError from .registry import JSONDataContextRegistry from typing import Any -from orcapod.protocols import hashing_protocols as hp, semantic_protocols as sp +from orcapod.protocols import hashing_protocols as hp, semantic_types_protocols as sp # Global registry instance (lazily initialized) _registry: JSONDataContextRegistry | None = None diff --git a/src/orcapod/contexts/core.py b/src/orcapod/contexts/core.py index 09ca2cc..f1b35d3 100644 --- a/src/orcapod/contexts/core.py +++ b/src/orcapod/contexts/core.py @@ -7,7 +7,7 @@ from dataclasses import dataclass -from orcapod.protocols import hashing_protocols as hp, semantic_protocols as sp +from orcapod.protocols import hashing_protocols as hp, semantic_types_protocols as sp @dataclass diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 7cbf2a9..9f1708e 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -5,12 +5,12 @@ "semantic_registry": { "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", "_config": { - "converters": [ - { + "converters": { + "path": { "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", "_config": {} } - ] + } } }, "arrow_hasher": { diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index 15c2655..a6cc85a 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -7,7 +7,7 @@ from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.data.system_constants import constants -from orcapod.types import DataValue, TypeSpec +from orcapod.types import DataValue, PythonSchema from orcapod.protocols.hashing_protocols import ContentHash from orcapod.utils import arrow_utils @@ -142,9 +142,9 @@ def __init__( ) # Initialize caches - self._cached_python_schema: TypeSpec | None = None + self._cached_python_schema: PythonSchema | None = None self._cached_python_dict: dict[str, DataValue] | None = None - self._cached_meta_python_schema: TypeSpec | None = None + self._cached_meta_python_schema: PythonSchema | None = None self._cached_content_hash: ContentHash | None = None # 1. 
Core Properties (Identity & Structure) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index e2b0aec..d57e906 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -7,9 +7,9 @@ from orcapod.data.system_constants import constants from orcapod import contexts -from orcapod.semantic_types import infer_schema_from_pylist_data +from orcapod.semantic_types import infer_python_schema_from_pylist_data -from orcapod.types import DataValue, TypeSpec +from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.data.datagrams.arrow_datagram import ArrowDatagram @@ -59,7 +59,7 @@ def __init__( )[0] ) self._system_tags_dict.update(system_tags or {}) - self._system_tags_python_schema = infer_schema_from_pylist_data( + self._system_tags_python_schema = infer_python_schema_from_pylist_data( [self._system_tags_dict] ) self._system_tags_table = ( @@ -279,7 +279,7 @@ def __init__( self._source_info_table = prefixed_tables[constants.SOURCE_PREFIX] self._cached_source_info: dict[str, str | None] | None = None - self._cached_python_schema: TypeSpec | None = None + self._cached_python_schema: PythonSchema | None = None def keys( self, diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/data/datagrams/base.py index 844685a..9199be7 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/data/datagrams/base.py @@ -26,7 +26,7 @@ import pyarrow as pa -from orcapod.types import DataValue, TypeSpec +from orcapod.types import DataValue, PythonSchema logger = logging.getLogger(__name__) @@ -178,7 +178,7 @@ def types( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> TypeSpec: + ) -> PythonSchema: """Return type specification for the datagram.""" ... diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 77da019..dcee19d 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -6,7 +6,7 @@ from orcapod.data.system_constants import constants from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram -from orcapod.semantic_types import infer_schema_from_pylist_data +from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue from orcapod.utils import arrow_utils from orcapod.protocols.hashing_protocols import ContentHash @@ -106,7 +106,9 @@ def __init__( # Combine provided typespec info with inferred typespec from content # If the column value is None and no type spec is provided, defaults to str. 
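# [Illustrative sketch -- not part of the patch] The override pattern implemented just
# below: a caller-supplied python_schema wins over types inferred from the data, and
# all-None columns fall back to the default type. The values here are hypothetical.
from orcapod.semantic_types import infer_python_schema_from_pylist_data

row = {"path": "/tmp/a.txt", "count": 3, "note": None}
inferred = infer_python_schema_from_pylist_data([row], default_type=str)
# {"path": str, "count": int, "note": str | None}

provided = {"count": float}  # caller override for one column
resolved = {k: provided.get(k, v) for k, v in inferred.items()}
# {"path": str, "count": float, "note": str | None}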
- inferred_schema = infer_schema_from_pylist_data([self._data], default_type=str) + inferred_schema = infer_python_schema_from_pylist_data( + [self._data], default_type=str + ) self._data_python_schema = ( {k: python_schema.get(k, v) for k, v in inferred_schema.items()} @@ -115,7 +117,7 @@ def __init__( ) # Create schema for meta data - inferred_meta_schema = infer_schema_from_pylist_data( + inferred_meta_schema = infer_python_schema_from_pylist_data( [self._meta_data], default_type=str ) self._meta_python_schema = ( @@ -752,7 +754,7 @@ def with_columns( if column_types is not None: python_schema.update(column_types) - new_python_schema = infer_schema_from_pylist_data([new_data]) + new_python_schema = infer_python_schema_from_pylist_data([new_data]) new_python_schema = { k: python_schema.get(k, v) for k, v in new_python_schema.items() } diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index 11b738c..64575d3 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -8,7 +8,7 @@ from orcapod import contexts from orcapod.data.datagrams.dict_datagram import DictDatagram from orcapod.utils import arrow_utils -from orcapod.semantic_types import infer_schema_from_pylist_data +from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue logger = logging.getLogger(__name__) @@ -55,7 +55,7 @@ def __init__( self._system_tags = {**extracted_system_tags, **(system_tags or {})} self._system_tags_python_schema: dict[str, type] = ( - infer_schema_from_pylist_data([self._system_tags]) + infer_python_schema_from_pylist_data([self._system_tags]) ) self._cached_system_tags_table: pa.Table | None = None self._cached_system_tags_schema: pa.Schema | None = None diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 0143396..ad0ecc1 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -7,7 +7,7 @@ from orcapod.data.streams import KernelStream from orcapod.data.base import LabeledContentIdentifiableBase from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema logger = logging.getLogger(__name__) @@ -76,7 +76,7 @@ def _set_modified_time( @abstractmethod def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ Return the output types of the kernel given the input streams. 
""" @@ -84,7 +84,7 @@ def kernel_output_types( def output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) return self.kernel_output_types( @@ -207,7 +207,7 @@ def kernel_id(self) -> tuple[str, ...]: def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: return self.kernel.output_types( *streams, include_system_tags=include_system_tags ) diff --git a/src/orcapod/data/operators/batch.py b/src/orcapod/data/operators/batch.py index 603402a..3d8e82e 100644 --- a/src/orcapod/data/operators/batch.py +++ b/src/orcapod/data/operators/batch.py @@ -1,9 +1,10 @@ from orcapod.data.operators.base import UnaryOperator -from collections.abc import Collection, Mapping +from collections.abc import Collection from orcapod.protocols import data_protocols as dp from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from orcapod.data.streams import TableStream + if TYPE_CHECKING: import pyarrow as pa import polars as pl @@ -11,19 +12,20 @@ pa = LazyModule("pyarrow") pl = LazyModule("polars") -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema + class Batch(UnaryOperator): """ Base class for all operators. """ - def __init__(self, batch_size:int = 0, drop_last_batch:bool=False, **kwargs): + def __init__(self, batch_size: int = 0, drop_last_batch: bool = False, **kwargs): if batch_size < 0: raise ValueError("Batch size must be non-negative.") - + super().__init__(**kwargs) - + self.batch_size = batch_size self.drop_last_batch = drop_last_batch @@ -42,7 +44,6 @@ def validate_inputs(self, *streams: dp.Stream) -> None: stream = streams[0] return self.op_validate_inputs(stream) - def op_validate_inputs(self, stream: dp.Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. @@ -50,7 +51,6 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: """ return None - def op_forward(self, stream: dp.Stream) -> dp.Stream: """ This method should be implemented by subclasses to define the specific behavior of the binary operator. @@ -83,23 +83,22 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: batched_table = pa.Table.from_pylist(batched_data) return TableStream(batched_table, tag_columns=tag_columns) - - def op_output_types( self, stream: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes two streams as input and returns a tuple of typespecs. 
""" - tag_types, packet_types = stream.types() + tag_types, packet_types = stream.types() batched_tag_types = {k: list[v] for k, v in tag_types.items()} batched_packet_types = {k: list[v] for k, v in packet_types.items()} return batched_tag_types, batched_packet_types - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: return ( - (self.__class__.__name__, self.batch_size, self.drop_last_batch) + (stream,) if stream is not None else () - ) \ No newline at end of file + (self.__class__.__name__, self.batch_size, self.drop_last_batch) + (stream,) + if stream is not None + else () + ) diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/data/operators/join.py index 1090d25..f6ecbb7 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/data/operators/join.py @@ -1,6 +1,6 @@ from orcapod.protocols import data_protocols as dp from orcapod.data.streams import TableStream -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from orcapod.utils import types_utils from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule @@ -34,7 +34,7 @@ def op_validate_inputs(self, *streams: dp.Stream) -> None: def op_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: if len(streams) == 1: # If only one stream is provided, return its typespecs return streams[0].types(include_system_tags=include_system_tags) diff --git a/src/orcapod/data/operators/mappers.py b/src/orcapod/data/operators/mappers.py index c048c5a..a32b4ca 100644 --- a/src/orcapod/data/operators/mappers.py +++ b/src/orcapod/data/operators/mappers.py @@ -1,6 +1,6 @@ from orcapod.protocols import data_protocols as dp from orcapod.data.streams import TableStream -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from collections.abc import Mapping @@ -79,7 +79,7 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: message += f"overlapping tag columns: {overlapping_tag_columns}." raise InputValidationError(message) - def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def op_output_types(self, stream: dp.Stream) -> tuple[PythonSchema, PythonSchema]: tag_typespec, packet_typespec = stream.types() # Create new packet typespec with renamed keys @@ -157,7 +157,7 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: message += f"overlapping packet columns: {overlapping_packet_columns}." 
raise InputValidationError(message) - def op_output_types(self, stream: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + def op_output_types(self, stream: dp.Stream) -> tuple[PythonSchema, PythonSchema]: tag_typespec, packet_typespec = stream.types() # Create new packet typespec with renamed keys diff --git a/src/orcapod/data/operators/semijoin.py b/src/orcapod/data/operators/semijoin.py index 604969d..de537ee 100644 --- a/src/orcapod/data/operators/semijoin.py +++ b/src/orcapod/data/operators/semijoin.py @@ -1,7 +1,7 @@ from orcapod.protocols import data_protocols as dp from orcapod.data.streams import TableStream from orcapod.utils import types_utils -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from orcapod.errors import InputValidationError @@ -101,7 +101,7 @@ def op_output_types( left_stream: dp.Stream, right_stream: dp.Stream, include_system_tags: bool = False, - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ Returns the output types for the semi-join operation. The output preserves the exact schema of the left stream. diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 0c9e7d5..bd41151 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -17,7 +17,7 @@ from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.store_protocols import ArrowDataStore -from orcapod.types import DataValue, TypeSpec +from orcapod.types import DataValue, PythonSchema from orcapod.utils import types_utils from orcapod.utils.lazy_module import LazyModule from orcapod.hashing.hash_utils import get_function_signature, get_function_components @@ -60,14 +60,14 @@ class ActivatablePodBase(TrackedKernelBase): """ @abstractmethod - def input_packet_types(self) -> TypeSpec: + def input_packet_types(self) -> PythonSchema: """ Return the input typespec for the pod. This is used to validate the input streams. """ ... @abstractmethod - def output_packet_types(self) -> TypeSpec: + def output_packet_types(self) -> PythonSchema: """ Return the output typespec for the pod. This is used to validate the output streams. """ @@ -122,7 +122,7 @@ def major_version(self) -> int: def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ Return the input and output typespecs for the pod. This is used to validate the input and output streams. @@ -558,14 +558,14 @@ def tiered_pod_id(self) -> dict[str, str]: def computed_label(self) -> str | None: return self.pod.label - def input_packet_types(self) -> TypeSpec: + def input_packet_types(self) -> PythonSchema: """ Return the input typespec for the stored pod. This is used to validate the input streams. """ return self.pod.input_packet_types() - def output_packet_types(self) -> TypeSpec: + def output_packet_types(self) -> PythonSchema: """ Return the output typespec for the stored pod. This is used to validate the output streams. 
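# [Illustrative sketch -- not part of the patch] How a pod's declared packet schemas
# (input_packet_types / output_packet_types above) can be checked against an incoming
# stream using check_typespec_compatibility from orcapod.utils.types_utils. The
# schemas shown are hypothetical.
from orcapod.utils.types_utils import check_typespec_compatibility

incoming_packet_types = {"image_path": str, "threshold": float}
declared_input_types = {"image_path": str | None, "threshold": float}

# Each incoming column must be a subhint of what the pod declares it accepts.
assert check_typespec_compatibility(incoming_packet_types, declared_input_types)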
diff --git a/src/orcapod/data/sources/__init__.py b/src/orcapod/data/sources/__init__.py index 51b7a56..6d6a954 100644 --- a/src/orcapod/data/sources/__init__.py +++ b/src/orcapod/data/sources/__init__.py @@ -1,3 +1,5 @@ from .base import SourceBase +from .arrow_table_source import ArrowTableSource +from .dict_source import DictSource -__all__ = ["SourceBase"] +__all__ = ["SourceBase", "ArrowTableSource", "DictSource"] diff --git a/src/orcapod/data/sources/arrow_table_source.py b/src/orcapod/data/sources/arrow_table_source.py new file mode 100644 index 0000000..fc98109 --- /dev/null +++ b/src/orcapod/data/sources/arrow_table_source.py @@ -0,0 +1,66 @@ +from collections.abc import Collection, Mapping +from typing import TYPE_CHECKING, Any + + +from orcapod.data.streams import TableStream +from orcapod.protocols import data_protocols as dp +from orcapod.types import DataValue +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.data.system_constants import constants +from orcapod.semantic_types import infer_python_schema_from_pylist_data + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + +from orcapod.data.sources.base import SourceBase + + +class ArrowTableSource(SourceBase): + """Construct source from a collection of dictionaries""" + + def __init__( + self, + table: "pa.Table", + tag_columns: Collection[str] = (), + system_tag_columns: Collection[str] = (), + source_info: dict[str, str | None] | None = None, + **kwargs, + ): + super().__init__(**kwargs) + self.table = table + self.tag_columns = tag_columns + self.system_tag_columns = system_tag_columns + self.source_info = source_info + self.table_hash = self.data_context.arrow_hasher.hash_table(self.table) + self._table_stream = TableStream( + table=self.table, + tag_columns=self.tag_columns, + system_tag_columns=self.system_tag_columns, + source=self, + upstreams=(), + ) + + def source_identity_structure(self) -> Any: + return (self.__class__.__name__, self.table_hash) + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + return self().as_table(include_source=include_system_columns) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Load data from file and return a static stream. + + This is called by forward() and creates a fresh snapshot each time. 
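# [Illustrative usage sketch -- not part of the patch] Constructing the new
# ArrowTableSource and inspecting the resulting schemas. Assumes a default data
# context is available and that SourceBase.__call__ returns the cached stream, as
# the delegation methods in base.py suggest.
import pyarrow as pa
from orcapod.data.sources import ArrowTableSource

table = pa.table({"subject": ["s1", "s2"], "score": [0.7, 0.9]})

# "subject" is treated as a tag column; the remaining columns form the packet.
source = ArrowTableSource(table, tag_columns=["subject"])
stream = source()
tag_types, packet_types = stream.types()
# tag_types    ~= {"subject": str}
# packet_types ~= {"score": float}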
+ """ + return self._table_stream + + def source_output_types( + self, include_system_tags: bool = False + ) -> tuple[dict[str, type], dict[str, type]]: + """Return tag and packet types based on provided typespecs.""" + return self._table_stream.types(include_system_tags=include_system_tags) diff --git a/src/orcapod/data/sources/base.py b/src/orcapod/data/sources/base.py index 28394f7..3190514 100644 --- a/src/orcapod/data/sources/base.py +++ b/src/orcapod/data/sources/base.py @@ -9,7 +9,7 @@ StatefulStreamBase, ) from orcapod.protocols import data_protocols as dp -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: @@ -120,7 +120,9 @@ def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: """Delegate to the cached KernelStream.""" return self().keys() - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: """Delegate to the cached KernelStream.""" return self().types(include_system_tags=include_system_tags) @@ -247,7 +249,7 @@ def __init__(self, stream: dp.Stream, label: str | None = None, **kwargs) -> Non def source_output_types( self, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. diff --git a/src/orcapod/data/sources/csv_source.py b/src/orcapod/data/sources/csv_source.py index c560879..2540645 100644 --- a/src/orcapod/data/sources/csv_source.py +++ b/src/orcapod/data/sources/csv_source.py @@ -6,7 +6,7 @@ TableStream, ) from orcapod.protocols import data_protocols as dp -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: @@ -61,7 +61,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: def source_output_types( self, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """Infer types from the file (could be cached).""" # For demonstration - in practice you might cache this sample_stream = self.forward() diff --git a/src/orcapod/data/sources/dict_source.py b/src/orcapod/data/sources/dict_source.py index 290d7ea..42456e8 100644 --- a/src/orcapod/data/sources/dict_source.py +++ b/src/orcapod/data/sources/dict_source.py @@ -2,15 +2,13 @@ from typing import TYPE_CHECKING, Any -from pyarrow.lib import Table - from orcapod.data.streams import TableStream from orcapod.protocols import data_protocols as dp from orcapod.types import DataValue from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants -from orcapod.semantic_types import infer_schema_from_pylist_data +from orcapod.semantic_types import infer_python_schema_from_pylist_data if TYPE_CHECKING: import pyarrow as pa @@ -20,37 +18,84 @@ from orcapod.data.sources.base import SourceBase +def add_source_field( + record: dict[str, DataValue], source_info: str +) -> dict[str, DataValue]: + """Add source information to a record.""" + # for all "regular" fields, add source info + for key in record.keys(): + if not key.startswith(constants.META_PREFIX) and not key.startswith( + constants.DATAGRAM_PREFIX + ): + record[f"{constants.SOURCE_PREFIX}{key}"] = f"{source_info}:{key}" + return record + + +def 
split_fields_with_prefixes( + record, prefixes: Collection[str] +) -> tuple[dict[str, DataValue], dict[str, DataValue]]: + """Split fields in a record into two dictionaries based on prefixes.""" + matching = {} + non_matching = {} + for key, value in record.items(): + if any(key.startswith(prefix) for prefix in prefixes): + matching[key] = value + else: + non_matching[key] = value + return matching, non_matching + + +def split_system_columns( + data: list[dict[str, DataValue]], +) -> tuple[list[dict[str, DataValue]], list[dict[str, DataValue]]]: + system_columns: list[dict[str, DataValue]] = [] + non_system_columns: list[dict[str, DataValue]] = [] + for record in data: + sys_cols, non_sys_cols = split_fields_with_prefixes( + record, [constants.META_PREFIX, constants.DATAGRAM_PREFIX] + ) + system_columns.append(sys_cols) + non_system_columns.append(non_sys_cols) + return system_columns, non_system_columns + + class DictSource(SourceBase): """Construct source from a collection of dictionaries""" def __init__( self, - tags: Collection[dict[str, DataValue]], - packets: Collection[dict[str, DataValue]], + data: Collection[dict[str, DataValue]], + tag_columns: Collection[str], tag_schema: Mapping[str, type] | None = None, packet_schema: Mapping[str, type] | None = None, **kwargs, ): super().__init__(**kwargs) - self.tags = list(tags) - self.packets = list(packets) - if len(self.tags) != len(self.packets) or len(self.tags) == 0: - raise ValueError( - "Tags and packets must be non-empty collections of equal length" - ) + data = list(data) + tags = [] + packets = [] + for item in data: + tags.append({k: item[k] for k in tag_columns}) + packets.append({k: item[k] for k in item if k not in tag_columns}) + + # TODO: visit source info logic + source_info = ":".join(self.kernel_id) + + raw_data, system_data = split_system_columns(data) + + self.tags = tags + self.packets = [add_source_field(packet, source_info) for packet in packets] + self.tag_schema = ( - dict(tag_schema) if tag_schema else infer_schema_from_pylist_data(self.tags) + dict(tag_schema) + if tag_schema + else infer_python_schema_from_pylist_data(self.tags) ) self.packet_schema = ( dict(packet_schema) if packet_schema - else infer_schema_from_pylist_data(self.packets) + else infer_python_schema_from_pylist_data(self.packets) ) - source_info = ":".join(self.kernel_id) - self.source_info = { - f"{constants.SOURCE_PREFIX}{k}": f"{source_info}:{k}" - for k in self.tag_schema - } def source_identity_structure(self) -> Any: return ( @@ -59,7 +104,9 @@ def source_identity_structure(self) -> Any: tuple(self.packet_schema.items()), ) - def get_all_records(self, include_system_columns: bool = False) -> Table | None: + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": return self().as_table(include_source=include_system_columns) def forward(self, *streams: dp.Stream) -> dp.Stream: diff --git a/src/orcapod/data/sources/list_source.py b/src/orcapod/data/sources/list_source.py index 95e90ae..20503e9 100644 --- a/src/orcapod/data/sources/list_source.py +++ b/src/orcapod/data/sources/list_source.py @@ -3,7 +3,6 @@ from typing import TYPE_CHECKING, Any, Literal, cast from deltalake import DeltaTable, write_deltalake -from deltalake.exceptions import TableNotFoundError from pyarrow.lib import Table from orcapod.data.datagrams import DictTag @@ -15,11 +14,11 @@ ) from orcapod.errors import DuplicateTagError from orcapod.protocols import data_protocols as dp -from orcapod.types import DataValue, TypeSpec +from 
orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants -from orcapod.semantic_types import infer_schema_from_pylist_data +from orcapod.semantic_types import infer_python_schema_from_pylist_data if TYPE_CHECKING: import pandas as pd diff --git a/src/orcapod/data/sources/manual_table_source.py b/src/orcapod/data/sources/manual_table_source.py index e393cce..0e9d49b 100644 --- a/src/orcapod/data/sources/manual_table_source.py +++ b/src/orcapod/data/sources/manual_table_source.py @@ -14,11 +14,11 @@ ) from orcapod.errors import DuplicateTagError from orcapod.protocols import data_protocols as dp -from orcapod.types import DataValue, TypeSpec +from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants -from orcapod.semantic_types import infer_schema_from_pylist_data +from orcapod.semantic_types import infer_python_schema_from_pylist_data if TYPE_CHECKING: import pandas as pd @@ -141,11 +141,11 @@ def source_identity_structure(self) -> Any: def source_output_types( self, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """Return tag and packet types based on schema and tag columns.""" # TODO: auto add system entry tag - tag_types: TypeSpec = {} - packet_types: TypeSpec = {} + tag_types: PythonSchema = {} + packet_types: PythonSchema = {} for field, field_type in self.python_schema.items(): if field in self.tag_columns: tag_types[field] = field_type diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 7b41747..d928b51 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -15,7 +15,7 @@ ) from orcapod.data.system_constants import constants from orcapod.protocols import data_protocols as dp -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -217,12 +217,14 @@ def packet_keys(self) -> tuple[str, ...]: return self.keys()[1] @abstractmethod - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: ... + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: ... - def tag_types(self, include_system_tags: bool = False) -> TypeSpec: + def tag_types(self, include_system_tags: bool = False) -> PythonSchema: return self.types(include_system_tags=include_system_tags)[0] - def packet_types(self) -> TypeSpec: + def packet_types(self) -> PythonSchema: return self.types()[1] @property @@ -882,7 +884,9 @@ def keys( ) return tuple(tag_types.keys()), tuple(packet_types.keys()) - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. 
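# [Illustrative sketch -- not part of the patch] Behaviour of the prefix-splitting
# helper added to dict_source.py above. The literal "__meta_" prefix here is a
# stand-in for constants.META_PREFIX / constants.DATAGRAM_PREFIX.
from orcapod.data.sources.dict_source import split_fields_with_prefixes

record = {"__meta_created": "2025-08-26", "path": "/tmp/x.txt", "score": 1.5}
system, regular = split_fields_with_prefixes(record, ["__meta_"])
# system  == {"__meta_created": "2025-08-26"}
# regular == {"path": "/tmp/x.txt", "score": 1.5}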
@@ -1089,7 +1093,9 @@ def keys( packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: tag_typespec, _ = self.prepared_stream.types( include_system_tags=include_system_tags ) @@ -1428,7 +1434,9 @@ def keys( packet_keys = tuple(self.pod.output_packet_types().keys()) return tag_keys, packet_keys - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: tag_typespec, _ = self.input_stream.types( include_system_tags=include_system_tags ) @@ -1554,7 +1562,9 @@ def keys( """ return self._stream.keys(include_system_tags=include_system_tags) - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. diff --git a/src/orcapod/hashing/function_info_extractors.py b/src/orcapod/hashing/function_info_extractors.py index 27cae33..0b5d448 100644 --- a/src/orcapod/hashing/function_info_extractors.py +++ b/src/orcapod/hashing/function_info_extractors.py @@ -1,7 +1,7 @@ from orcapod.protocols.hashing_protocols import FunctionInfoExtractor from collections.abc import Callable from typing import Any, Literal -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema import inspect @@ -14,8 +14,8 @@ def extract_function_info( self, func: Callable[..., Any], function_name: str | None = None, - input_typespec: TypeSpec | None = None, - output_typespec: TypeSpec | None = None, + input_typespec: PythonSchema | None = None, + output_typespec: PythonSchema | None = None, ) -> dict[str, Any]: if not callable(func): raise TypeError("Provided object is not callable") @@ -38,8 +38,8 @@ def extract_function_info( self, func: Callable[..., Any], function_name: str | None = None, - input_typespec: TypeSpec | None = None, - output_typespec: TypeSpec | None = None, + input_typespec: PythonSchema | None = None, + output_typespec: PythonSchema | None = None, ) -> dict[str, Any]: if not callable(func): raise TypeError("Provided object is not callable") diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 5b394da..0ab94ba 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -2,7 +2,7 @@ from orcapod.data.sources import SourceBase from orcapod.data.pods import ArrowDataStore, CachedPod from orcapod.protocols import data_protocols as dp -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING, Any from orcapod.data.system_constants import constants @@ -78,7 +78,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: def source_output_types( self, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ Return the output types of the node. This is used to determine the types of the output streams. 
diff --git a/src/orcapod/protocols/data_protocols/datagrams.py b/src/orcapod/protocols/data_protocols/datagrams.py index 50ded9e..41cd379 100644 --- a/src/orcapod/protocols/data_protocols/datagrams.py +++ b/src/orcapod/protocols/data_protocols/datagrams.py @@ -1,7 +1,7 @@ from collections.abc import Collection, Iterator, Mapping from typing import Any, Protocol, Self, TYPE_CHECKING, runtime_checkable from orcapod.protocols.hashing_protocols import ContentIdentifiable -from orcapod.types import DataValue, TypeSpec +from orcapod.types import DataValue, PythonSchema if TYPE_CHECKING: @@ -178,7 +178,7 @@ def types( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> TypeSpec: + ) -> PythonSchema: """ Return type specification mapping field names to Python types. @@ -658,7 +658,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_system_tags: bool = False, - ) -> TypeSpec: + ) -> PythonSchema: """ Return type specification mapping field names to Python types. @@ -901,7 +901,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> TypeSpec: + ) -> PythonSchema: """ Return type specification mapping field names to Python types. diff --git a/src/orcapod/protocols/data_protocols/kernel.py b/src/orcapod/protocols/data_protocols/kernel.py index a9b2a76..ee6b029 100644 --- a/src/orcapod/protocols/data_protocols/kernel.py +++ b/src/orcapod/protocols/data_protocols/kernel.py @@ -2,7 +2,7 @@ from datetime import datetime from typing import Any, Protocol, runtime_checkable from orcapod.protocols.hashing_protocols import ContentIdentifiable -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from orcapod.protocols.data_protocols.base import Labelable from orcapod.protocols.data_protocols.streams import Stream, LiveStream @@ -127,7 +127,7 @@ def forward(self, *streams: Stream) -> Stream: def output_types( self, *streams: Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ Determine output types without triggering computation. diff --git a/src/orcapod/protocols/data_protocols/pods.py b/src/orcapod/protocols/data_protocols/pods.py index 68fcb91..5c04f5c 100644 --- a/src/orcapod/protocols/data_protocols/pods.py +++ b/src/orcapod/protocols/data_protocols/pods.py @@ -3,7 +3,7 @@ from orcapod.protocols.data_protocols.base import ExecutionEngine from orcapod.protocols.data_protocols.datagrams import Packet, Tag from orcapod.protocols.data_protocols.kernel import Kernel -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema if TYPE_CHECKING: import pyarrow as pa @@ -51,7 +51,7 @@ def tiered_pod_id(self) -> dict[str, str]: """ ... - def input_packet_types(self) -> TypeSpec: + def input_packet_types(self) -> PythonSchema: """ TypeSpec for input packets that this Pod can process. @@ -71,7 +71,7 @@ def input_packet_types(self) -> TypeSpec: """ ... - def output_packet_types(self) -> TypeSpec: + def output_packet_types(self) -> PythonSchema: """ TypeSpec for output packets that this Pod produces. 
diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index 28437a3..e9fb268 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable -from orcapod.types import PathLike, TypeSpec +from orcapod.types import PathLike, PythonSchema if TYPE_CHECKING: import pyarrow as pa @@ -180,8 +180,8 @@ def extract_function_info( self, func: Callable[..., Any], function_name: str | None = None, - input_typespec: TypeSpec | None = None, - output_typespec: TypeSpec | None = None, + input_typespec: PythonSchema | None = None, + output_typespec: PythonSchema | None = None, exclude_function_signature: bool = False, exclude_function_body: bool = False, ) -> dict[str, Any]: ... diff --git a/src/orcapod/protocols/semantic_protocols.py b/src/orcapod/protocols/semantic_types_protocols.py similarity index 100% rename from src/orcapod/protocols/semantic_protocols.py rename to src/orcapod/protocols/semantic_types_protocols.py diff --git a/src/orcapod/semantic_types/__init__.py b/src/orcapod/semantic_types/__init__.py index 3d09b1b..123777f 100644 --- a/src/orcapod/semantic_types/__init__.py +++ b/src/orcapod/semantic_types/__init__.py @@ -1,9 +1,9 @@ from .semantic_registry import SemanticTypeRegistry from .universal_converter import UniversalTypeConverter -from .type_inference import infer_schema_from_pylist_data +from .type_inference import infer_python_schema_from_pylist_data __all__ = [ "SemanticTypeRegistry", "UniversalTypeConverter", - "infer_schema_from_pylist_data", + "infer_python_schema_from_pylist_data", ] diff --git a/src/orcapod/semantic_types/type_inference.py b/src/orcapod/semantic_types/type_inference.py index bc27a8d..3a55c03 100644 --- a/src/orcapod/semantic_types/type_inference.py +++ b/src/orcapod/semantic_types/type_inference.py @@ -1,10 +1,13 @@ -from typing import Any, Union, Optional, get_origin, get_args +from types import UnionType +from typing import Any, Union, get_origin, get_args +from orcapod.types import PythonSchema -def infer_schema_from_pylist_data( + +def infer_python_schema_from_pylist_data( data: list[dict], default_type: type = str, -) -> dict[str, type]: +) -> PythonSchema: """ Infer schema from sample data (best effort). @@ -21,7 +24,7 @@ def infer_schema_from_pylist_data( if not data: return {} - schema = {} + schema: PythonSchema = {} # Get all possible field names all_fields = [] @@ -54,14 +57,69 @@ def infer_schema_from_pylist_data( schema[field_name] = default_type | None elif has_none: # Wrap with Optional if None values present - schema[field_name] = inferred_type | None if inferred_type != Any else Any + # TODO: consider the case of Any + schema[field_name] = inferred_type | None + else: + schema[field_name] = inferred_type + + return schema + + +def infer_python_schema_from_pydict_data( + data: dict[str, list[Any]], + default_type: type = str, +) -> PythonSchema: + """ + Infer schema from columnar sample data (best effort). + + Args: + data: Dictionary mapping field names to lists of values + default_type: Default type to use for fields with no values + + Returns: + Dictionary mapping field names to inferred Python types + + Note: This is best-effort inference and may not handle all edge cases. + For production use, explicit schemas are recommended. 
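# [Illustrative sketch -- not part of the patch] What the renamed
# infer_python_schema_from_pylist_data returns for rows with None or missing values.
from orcapod.semantic_types import infer_python_schema_from_pylist_data

rows = [
    {"name": "a", "count": 1, "flag": None},
    {"name": "b", "count": None},          # "flag" missing entirely
]
schema = infer_python_schema_from_pylist_data(rows)
# {"name": str, "count": int | None, "flag": str | None}
#   "count" mixes int and None      -> int | None
#   "flag" only ever holds None     -> default_type (str) | None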
+ """ + if not data: + return {} + + schema: PythonSchema = {} + + # Infer type for each field + for field_name, field_values in data.items(): + if not field_values: + # Handle case where field has empty list + schema[field_name] = default_type | None + continue + + # Separate None and non-None values + non_none_values = [v for v in field_values if v is not None] + has_none = len(non_none_values) < len(field_values) + + if not non_none_values: + # Handle case where all values are None + schema[field_name] = default_type | None + continue + + # Infer type from non-None values + inferred_type = _infer_type_from_values(non_none_values) + + if inferred_type is None: + schema[field_name] = default_type | None + elif has_none: + # Wrap with Optional if None values present + # TODO: consider the case of Any + schema[field_name] = inferred_type | None else: schema[field_name] = inferred_type return schema -def _infer_type_from_values(values: list) -> type | None: +# TODO: reconsider this type hint -- use of Any effectively renders this type hint useless +def _infer_type_from_values(values: list) -> type | UnionType | Any | None: """Infer type from a list of non-None values.""" if not values: return None @@ -171,20 +229,20 @@ def _infer_dict_type(dicts: list[dict]) -> type: return dict[key_type, value_type] -def _handle_mixed_types(value_types: set, values: list) -> type: +def _handle_mixed_types(value_types: set, values: list) -> UnionType | Any: """Handle mixed types by creating appropriate Union types.""" # Handle common int/float mixing if value_types == {int, float}: - return Union[int, float] + return int | float # Handle numeric types with broader compatibility numeric_types = {int, float, complex} if value_types.issubset(numeric_types): if complex in value_types: - return Union[int, float, complex] + return int | float | complex else: - return Union[int, float] + return int | float # For small number of types, create Union if len(value_types) <= 4: # Arbitrary limit to avoid huge unions @@ -238,7 +296,7 @@ def test_schema_inference(): }, ] - schema = infer_schema_from_pylist_data(test_data) + schema = infer_python_schema_from_pylist_data(test_data) print("Inferred Schema:") for field, field_type in sorted(schema.items()): diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index 0433646..b07eb7f 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -17,7 +17,7 @@ import logging from orcapod.contexts import DataContext, resolve_context from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry -from orcapod.semantic_types.type_inference import infer_schema_from_pylist_data +from orcapod.semantic_types.type_inference import infer_python_schema_from_pylist_data # Handle generic types from typing import get_origin, get_args @@ -182,7 +182,7 @@ def python_dicts_to_struct_dicts( This uses the main conversion logic and caches results for performance. 
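# [Illustrative sketch -- not part of the patch] The PEP 604 unions now produced by
# _handle_mixed_types when a column mixes numeric types, replacing typing.Union.
from orcapod.semantic_types import infer_python_schema_from_pylist_data

rows = [{"value": 1}, {"value": 2.5}]
schema = infer_python_schema_from_pylist_data(rows)
assert schema["value"] == int | float   # previously typing.Union[int, float]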
""" if python_schema is None: - python_schema = infer_schema_from_pylist_data(python_dicts) + python_schema = infer_python_schema_from_pylist_data(python_dicts) converters = { field_name: self.get_python_to_arrow_converter(python_type) @@ -246,7 +246,7 @@ def python_dicts_to_arrow_table( ) if python_schema is None and arrow_schema is None: # Infer schema from data if not provided - python_schema = infer_schema_from_pylist_data(python_dicts) + python_schema = infer_python_schema_from_pylist_data(python_dicts) if arrow_schema is None: # Convert to Arrow schema diff --git a/src/orcapod/semantic_types/unused/semantic_converters.py b/src/orcapod/semantic_types/unused/semantic_converters.py index 6abb564..62fd2ec 100644 --- a/src/orcapod/semantic_types/unused/semantic_converters.py +++ b/src/orcapod/semantic_types/unused/semantic_converters.py @@ -3,7 +3,7 @@ import typing from collections.abc import Collection, Sequence, Mapping, Iterable, Set from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema # Basic type mapping for Python -> Arrow conversion @@ -217,7 +217,7 @@ def python_type_to_arrow( def python_schema_to_arrow( - python_schema: TypeSpec, semantic_registry: SemanticTypeRegistry | None = None + python_schema: PythonSchema, semantic_registry: SemanticTypeRegistry | None = None ) -> pa.Schema: """ Convert a Python schema (TypeSpec) to a PyArrow schema. @@ -387,7 +387,7 @@ def arrow_type_to_python( def arrow_schema_to_python( arrow_schema: pa.Schema, semantic_registry: SemanticTypeRegistry | None = None -) -> TypeSpec: +) -> PythonSchema: """ Convert a PyArrow schema to a Python schema (TypeSpec). diff --git a/src/orcapod/types.py b/src/orcapod/types.py index 22043de..32b87df 100644 --- a/src/orcapod/types.py +++ b/src/orcapod/types.py @@ -1,3 +1,4 @@ +from types import UnionType from typing import TypeAlias import os from collections.abc import Collection, Mapping @@ -6,9 +7,9 @@ logger = logging.getLogger(__name__) -DataType: TypeAlias = type +DataType: TypeAlias = type | UnionType -TypeSpec: TypeAlias = Mapping[ +PythonSchema: TypeAlias = dict[ str, DataType ] # Mapping of parameter names to their types diff --git a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 111937a..7d5376c 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -6,7 +6,6 @@ from typing import Any - from typing import TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule @@ -604,7 +603,9 @@ def prepare_prefixed_columns( # TODO: clean up the logic here if not isinstance(value, str) and isinstance(value, Collection): # TODO: this won't work other data types!!! 
- column_values = pa.array([value] * num_rows, type=pa.list_(pa.large_string())) + column_values = pa.array( + [value] * num_rows, type=pa.list_(pa.large_string()) + ) else: column_values = pa.array([value] * num_rows, type=pa.large_string()) # if col_name is in existing_source_info, use that column diff --git a/src/orcapod/utils/types_utils.py b/src/orcapod/utils/types_utils.py index 3f5df59..eff0fb7 100644 --- a/src/orcapod/utils/types_utils.py +++ b/src/orcapod/utils/types_utils.py @@ -2,7 +2,7 @@ from collections.abc import Callable, Collection, Sequence, Mapping from typing import get_origin, get_args, Any -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema import inspect import logging @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -def verify_against_typespec(packet: dict, typespec: TypeSpec) -> bool: +def verify_against_typespec(packet: dict, typespec: PythonSchema) -> bool: """Verify that the dictionary's types match the expected types in the typespec.""" from beartype.door import is_bearable @@ -36,7 +36,7 @@ def verify_against_typespec(packet: dict, typespec: TypeSpec) -> bool: # TODO: is_subhint does not handle invariance properly # so when working with mutable types, we have to make sure to perform deep copy def check_typespec_compatibility( - incoming_types: TypeSpec, receiving_types: TypeSpec + incoming_types: PythonSchema, receiving_types: PythonSchema ) -> bool: from beartype.door import is_subhint @@ -55,9 +55,9 @@ def check_typespec_compatibility( def extract_function_typespecs( func: Callable, output_keys: Collection[str], - input_typespec: TypeSpec | None = None, - output_typespec: TypeSpec | Sequence[type] | None = None, -) -> tuple[TypeSpec, TypeSpec]: + input_typespec: PythonSchema | None = None, + output_typespec: PythonSchema | Sequence[type] | None = None, +) -> tuple[PythonSchema, PythonSchema]: """ Extract input and output data types from a function signature. @@ -136,7 +136,7 @@ def extract_function_typespecs( >>> output_types {'count': , 'total': , 'repr': } """ - verified_output_types: TypeSpec = {} + verified_output_types: PythonSchema = {} if output_typespec is not None: if isinstance(output_typespec, dict): verified_output_types = output_typespec @@ -150,7 +150,7 @@ def extract_function_typespecs( signature = inspect.signature(func) - param_info: TypeSpec = {} + param_info: PythonSchema = {} for name, param in signature.parameters.items(): if input_typespec and name in input_typespec: param_info[name] = input_typespec[name] @@ -164,7 +164,7 @@ def extract_function_typespecs( ) return_annot = signature.return_annotation - inferred_output_types: TypeSpec = {} + inferred_output_types: PythonSchema = {} if return_annot is not inspect.Signature.empty and return_annot is not None: output_item_types = [] if len(output_keys) == 0: @@ -215,8 +215,8 @@ def extract_function_typespecs( def get_typespec_from_dict( - data: Mapping, typespec: TypeSpec | None = None, default=str -) -> TypeSpec: + data: Mapping, typespec: PythonSchema | None = None, default=str +) -> PythonSchema: """ Returns a TypeSpec for the given dictionary. The TypeSpec is a mapping from field name to Python type. 
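# [Illustrative sketch -- not part of the patch] The widened aliases from types.py in
# this patch (DataType = type | UnionType, PythonSchema = dict[str, DataType]) let a
# schema value be either a plain type or a PEP 604 union.
from types import UnionType

schema: dict[str, type | UnionType] = {"path": str, "count": int | None}
assert isinstance(schema["count"], UnionType)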
If typespec is provided, then @@ -240,7 +240,7 @@ def get_compatible_type(type1: Any, type2: Any) -> Any: raise TypeError(f"Types {type1} and {type2} are not compatible") -def union_typespecs(*typespecs: TypeSpec) -> TypeSpec: +def union_typespecs(*typespecs: PythonSchema) -> PythonSchema: # Merge the two TypeSpecs but raise an error if conflicts in types are found merged = dict(typespecs[0]) for typespec in typespecs[1:]: @@ -253,7 +253,7 @@ def union_typespecs(*typespecs: TypeSpec) -> TypeSpec: return merged -def intersection_typespecs(*typespecs: TypeSpec) -> TypeSpec: +def intersection_typespecs(*typespecs: PythonSchema) -> PythonSchema: """ Returns the intersection of all TypeSpecs, only returning keys that are present in all typespecs. If a key is present in both TypeSpecs, the type must be the same. From d4c016c0dc502a11934f0f6032ade26534dbd8f8 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 19:26:36 -0700 Subject: [PATCH 203/224] test: for semantic types --- .../protocols/data_protocols/streams.py | 10 +- src/orcapod/semantic_types/pydata_utils.py | 326 ++++++++++++++++++ .../semantic_types/semantic_registry.py | 169 +++------ src/orcapod/utils/dict_utils.py | 0 .../test_path_struct_converter.py | 115 ++++++ .../test_semantic_types/test_pydata_utils.py | 116 +++++++ .../test_semantic_registry.py | 200 +++++++++++ .../test_semantic_struct_converters.py | 116 +++++++ .../test_universal_converter.py | 300 ++++++++++++++++ .../test_extract_function_data_types.py | 2 +- 10 files changed, 1225 insertions(+), 129 deletions(-) create mode 100644 src/orcapod/semantic_types/pydata_utils.py delete mode 100644 src/orcapod/utils/dict_utils.py create mode 100644 tests/test_semantic_types/test_path_struct_converter.py create mode 100644 tests/test_semantic_types/test_pydata_utils.py create mode 100644 tests/test_semantic_types/test_semantic_registry.py create mode 100644 tests/test_semantic_types/test_semantic_struct_converters.py create mode 100644 tests/test_semantic_types/test_universal_converter.py diff --git a/src/orcapod/protocols/data_protocols/streams.py b/src/orcapod/protocols/data_protocols/streams.py index afec071..1a9dc25 100644 --- a/src/orcapod/protocols/data_protocols/streams.py +++ b/src/orcapod/protocols/data_protocols/streams.py @@ -5,7 +5,7 @@ from orcapod.protocols.data_protocols.base import ExecutionEngine, Labelable from orcapod.protocols.data_protocols.datagrams import Packet, Tag from orcapod.protocols.hashing_protocols import ContentIdentifiable -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema if TYPE_CHECKING: import polars as pl @@ -148,7 +148,9 @@ def packet_keys(self) -> tuple[str, ...]: """ ... - def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: """ Type specifications for the stream content. @@ -163,7 +165,7 @@ def types(self, include_system_tags: bool = False) -> tuple[TypeSpec, TypeSpec]: """ ... - def tag_types(self, include_system_tags: bool = False) -> TypeSpec: + def tag_types(self, include_system_tags: bool = False) -> PythonSchema: """ Type specifications for the stream content. @@ -178,7 +180,7 @@ def tag_types(self, include_system_tags: bool = False) -> TypeSpec: """ ... - def packet_types(self) -> TypeSpec: ... + def packet_types(self) -> PythonSchema: ... 
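# [Illustrative sketch -- not part of the patch] Semantics of the union/intersection
# helpers whose signatures were changed to PythonSchema above.
from orcapod.utils.types_utils import intersection_typespecs, union_typespecs

left = {"id": str, "score": float}
right = {"id": str, "label": str}

union_typespecs(left, right)         # {"id": str, "score": float, "label": str}
intersection_typespecs(left, right)  # {"id": str} -- only keys present in both

# A shared key with incompatible types (e.g. "id": str vs "id": int) raises TypeError
# via get_compatible_type.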
@property def last_modified(self) -> datetime | None: diff --git a/src/orcapod/semantic_types/pydata_utils.py b/src/orcapod/semantic_types/pydata_utils.py new file mode 100644 index 0000000..5acc020 --- /dev/null +++ b/src/orcapod/semantic_types/pydata_utils.py @@ -0,0 +1,326 @@ +# A collection of utility functions for working with Python lists of dictionaries and +# dictionary of lists + +from types import UnionType +from typing import Any, Union, get_origin, get_args +from orcapod.types import PythonSchema + + +def pylist_to_pydict(pylist: list[dict]) -> dict: + """ + Convert a list of dictionaries to a dictionary of lists (columnar format). + + This function transforms row-based data (list of dicts) to column-based data + (dict of lists), similar to converting from records format to columnar format. + Missing keys in individual dictionaries are filled with None values. + + Args: + pylist: List of dictionaries representing rows of data + + Returns: + Dictionary where keys are column names and values are lists of column data + + Example: + >>> data = [{'a': 1, 'b': 2}, {'a': 3, 'c': 4}] + >>> pylist_to_pydict(data) + {'a': [1, 3], 'b': [2, None], 'c': [None, 4]} + """ + result = {} + known_keys = set() + for i, d in enumerate(pylist): + known_keys.update(d.keys()) + for k in known_keys: + result.setdefault(k, [None] * i).append(d.get(k, None)) + return result + + +def pydict_to_pylist(pydict: dict) -> list[dict]: + """ + Convert a dictionary of lists (columnar format) to a list of dictionaries. + + This function transforms column-based data (dict of lists) to row-based data + (list of dicts), similar to converting from columnar format to records format. + All arrays in the input dictionary must have the same length. + + Args: + pydict: Dictionary where keys are column names and values are lists of column data + + Returns: + List of dictionaries representing rows of data + + Raises: + ValueError: If arrays in the dictionary have inconsistent lengths + + Example: + >>> data = {'a': [1, 3], 'b': [2, None], 'c': [None, 4]} + >>> pydict_to_pylist(data) + [{'a': 1, 'b': 2, 'c': None}, {'a': 3, 'b': None, 'c': 4}] + """ + if not pydict: + return [] + + # Check all arrays have same length + lengths = [len(v) for v in pydict.values()] + if not all(length == lengths[0] for length in lengths): + raise ValueError( + f"Inconsistent array lengths: {dict(zip(pydict.keys(), lengths))}" + ) + + num_rows = lengths[0] + if num_rows == 0: + return [] + + result = [] + keys = pydict.keys() + for i in range(num_rows): + row = {k: pydict[k][i] for k in keys} + result.append(row) + return result + + +def infer_python_schema_from_pylist_data( + data: list[dict], + default_type: type = str, +) -> PythonSchema: + """ + Infer schema from sample data (best effort). + + Args: + data: List of sample dictionaries + default_type: Default type to use for fields with no values + + Returns: + Dictionary mapping field names to inferred Python types + + Note: This is best-effort inference and may not handle all edge cases. + For production use, explicit schemas are recommended. 
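    For example, [{"a": 1}, {"a": None, "b": "x"}] is inferred (illustratively)
    as {"a": int | None, "b": str}.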
+ """ + if not data: + return {} + + schema = {} + + # Get all possible field names + all_fields = [] + for record in data: + all_fields.extend(record.keys()) + + all_fields = list(dict.fromkeys(all_fields)) # Remove duplicates + + # Infer type for each field + for field_name in all_fields: + # Get all values for this field (including None) + all_field_values = [ + record.get(field_name) for record in data if field_name in record + ] + + # Separate None and non-None values + non_none_values = [v for v in all_field_values if v is not None] + # check if there is at least one None value + has_none = len(non_none_values) < len(all_field_values) + + if not non_none_values: + # Handle case where all values are None + schema[field_name] = default_type | None + continue + + # Infer type from non-None values + inferred_type = _infer_type_from_values(non_none_values) + + if inferred_type is None: + schema[field_name] = default_type | None + elif has_none: + # Wrap with Optional if None values present + schema[field_name] = inferred_type | None if inferred_type != Any else Any + else: + schema[field_name] = inferred_type + + return schema + + +def infer_python_schema_from_pydict_data( + data: dict[str, list[Any]], + default_type: type = str, +) -> PythonSchema: + """ + Infer schema from columnar sample data (best effort). + + Args: + data: Dictionary mapping field names to lists of values + default_type: Default type to use for fields with no values + + Returns: + Dictionary mapping field names to inferred Python types + + Note: This is best-effort inference and may not handle all edge cases. + For production use, explicit schemas are recommended. + """ + if not data: + return {} + + schema: PythonSchema = {} + + # Infer type for each field + for field_name, field_values in data.items(): + if not field_values: + # Handle case where field has empty list + schema[field_name] = default_type | None + continue + + # Separate None and non-None values + non_none_values = [v for v in field_values if v is not None] + has_none = len(non_none_values) < len(field_values) + + if not non_none_values: + # Handle case where all values are None + schema[field_name] = default_type | None + continue + + # Infer type from non-None values + inferred_type = _infer_type_from_values(non_none_values) + + if inferred_type is None: + schema[field_name] = default_type | None + elif has_none: + # Wrap with Optional if None values present + # TODO: consider the case of Any + schema[field_name] = inferred_type | None + else: + schema[field_name] = inferred_type + + return schema + + +# TODO: reconsider this type hint -- use of Any effectively renders this type hint useless +def _infer_type_from_values(values: list) -> type | UnionType | Any | None: + """Infer type from a list of non-None values.""" + if not values: + return None + + # Get types of all values + value_types = {type(v) for v in values} + + if len(value_types) == 1: + # All values have same type + value_type = next(iter(value_types)) + return _infer_container_type(value_type, values) + else: + # Mixed types - handle common cases + return _handle_mixed_types(value_types, values) + + +def _infer_container_type(value_type: type, values: list) -> type: + """Infer container type with element types.""" + if value_type is list: + return _infer_list_type(values) + elif value_type is tuple: + return _infer_tuple_type(values) + elif value_type in {set, frozenset}: + return _infer_set_type(values, value_type) + elif value_type is dict: + return _infer_dict_type(values) + else: + 
return value_type + + +def _infer_list_type(lists: list[list]) -> type: + """Infer list element type.""" + all_elements = [] + for lst in lists: + all_elements.extend(lst) + + if not all_elements: + return list[Any] + + element_type = _infer_type_from_values(all_elements) + return list[element_type] + + +def _infer_tuple_type(tuples: list[tuple]) -> type: + """Infer tuple element types.""" + if not tuples: + return tuple[Any, ...] + + # Check if all tuples have same length + lengths = {len(t) for t in tuples} + + if len(lengths) == 1: + # Fixed-length tuples - infer type for each position + tuple_length = next(iter(lengths)) + if tuple_length == 0: + return tuple[()] + + position_types = [] + for i in range(tuple_length): + position_values = [t[i] for t in tuples if len(t) > i] + position_type = _infer_type_from_values(position_values) + position_types.append(position_type) + + # Always use fixed-length notation for same-length tuples + return tuple[tuple(position_types)] + else: + # Variable-length tuples - infer common element type + all_elements = [] + for t in tuples: + all_elements.extend(t) + + if not all_elements: + return tuple[Any, ...] + + element_type = _infer_type_from_values(all_elements) + return tuple[element_type, ...] + + +def _infer_set_type(sets: list, set_type: type) -> type: + """Infer set element type.""" + all_elements = [] + for s in sets: + all_elements.extend(s) + + if not all_elements: + return set_type[Any] # type: ignore[return-value] + + element_type = _infer_type_from_values(all_elements) + return set_type[element_type] # type: ignore[return-value] + + +def _infer_dict_type(dicts: list[dict]) -> type: + """Infer dictionary key and value types.""" + all_keys = [] + all_values = [] + + for d in dicts: + all_keys.extend(d.keys()) + all_values.extend(d.values()) + + if not all_keys or not all_values: + return dict[Any, Any] + + key_type = _infer_type_from_values(all_keys) + value_type = _infer_type_from_values(all_values) + + return dict[key_type, value_type] + + +def _handle_mixed_types(value_types: set, values: list) -> UnionType | Any: + """Handle mixed types by creating appropriate Union types.""" + + # Handle common int/float mixing + if value_types == {int, float}: + return int | float + + # Handle numeric types with broader compatibility + numeric_types = {int, float, complex} + if value_types.issubset(numeric_types): + if complex in value_types: + return int | float | complex + else: + return int | float + + # For small number of types, create Union + if len(value_types) <= 4: # Arbitrary limit to avoid huge unions + sorted_types = sorted(value_types, key=lambda t: t.__name__) + return Union[tuple(sorted_types)] + + # Too many types, fall back to Any + return Any diff --git a/src/orcapod/semantic_types/semantic_registry.py b/src/orcapod/semantic_types/semantic_registry.py index 28f16e2..ab334ba 100644 --- a/src/orcapod/semantic_types/semantic_registry.py +++ b/src/orcapod/semantic_types/semantic_registry.py @@ -1,8 +1,11 @@ from typing import Any, TYPE_CHECKING -from collections.abc import Collection -from orcapod.protocols.semantic_protocols import SemanticStructConverter +from collections.abc import Mapping +from orcapod.protocols.semantic_types_protocols import SemanticStructConverter from orcapod.utils.lazy_module import LazyModule +# from orcapod.semantic_types.type_inference import infer_python_schema_from_pylist_data +from orcapod.types import PythonSchema +from orcapod.semantic_types import pydata_utils if TYPE_CHECKING: import pyarrow as pa @@ 
-19,7 +22,21 @@ class SemanticTypeRegistry: struct schema alone. """ - def __init__(self, converters: Collection[SemanticStructConverter] | None = None): + @staticmethod + def infer_python_schema_from_pylist(data: list[dict[str, Any]]) -> PythonSchema: + """ + Infer Python schema from a list of dictionaries (pylist) + """ + return pydata_utils.infer_python_schema_from_pylist_data(data) + + @staticmethod + def infer_python_schema_from_pydict(data: dict[str, list[Any]]) -> PythonSchema: + # TODO: consider which data type is more efficient and use that pylist or pydict + return pydata_utils.infer_python_schema_from_pylist_data( + pydata_utils.pydict_to_pylist(data) + ) + + def __init__(self, converters: Mapping[str, SemanticStructConverter] | None = None): # Bidirectional mappings between Python types and struct signatures self._python_to_struct: dict[type, "pa.StructType"] = {} self._struct_to_python: dict["pa.StructType", type] = {} @@ -29,12 +46,13 @@ def __init__(self, converters: Collection[SemanticStructConverter] | None = None self._name_to_converter: dict[str, SemanticStructConverter] = {} self._struct_to_name: dict["pa.StructType", str] = {} + # If initialized with a list of converters, register them if converters: - for converter in converters: - self.register_converter(converter) + for semantic_type_name, converter in converters.items(): + self.register_converter(semantic_type_name, converter) def register_converter( - self, converter: SemanticStructConverter, semantic_name: str | None = None + self, semantic_type_name: str, converter: SemanticStructConverter ) -> None: """ Register a semantic type converter. @@ -65,25 +83,27 @@ def register_converter( f"Existing: {existing_python}, New: {python_type}" ) - if semantic_name in self._name_to_converter: - existing = self._name_to_converter[semantic_name] - if existing != converter: + # catch case where a different converter is already registered with the semantic type name + if existing_converter := self.get_converter_for_semantic_type( + semantic_type_name + ): + if existing_converter != converter: raise ValueError( - f"Semantic type name '{semantic_name}' already registered" + f"Semantic type name '{semantic_type_name}' is already registered to {existing_converter}" ) # Register bidirectional mappings self._python_to_struct[python_type] = struct_signature self._struct_to_python[struct_signature] = python_type self._struct_to_converter[struct_signature] = converter - if semantic_name is not None: - self._name_to_converter[semantic_name] = converter - self._struct_to_name[struct_signature] = semantic_name + + self._name_to_converter[semantic_type_name] = converter + self._struct_to_name[struct_signature] = semantic_type_name def get_converter_for_python_type( self, python_type: type ) -> SemanticStructConverter | None: - """Get converter for a Python type.""" + """Get converter registered to the Python type.""" # Direct lookup first struct_signature = self._python_to_struct.get(python_type) if struct_signature: @@ -107,16 +127,14 @@ def get_converter_for_python_type( def get_converter_for_semantic_type( self, semantic_type_name: str ) -> SemanticStructConverter | None: - """Get converter by semantic type name.""" + """Get converter registered to the semantic type name.""" return self._name_to_converter.get(semantic_type_name) def get_converter_for_struct_signature( self, struct_signature: "pa.StructType" ) -> SemanticStructConverter | None: """ - Get converter for an Arrow struct signature. 
- - This is the core method for struct signature recognition. + Get converter registered to the Arrow struct signature. """ return self._struct_to_converter.get(struct_signature) @@ -124,29 +142,27 @@ def get_python_type_for_semantic_struct_signature( self, struct_signature: "pa.StructType" ) -> type | None: """ - Get Python type for an Arrow struct signature. - - This enables automatic type inference from struct schemas. + Get Python type registered to the Arrow struct signature. """ return self._struct_to_python.get(struct_signature) def get_semantic_struct_signature_for_python_type( self, python_type: type ) -> "pa.StructType | None": - """Get Arrow struct signature for a Python type.""" + """Get Arrow struct signature registered to the Python type.""" return self._python_to_struct.get(python_type) - def is_semantic_struct_signature(self, struct_signature: "pa.StructType") -> bool: - """Check if a struct signature represents a semantic type.""" - return struct_signature in self._struct_to_python + def has_semantic_type(self, semantic_type_name: str) -> bool: + """Check if the semantic type name is registered.""" + return semantic_type_name in self._name_to_converter def has_python_type(self, python_type: type) -> bool: - """Check if a Python type is registered.""" + """Check if the Python type is registered.""" return python_type in self._python_to_struct - def has_semantic_type(self, semantic_type_name: str) -> bool: - """Check if a semantic type name is registered.""" - return semantic_type_name in self._name_to_converter + def has_semantic_struct_signature(self, struct_signature: "pa.StructType") -> bool: + """Check if the struct signature is registered.""" + return struct_signature in self._struct_to_python def list_semantic_types(self) -> list[str]: """Get all registered semantic type names.""" @@ -221,98 +237,3 @@ def validate_struct_signature( struct_signature ) return registered_type == expected_python_type - - -# # Conversion utilities using struct signature recognition -# class SemanticStructConverter: -# """Main converter class for working with semantic structs using signature recognition.""" - -# def __init__(self, registry: SemanticTypeRegistry): -# self.registry = registry - -# def python_to_struct_dict(self, value: Any) -> dict[str, Any] | None: -# """Convert Python value to struct dict if it's a semantic type.""" -# converter = self.registry.get_converter_for_python_type(type(value)) -# if converter: -# return converter.python_to_struct_dict(value) -# return None - -# def struct_dict_to_python( -# self, struct_dict: dict[str, Any], struct_signature: "pa.StructType" -# ) -> Any: -# """ -# Convert struct dict back to Python value using struct signature recognition. 
- -# Args: -# struct_dict: Dictionary representation of the struct -# struct_signature: PyArrow struct type signature - -# Returns: -# Python object corresponding to the semantic type -# """ -# converter = self.registry.get_converter_for_struct_signature(struct_signature) -# if not converter: -# raise ValueError( -# f"No converter found for struct signature: {struct_signature}" -# ) - -# return converter.struct_dict_to_python(struct_dict) - -# def is_semantic_struct_dict( -# self, struct_dict: dict[str, Any], struct_signature: "pa.StructType" -# ) -> bool: -# # FIXME: inconsistent implementation -- should check the passed in struct_dict -# """Check if a dict represents a semantic struct based on signature.""" -# return self.registry.is_semantic_struct_signature(struct_signature) - -# def get_semantic_type_from_struct_signature( -# self, struct_signature: "pa.StructType" -# ) -> str | None: -# """Extract semantic type name from struct signature.""" -# converter = self.registry.get_converter_for_struct_signature(struct_signature) -# return converter.semantic_type_name if converter else None - -# def python_to_arrow_array(self, values: list[Any]) -> "pa.Array": -# """Convert list of Python values to Arrow array of structs.""" -# if not values: -# raise ValueError("Cannot convert empty list") - -# # Check if first value is a semantic type -# first_converter = self.registry.get_converter_for_python_type(type(values[0])) -# if not first_converter: -# raise ValueError(f"No semantic type converter for {type(values[0])}") - -# # Convert all values to struct dicts -# struct_dicts = [] -# for value in values: -# converter = self.registry.get_converter_for_python_type(type(value)) -# if converter is None or converter != first_converter: -# raise ValueError("All values must be the same semantic type") -# struct_dicts.append(converter.python_to_struct_dict(value)) - -# # Create Arrow array with the registered struct signature -# return pa.array(struct_dicts, type=first_converter.arrow_struct_type) - -# # Create Arrow array with the registered struct signature -# return pa.array(struct_dicts, type=first_converter.arrow_struct_type) - -# def arrow_array_to_python(self, array: "pa.Array") -> list[Any]: -# """Convert Arrow struct array back to list of Python values.""" -# if not pa.types.is_struct(array.type): -# raise ValueError(f"Expected struct array, got {array.type}") - -# converter = self.registry.get_converter_for_struct_signature(array.type) -# if not converter: -# raise ValueError(f"No converter found for struct signature: {array.type}") - -# # Convert each struct to Python value -# python_values = [] -# for i in range(len(array)): -# struct_scalar = array[i] -# if struct_scalar.is_valid: -# struct_dict = struct_scalar.as_py() -# python_values.append(converter.struct_dict_to_python(struct_dict)) -# else: -# python_values.append(None) - -# return python_values diff --git a/src/orcapod/utils/dict_utils.py b/src/orcapod/utils/dict_utils.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py new file mode 100644 index 0000000..4e539d3 --- /dev/null +++ b/tests/test_semantic_types/test_path_struct_converter.py @@ -0,0 +1,115 @@ +from typing import cast +import pytest +from pathlib import Path +from unittest.mock import patch +from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + + +def test_path_to_struct_and_back(): + converter = PathStructConverter() 
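    # As exercised below, the converter represents a Path as a single-field
    # struct dict of the form {"path": "<string form of the path>"}, and
    # struct_dict_to_python rebuilds the Path object from that string.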
+ path_obj = Path("/tmp/test.txt") + struct_dict = converter.python_to_struct_dict(path_obj) + assert struct_dict["path"] == str(path_obj) + restored = converter.struct_dict_to_python(struct_dict) + assert restored == path_obj + + +def test_path_to_struct_invalid_type(): + converter = PathStructConverter() + with pytest.raises(TypeError): + converter.python_to_struct_dict("not_a_path") # type: ignore + + +def test_struct_to_python_missing_field(): + converter = PathStructConverter() + with pytest.raises(ValueError): + converter.struct_dict_to_python({}) + + +def test_can_handle_python_type(): + converter = PathStructConverter() + assert converter.can_handle_python_type(Path) + assert not converter.can_handle_python_type(str) + + +def test_can_handle_struct_type(): + converter = PathStructConverter() + struct_type = converter.arrow_struct_type + assert converter.can_handle_struct_type(struct_type) + + # Should fail for wrong fields + class FakeField: + def __init__(self, name, type): + self.name = name + self.type = type + + class FakeStructType(list): + pass + + import pyarrow as pa + + fake_struct = cast( + pa.StructType, FakeStructType([FakeField("wrong", struct_type[0].type)]) + ) + assert not converter.can_handle_struct_type(fake_struct) + + +def test_is_semantic_struct(): + converter = PathStructConverter() + assert converter.is_semantic_struct({"path": "/tmp/test.txt"}) + assert not converter.is_semantic_struct({"not_path": "value"}) + assert not converter.is_semantic_struct({"path": 123}) + + +def test_hash_struct_dict_file_not_found(tmp_path): + converter = PathStructConverter() + struct_dict = {"path": str(tmp_path / "does_not_exist.txt")} + with pytest.raises(FileNotFoundError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_permission_error(tmp_path): + converter = PathStructConverter() + file_path = tmp_path / "file.txt" + file_path.write_text("data") + with patch("pathlib.Path.read_bytes", side_effect=PermissionError): + struct_dict = {"path": str(file_path)} + with pytest.raises(PermissionError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_is_directory(tmp_path): + converter = PathStructConverter() + struct_dict = {"path": str(tmp_path)} + with pytest.raises(ValueError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_content_based(tmp_path): + converter = PathStructConverter() + file1 = tmp_path / "file1.txt" + file2 = tmp_path / "file2.txt" + content = "identical content" + file1.write_text(content) + file2.write_text(content) + struct_dict1 = {"path": str(file1)} + struct_dict2 = {"path": str(file2)} + hash1 = converter.hash_struct_dict(struct_dict1) + hash2 = converter.hash_struct_dict(struct_dict2) + assert hash1 == hash2 + + +def test_hash_path_objects_content_based(tmp_path): + converter = PathStructConverter() + file1 = tmp_path / "fileA.txt" + file2 = tmp_path / "fileB.txt" + content = "same file content" + file1.write_text(content) + file2.write_text(content) + path_obj1 = Path(file1) + path_obj2 = Path(file2) + struct_dict1 = converter.python_to_struct_dict(path_obj1) + struct_dict2 = converter.python_to_struct_dict(path_obj2) + hash1 = converter.hash_struct_dict(struct_dict1) + hash2 = converter.hash_struct_dict(struct_dict2) + assert hash1 == hash2 diff --git a/tests/test_semantic_types/test_pydata_utils.py b/tests/test_semantic_types/test_pydata_utils.py new file mode 100644 index 0000000..622a8fc --- /dev/null +++ b/tests/test_semantic_types/test_pydata_utils.py @@ -0,0 +1,116 @@ 
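# Sketch of the row/column conversions exercised below (values are illustrative):
#
#   pylist_to_pydict([{"a": 1, "b": 2}, {"a": 3}])
#       -> {"a": [1, 3], "b": [2, None]}
#   pydict_to_pylist({"a": [1, 3], "b": [2, None]})
#       -> [{"a": 1, "b": 2}, {"a": 3, "b": None}]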
+import pytest +from pathlib import Path, PosixPath +from orcapod.semantic_types import pydata_utils + + +def test_pylist_to_pydict_typical(): + data = [{"a": 1, "b": 2}, {"a": 3, "c": 4}] + result = pydata_utils.pylist_to_pydict(data) + assert result == {"a": [1, 3], "b": [2, None], "c": [None, 4]} + + +def test_pylist_to_pydict_missing_keys(): + data = [{"a": 1}, {"b": 2}, {"a": 3, "b": 4}] + result = pydata_utils.pylist_to_pydict(data) + assert result == {"a": [1, None, 3], "b": [None, 2, 4]} + + +def test_pylist_to_pydict_empty(): + assert pydata_utils.pylist_to_pydict([]) == {} + + +def test_pylist_to_pydict_empty_dicts(): + data = [{}, {}, {}] + assert pydata_utils.pylist_to_pydict(data) == {} + + +def test_pydict_to_pylist_typical(): + data = {"a": [1, 3], "b": [2, None], "c": [None, 4]} + result = pydata_utils.pydict_to_pylist(data) + assert result == [{"a": 1, "b": 2, "c": None}, {"a": 3, "b": None, "c": 4}] + + +def test_pydict_to_pylist_uneven_lengths(): + data = {"a": [1, 2], "b": [3]} + with pytest.raises(ValueError): + pydata_utils.pydict_to_pylist(data) + + +def test_pydict_to_pylist_empty(): + assert pydata_utils.pydict_to_pylist({}) == [] + + +def test_pydict_to_pylist_empty_lists(): + data = {"a": [], "b": []} + assert pydata_utils.pydict_to_pylist(data) == [] + + +def test_infer_python_schema_from_pylist_data_typical(): + data = [{"a": 1, "b": 2.0}, {"a": 3, "b": None}] + schema = pydata_utils.infer_python_schema_from_pylist_data(data) + assert schema["a"] in (int, int | None) + assert schema["b"] in (float | None, float) + + +def test_infer_python_schema_from_pylist_data_complex(): + data = [ + {"path": Path("/tmp/file1"), "size": 123}, + {"path": Path("/tmp/file2"), "size": None}, + ] + schema = pydata_utils.infer_python_schema_from_pylist_data(data) + assert schema["path"] in (Path, PosixPath) + assert schema["size"] == int | None + + +def test_infer_python_schema_from_pylist_data_empty(): + assert pydata_utils.infer_python_schema_from_pylist_data([]) == {} + + +def test_infer_python_schema_from_pylist_data_mixed_types(): + data = [{"a": 1}, {"a": "x"}, {"a": 2.5}] + schema = pydata_utils.infer_python_schema_from_pylist_data(data) + # Should be Union[int, float, str] or Any + assert "a" in schema + + +def test_infer_python_schema_from_pydict_data_typical(): + data = {"a": [1, 2], "b": [None, 3.5]} + schema = pydata_utils.infer_python_schema_from_pydict_data(data) + assert schema["a"] in (int, int | None) + assert schema["b"] in (float | None, float) + + +def test_infer_python_schema_from_pydict_data_empty(): + assert pydata_utils.infer_python_schema_from_pydict_data({}) == {} + + +def test_infer_python_schema_from_pydict_data_empty_lists(): + data = {"a": [], "b": []} + schema = pydata_utils.infer_python_schema_from_pydict_data(data) + assert schema["a"] == str | None + assert schema["b"] == str | None + + +def test_infer_python_schema_from_pydict_data_mixed_types(): + data = {"a": [1, "x", 2.5]} + schema = pydata_utils.infer_python_schema_from_pydict_data(data) + assert "a" in schema + + +def test_round_trip_pylist_pydict(): + data = [{"a": 1, "b": 2}, {"a": 3, "c": 4}] + pydict = pydata_utils.pylist_to_pydict(data) + pylist = pydata_utils.pydict_to_pylist(pydict) + # Should be equivalent to original data (order of keys may differ) + for orig, roundtrip in zip(data, pylist): + # Compare dicts for value equality, ignoring key order and missing keys + for k in orig: + assert orig[k] == roundtrip[k] + + +def test_round_trip_pydict_pylist(): + data = {"a": [1, 3], "b": 
[2, None], "c": [None, 4]} + pylist = pydata_utils.pydict_to_pylist(data) + pydict = pydata_utils.pylist_to_pydict(pylist) + for k in data: + assert pydict[k] == data[k] diff --git a/tests/test_semantic_types/test_semantic_registry.py b/tests/test_semantic_types/test_semantic_registry.py new file mode 100644 index 0000000..4c387a1 --- /dev/null +++ b/tests/test_semantic_types/test_semantic_registry.py @@ -0,0 +1,200 @@ +import pytest +from unittest.mock import Mock +from orcapod.semantic_types import semantic_registry + + +def test_registry_initialization(): + registry = semantic_registry.SemanticTypeRegistry() + assert registry.list_semantic_types() == [] + assert registry.list_python_types() == [] + assert registry.list_struct_signatures() == [] + + +def test_register_and_retrieve_converter(): + registry = semantic_registry.SemanticTypeRegistry() + python_type = Mock(name="PythonType") + struct_type = Mock(name="StructType") + converter = Mock() + converter.python_type = python_type + converter.arrow_struct_type = struct_type + registry.register_converter("mock_type", converter) + # Retrieve by semantic type name + assert registry.get_converter_for_semantic_type("mock_type") is converter + # Retrieve by python type + assert registry.get_converter_for_python_type(python_type) is converter + # Retrieve by struct signature + assert registry.get_converter_for_struct_signature(struct_type) is converter + + +def test_register_duplicate_semantic_type_raises(): + registry = semantic_registry.SemanticTypeRegistry() + python_type = Mock(name="PythonType") + struct_type = Mock(name="StructType") + converter1 = Mock() + converter1.python_type = python_type + converter1.arrow_struct_type = struct_type + registry.register_converter("mock_type", converter1) + converter2 = Mock() + converter2.python_type = python_type + converter2.arrow_struct_type = struct_type + with pytest.raises(ValueError): + registry.register_converter("mock_type", converter2) + + +def test_register_conflicting_python_type_raises(): + registry = semantic_registry.SemanticTypeRegistry() + python_type = Mock(name="PythonType") + struct_type1 = Mock(name="StructType1") + struct_type2 = Mock(name="StructType2") + converter1 = Mock() + converter1.python_type = python_type + converter1.arrow_struct_type = struct_type1 + registry.register_converter("mock_type1", converter1) + converter2 = Mock() + converter2.python_type = python_type + converter2.arrow_struct_type = struct_type2 + with pytest.raises(ValueError): + registry.register_converter("mock_type2", converter2) + + +def test_register_conflicting_struct_signature_raises(): + registry = semantic_registry.SemanticTypeRegistry() + python_type1 = Mock(name="PythonType1") + python_type2 = Mock(name="PythonType2") + struct_type = Mock(name="StructType") + converter1 = Mock() + converter1.python_type = python_type1 + converter1.arrow_struct_type = struct_type + registry.register_converter("mock_type1", converter1) + converter2 = Mock() + converter2.python_type = python_type2 + converter2.arrow_struct_type = struct_type + with pytest.raises(ValueError): + registry.register_converter("mock_type2", converter2) + + +def test_get_nonexistent_returns_none(): + registry = semantic_registry.SemanticTypeRegistry() + python_type = Mock(name="PythonType") + struct_type = Mock(name="StructType") + assert registry.get_converter_for_semantic_type("not_present") is None + assert registry.get_converter_for_python_type(python_type) is None + assert registry.get_converter_for_struct_signature(struct_type) 
is None + + +def test_list_registered_types(): + registry = semantic_registry.SemanticTypeRegistry() + python_type1 = Mock(name="PythonType1") + struct_type1 = Mock(name="StructType1") + converter1 = Mock() + converter1.python_type = python_type1 + converter1.arrow_struct_type = struct_type1 + registry.register_converter("mock_type1", converter1) + + python_type2 = Mock(name="PythonType2") + struct_type2 = Mock(name="StructType2") + converter2 = Mock() + converter2.python_type = python_type2 + converter2.arrow_struct_type = struct_type2 + registry.register_converter("mock_type2", converter2) + + assert set(registry.list_semantic_types()) == {"mock_type1", "mock_type2"} + assert set(registry.list_python_types()) == {python_type1, python_type2} + assert set(registry.list_struct_signatures()) == {struct_type1, struct_type2} + + +def test_has_methods(): + registry = semantic_registry.SemanticTypeRegistry() + python_type = Mock(name="PythonType") + struct_type = Mock(name="StructType") + converter = Mock() + converter.python_type = python_type + converter.arrow_struct_type = struct_type + registry.register_converter("mock_type", converter) + assert registry.has_semantic_type("mock_type") + assert registry.has_python_type(python_type) + assert registry.has_semantic_struct_signature(struct_type) + + +def test_integration_with_converter(): + registry = semantic_registry.SemanticTypeRegistry() + python_type = Mock(name="PythonType") + struct_type = Mock(name="StructType") + converter = Mock() + converter.python_type = python_type + converter.arrow_struct_type = struct_type + registry.register_converter("mock_type", converter) + retrieved = registry.get_converter_for_semantic_type("mock_type") + assert retrieved is converter + + +# Comprehensive unregister tests for future implementation +# Uncomment when unregister methods are implemented +# +# def test_unregister_by_semantic_type_name(): +# registry = semantic_registry.SemanticTypeRegistry() +# python_type = Mock(name="PythonType") +# struct_type = Mock(name="StructType") +# converter = Mock() +# converter.python_type = python_type +# converter.arrow_struct_type = struct_type +# registry.register_converter("mock_type", converter) +# result = registry.unregister_by_semantic_type_name("mock_type") +# assert result == {"mock_type": converter} +# assert not registry.has_semantic_type("mock_type") +# assert not registry.has_python_type(python_type) +# assert not registry.has_semantic_struct_signature(struct_type) +# assert registry.get_converter_for_semantic_type("mock_type") is None +# assert registry.get_converter_for_python_type(python_type) is None +# assert registry.get_converter_for_struct_signature(struct_type) is None +# +# def test_unregister_by_converter(): +# registry = semantic_registry.SemanticTypeRegistry() +# python_type = Mock(name="PythonType") +# struct_type = Mock(name="StructType") +# converter = Mock() +# converter.python_type = python_type +# converter.arrow_struct_type = struct_type +# registry.register_converter("mock_type", converter) +# result = registry.unregister_by_converter(converter) +# assert result == {"mock_type": converter} +# assert not registry.has_semantic_type("mock_type") +# assert not registry.has_python_type(python_type) +# assert not registry.has_semantic_struct_signature(struct_type) +# assert registry.get_converter_for_semantic_type("mock_type") is None +# assert registry.get_converter_for_python_type(python_type) is None +# assert registry.get_converter_for_struct_signature(struct_type) is None +# +# def 
test_unregister_by_python_type(): +# registry = semantic_registry.SemanticTypeRegistry() +# python_type = Mock(name="PythonType") +# struct_type = Mock(name="StructType") +# converter = Mock() +# converter.python_type = python_type +# converter.arrow_struct_type = struct_type +# registry.register_converter("mock_type", converter) +# result = registry.unregister_by_python_type(python_type) +# assert result == {"mock_type": converter} +# assert not registry.has_semantic_type("mock_type") +# assert not registry.has_python_type(python_type) +# assert not registry.has_semantic_struct_signature(struct_type) +# assert registry.get_converter_for_semantic_type("mock_type") is None +# assert registry.get_converter_for_python_type(python_type) is None +# assert registry.get_converter_for_struct_signature(struct_type) is None +# +# def test_unregister_by_struct_signature(): +# registry = semantic_registry.SemanticTypeRegistry() +# python_type = Mock(name="PythonType") +# struct_type = Mock(name="StructType") +# converter = Mock() +# converter.python_type = python_type +# converter.arrow_struct_type = struct_type +# registry.register_converter("mock_type", converter) +# result = registry.unregister_by_struct_signature(struct_type) +# assert result == {"mock_type": converter} +# assert not registry.has_semantic_type("mock_type") +# assert not registry.has_python_type(python_type) +# assert not registry.has_semantic_struct_signature(struct_type) +# assert registry.get_converter_for_semantic_type("mock_type") is None +# assert registry.get_converter_for_python_type(python_type) is None +# assert registry.get_converter_for_struct_signature(struct_type) is None diff --git a/tests/test_semantic_types/test_semantic_struct_converters.py b/tests/test_semantic_types/test_semantic_struct_converters.py new file mode 100644 index 0000000..66eab14 --- /dev/null +++ b/tests/test_semantic_types/test_semantic_struct_converters.py @@ -0,0 +1,116 @@ +from orcapod.semantic_types.semantic_struct_converters import ( + SemanticStructConverterBase, +) + + +class DummyConverter(SemanticStructConverterBase): + def __init__(self): + super().__init__("dummy") + self._python_type = dict + self._arrow_struct_type = "dummy_struct" + + @property + def python_type(self): + return self._python_type + + @property + def arrow_struct_type(self): + return self._arrow_struct_type + + def python_to_struct_dict(self, value): + return value + + def struct_dict_to_python(self, struct_dict): + return struct_dict + + def can_handle_python_type(self, python_type): + return python_type is dict + + def can_handle_struct_type(self, struct_type): + return struct_type == "dummy_struct" + + def is_semantic_struct(self, struct_dict): + return isinstance(struct_dict, dict) + + def hash_struct_dict(self, struct_dict, add_prefix=False): + return "dummyhash" + + +# --- SemanticStructConverterBase tests --- +def test_semantic_struct_converter_base_properties(): + converter = DummyConverter() + assert converter.semantic_type_name == "dummy" + assert converter.hasher_id == "dummy_content_sha256" + + +def test_format_hash_string(): + converter = DummyConverter() + hash_bytes = b"\x01\x02" + assert converter._format_hash_string(hash_bytes, add_prefix=False) == "0102" + assert ( + converter._format_hash_string(hash_bytes, add_prefix=True) + == "dummy:sha256:0102" + ) + + +def test_compute_content_hash(): + converter = DummyConverter() + data = b"abc" + result = converter._compute_content_hash(data) + import hashlib + + assert result == 
hashlib.sha256(data).digest() + + +# --- PathStructConverter tests --- + + +def test_extensibility_with_new_converter(): + class NewConverter(SemanticStructConverterBase): + def __init__(self): + super().__init__("newtype") + self._python_type = list + self._arrow_struct_type = "new_struct" + + @property + def python_type(self): + return self._python_type + + @property + def arrow_struct_type(self): + return self._arrow_struct_type + + def python_to_struct_dict(self, value): + return {"data": value} + + def struct_dict_to_python(self, struct_dict): + return struct_dict["data"] + + def can_handle_python_type(self, python_type): + return python_type is list + + def can_handle_struct_type(self, struct_type): + return struct_type == "new_struct" + + def is_semantic_struct(self, struct_dict): + return "data" in struct_dict + + def hash_struct_dict(self, struct_dict, add_prefix=False): + return "newhash" + + converter = NewConverter() + assert converter.semantic_type_name == "newtype" + assert converter.python_to_struct_dict([1, 2, 3]) == {"data": [1, 2, 3]} + assert converter.struct_dict_to_python({"data": [1, 2, 3]}) == [1, 2, 3] + assert converter.can_handle_python_type(list) + assert converter.can_handle_struct_type("new_struct") + assert converter.is_semantic_struct({"data": [1, 2, 3]}) + assert converter.hash_struct_dict({"data": [1, 2, 3]}) == "newhash" + + +# --- Edge cases --- +def test_dummy_converter_edge_cases(): + converter = DummyConverter() + assert converter.is_semantic_struct({}) + assert not converter.is_semantic_struct(None) + assert converter.hash_struct_dict({}) == "dummyhash" diff --git a/tests/test_semantic_types/test_universal_converter.py b/tests/test_semantic_types/test_universal_converter.py new file mode 100644 index 0000000..375a119 --- /dev/null +++ b/tests/test_semantic_types/test_universal_converter.py @@ -0,0 +1,300 @@ +from typing import cast +import pytest +import pyarrow as pa +import numpy as np +from pathlib import Path +from orcapod.semantic_types import universal_converter +from orcapod.contexts import get_default_context + + +def test_python_type_to_arrow_type_basic(): + assert universal_converter.python_type_to_arrow_type(int) == pa.int64() + assert universal_converter.python_type_to_arrow_type(float) == pa.float64() + assert universal_converter.python_type_to_arrow_type(str) == pa.large_string() + assert universal_converter.python_type_to_arrow_type(bool) == pa.bool_() + assert universal_converter.python_type_to_arrow_type(bytes) == pa.large_binary() + + +def test_python_type_to_arrow_type_numpy(): + assert universal_converter.python_type_to_arrow_type(np.int32) == pa.int32() + assert universal_converter.python_type_to_arrow_type(np.float64) == pa.float64() + assert universal_converter.python_type_to_arrow_type(np.bool_) == pa.bool_() + + +def test_python_type_to_arrow_type_custom(): + arrow_type = universal_converter.python_type_to_arrow_type(Path) + # Should be a StructType with field 'path' of type large_string + assert isinstance(arrow_type, pa.StructType) + assert len(arrow_type) == 1 + field = arrow_type[0] + assert field.name == "path" + assert field.type == pa.large_string() + + +def test_python_type_to_arrow_type_context(): + ctx = get_default_context() + assert universal_converter.python_type_to_arrow_type(int, ctx) == pa.int64() + + +def test_python_type_to_arrow_type_unsupported(): + class CustomType: + pass + + with pytest.raises(Exception): + universal_converter.python_type_to_arrow_type(CustomType) + + +def 
test_arrow_type_to_python_type_basic(): + assert universal_converter.arrow_type_to_python_type(pa.int64()) is int + assert universal_converter.arrow_type_to_python_type(pa.float64()) is float + assert universal_converter.arrow_type_to_python_type(pa.large_string()) is str + assert universal_converter.arrow_type_to_python_type(pa.bool_()) is bool + assert universal_converter.arrow_type_to_python_type(pa.large_binary()) is bytes + + +def test_arrow_type_to_python_type_context(): + ctx = get_default_context() + assert universal_converter.arrow_type_to_python_type(pa.int64(), ctx) is int + + +def test_arrow_type_to_python_type_unsupported(): + class FakeArrowType: + pass + + with pytest.raises(Exception): + universal_converter.arrow_type_to_python_type( + cast(pa.DataType, FakeArrowType()) + ) + + +def test_get_conversion_functions_basic(): + to_arrow, to_python = universal_converter.get_conversion_functions(int) + assert callable(to_arrow) + assert callable(to_python) + assert to_arrow(42) == 42 + assert to_python(42) == 42 + + +def test_get_conversion_functions_custom(): + to_arrow, to_python = universal_converter.get_conversion_functions(str) + assert to_arrow("abc") == "abc" + assert to_python("abc") == "abc" + + +def test_get_conversion_functions_context(): + ctx = get_default_context() + to_arrow, to_python = universal_converter.get_conversion_functions(float, ctx) + assert to_arrow(1.5) == 1.5 + assert to_python(1.5) == 1.5 + + +def test_python_type_to_arrow_type_list(): + # Unparameterized list should raise ValueError + with pytest.raises(ValueError): + universal_converter.python_type_to_arrow_type(list) + + +def test_python_type_to_arrow_type_dict(): + # Unparameterized dict should raise ValueError + with pytest.raises(ValueError): + universal_converter.python_type_to_arrow_type(dict) + + +def test_python_type_to_arrow_type_list_of_dict(): + # For list[dict[str, int]], expect LargeListType of LargeListType of StructType + arrow_type = universal_converter.python_type_to_arrow_type(list[dict[str, int]]) + # Should be LargeListType + assert arrow_type.__class__.__name__.endswith("ListType") + # Next level should also be LargeListType + arrow_type = cast(pa.ListType, arrow_type) + inner_list = arrow_type.value_type + assert inner_list.__class__.__name__.endswith("ListType") + # Innermost should be StructType + struct_type = inner_list.value_type + assert isinstance(struct_type, pa.StructType) + assert struct_type[0].name == "key" + assert struct_type[0].type == pa.large_string() + assert struct_type[1].name == "value" + assert struct_type[1].type == pa.int64() + + +def test_python_type_to_arrow_type_dict_of_list(): + # dict[str, list[int]] should be a LargeListType of StructType, with value field as LargeListType + arrow_type = universal_converter.python_type_to_arrow_type(dict[str, list[int]]) + assert arrow_type.__class__.__name__.endswith("ListType") + arrow_type = cast(pa.ListType, arrow_type) + struct_type = arrow_type.value_type + assert isinstance(struct_type, pa.StructType) + assert struct_type[0].name == "key" + assert struct_type[0].type == pa.large_string() + assert struct_type[1].name == "value" + value_type = struct_type[1].type + assert value_type.__class__.__name__.endswith("ListType") + assert value_type.value_type == pa.int64() + + +def test_python_type_to_arrow_type_list_of_list(): + arrow_type = universal_converter.python_type_to_arrow_type(list[list[int]]) + assert arrow_type.__class__.__name__.endswith("ListType") + arrow_type = cast(pa.ListType, arrow_type) + 
inner_list = arrow_type.value_type + assert inner_list.__class__.__name__.endswith("ListType") + assert inner_list.value_type == pa.int64() + + +def test_python_type_to_arrow_type_deeply_nested(): + # dict[str, list[list[dict[str, float]]]] + complex_type = dict[str, list[list[dict[str, float]]]] + arrow_type = universal_converter.python_type_to_arrow_type(complex_type) + # Should be a LargeListType of StructType + assert arrow_type.__class__.__name__.endswith("ListType") + arrow_type = cast(pa.ListType, arrow_type) + struct_type = arrow_type.value_type + assert isinstance(struct_type, pa.StructType) + assert struct_type[0].name == "key" + assert struct_type[0].type == pa.large_string() + assert struct_type[1].name == "value" + outer_list = struct_type[1].type + assert outer_list.__class__.__name__.endswith("ListType") + inner_list = outer_list.value_type + assert inner_list.__class__.__name__.endswith("ListType") + inner_struct_list = inner_list.value_type + assert inner_struct_list.__class__.__name__.endswith("ListType") + inner_struct = inner_struct_list.value_type + assert isinstance(inner_struct, pa.StructType) + assert inner_struct[0].name == "key" + assert inner_struct[0].type == pa.large_string() + assert inner_struct[1].name == "value" + assert inner_struct[1].type == pa.float64() + + +# Roundtrip tests for complex types +def test_roundtrip_list_of_int(): + py_val = [1, 2, 3, 4] + to_arrow, to_python = universal_converter.get_conversion_functions(list[int]) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + assert py_val == py_val2 + + +def test_roundtrip_dict_str_int(): + py_val = {"a": 1, "b": 2} + to_arrow, to_python = universal_converter.get_conversion_functions(dict[str, int]) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + # dict roundtrip may come back as dict or list of pairs + if isinstance(py_val2, dict): + assert py_val == py_val2 + else: + # Accept list of pairs + assert sorted(py_val.items()) == sorted( + [(d["key"], d["value"]) for d in py_val2] + ) + + +def test_roundtrip_list_of_list_of_float(): + py_val = [[1.1, 2.2], [3.3, 4.4]] + to_arrow, to_python = universal_converter.get_conversion_functions( + list[list[float]] + ) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + assert py_val == py_val2 + + +def test_roundtrip_set_of_int(): + py_val = {1, 2, 3} + to_arrow, to_python = universal_converter.get_conversion_functions(set[int]) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + # set will come back as list + assert py_val != py_val2 + assert set(py_val) == set(py_val2) + + +def test_roundtrip_various_complex_types(): + cases = [ + ([1, 2, 3], list[int]), + ([["a", "b"], ["c"]], list[list[str]]), + ({"a": 1, "b": 2}, dict[str, int]), + ([{"x": 1.1, "y": 2.2}, {"x": 3.3, "y": 4.4}], list[dict[str, float]]), + ({"a": [1, 2], "b": [3]}, dict[str, list[int]]), + ( + [{"a": [1, 2]}, {"b": [3], "c": [4, 5, 6]}], + list[dict[str, list[int]]], + ), + ( + [[{"k": "a", "v": 1.1}, {"k": "b", "v": 2.2}], [{"k": "c", "v": 3.3}]], + list[list[dict[str, float]]], + ), + ( + {"outer": [{"inner": [1, 2]}, {"inner": [3, 4]}]}, + dict[str, list[dict[str, list[int]]]], + ), + ({"a": {"b": {"c": 42}}}, dict[str, dict[str, dict[str, int]]]), + ({"a": None, "b": 2}, dict[str, int]), + ( + [{"x": [1, 2], "y": [3, 4]}, {"x": [5], "y": [6, 7]}], + list[dict[str, list[int]]], + ), + ] + for py_val, typ in cases: + to_arrow, to_python = universal_converter.get_conversion_functions(typ) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + assert py_val == py_val2, f"Failed 
roundtrip for type {typ} with value {py_val}" + + +def test_incomplete_roundtrip_types(): + cases = [({"a": {1, 2}, "b": {3}}, dict[str, set[int]], {"a": [1, 2], "b": [3]})] + + for py_val, typ, expected_return in cases: + to_arrow, to_python = universal_converter.get_conversion_functions(typ) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + assert py_val2 == expected_return, ( + f"Failed roundtrip for type {typ} with value {py_val}" + ) + + +def test_roundtrip_minimal_key_list_issue(): + py_val = [{"test": [1, 2, 3], "next": [3, 4]}] + typ = list[dict[str, list[int]]] + to_arrow, to_python = universal_converter.get_conversion_functions(typ) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + print("Original:", py_val) + print("Roundtrip:", py_val2) + assert py_val == py_val2 + + +def test_roundtrip_simpler_key_issue_dict_str_list(): + py_val = {"a": [1, 2]} + typ = dict[str, list[int]] + to_arrow, to_python = universal_converter.get_conversion_functions(typ) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + print("Original dict[str, list[int]]:", py_val) + print("Roundtrip:", py_val2) + assert py_val == py_val2 + + +def test_roundtrip_simpler_key_issue_list_dict_str_int(): + py_val = [{"key": "a", "value": 1}] + typ = list[dict[str, int]] + to_arrow, to_python = universal_converter.get_conversion_functions(typ) + arr = to_arrow(py_val) + py_val2 = to_python(arr) + print("Original list[dict[str, int]]:", py_val) + print("Roundtrip:", py_val2) + assert py_val == py_val2 + + +def test_inspect_arrow_schema_dict_str_list(): + py_val = {"test": [1, 2]} + typ = dict[str, list[int]] + arrow_type = universal_converter.python_type_to_arrow_type(typ) + print("Arrow type for dict[str, list[int]]:", arrow_type) + to_arrow_struct, to_python = universal_converter.get_conversion_functions(typ) + arr = to_arrow_struct(py_val) + assert arr == [{"key": "test", "value": [1, 2]}] diff --git a/tests/test_types/test_inference/test_extract_function_data_types.py b/tests/test_types/test_inference/test_extract_function_data_types.py index 8ae1ea5..dc20b90 100644 --- a/tests/test_types/test_inference/test_extract_function_data_types.py +++ b/tests/test_types/test_inference/test_extract_function_data_types.py @@ -11,7 +11,7 @@ import pytest from collections.abc import Collection -from orcapod.types.typespec_utils import extract_function_typespecs +from orcapod.utils.types_utils import extract_function_typespecs class TestExtractFunctionDataTypes: From e67631a3ceb571e613ef83ff8af3702cc31dccd1 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 26 Aug 2025 19:27:33 -0700 Subject: [PATCH 204/224] refactor: remove unused code in semantic types --- .../unused/complete_converter_test.py | 686 --------- .../unused/python_arrow_types.py | 1235 ----------------- src/orcapod/semantic_types/unused/schemas.py | 357 ----- .../unused/semantic_converters.py | 1005 -------------- .../semantic_types/unused/struct_types.py | 312 ----- .../semantic_types/unused/table_converters.py | 362 ----- 6 files changed, 3957 deletions(-) delete mode 100644 src/orcapod/semantic_types/unused/complete_converter_test.py delete mode 100644 src/orcapod/semantic_types/unused/python_arrow_types.py delete mode 100644 src/orcapod/semantic_types/unused/schemas.py delete mode 100644 src/orcapod/semantic_types/unused/semantic_converters.py delete mode 100644 src/orcapod/semantic_types/unused/struct_types.py delete mode 100644 src/orcapod/semantic_types/unused/table_converters.py diff --git a/src/orcapod/semantic_types/unused/complete_converter_test.py b/src/orcapod/semantic_types/unused/complete_converter_test.py deleted file mode 100644 index 52ff4d1..0000000 --- a/src/orcapod/semantic_types/unused/complete_converter_test.py +++ /dev/null @@ -1,686 +0,0 @@ -""" -Comprehensive test suite for Python Type Hint ↔ PyArrow Type Converter -with full semantic type support. - -This test suite validates: -- Basic type conversions -- Complex nested structures -- Set handling with deterministic ordering -- Dictionary representations -- Semantic type integration -- Schema inference -- Round-trip conversion fidelity -- Error handling and edge cases -""" - -from unittest.mock import Mock -import pyarrow as pa -import typing -from typing import Any, Optional, Union -from collections.abc import Collection, Sequence, Set, Mapping -from pathlib import Path -import tempfile -import uuid -from datetime import datetime, date -import json - -# Import the converter functions -# (In real usage, these would be imported from your module) -from orcapod.semantic_types import SemanticTypeRegistry -from orcapod.semantic_types.semantic_converters import ( - python_type_to_arrow, - arrow_type_to_python, - python_dicts_to_arrow_table, - arrow_table_to_python_dicts, - infer_schema_from_data, - dict_to_arrow_list, - arrow_list_to_dict, - arrow_list_to_set, -) - - -# Mock Semantic Type System for Testing -class MockSemanticRegistry: - """Mock semantic registry that supports Path, UUID, and custom types.""" - - def __init__(self): - self.converters = { - Path: MockPathConverter(), - uuid.UUID: MockUUIDConverter(), # Use uuid.UUID directly - "CustomData": MockCustomDataConverter(), - } - - def get_converter_for_python_type(self, python_type): - # Handle direct type lookups - if python_type in self.converters: - return self.converters[python_type] - - # Handle subclass relationships - add safety check - for registered_type, converter in self.converters.items(): - try: - if ( - isinstance(registered_type, type) - and isinstance(python_type, type) - and issubclass(python_type, registered_type) - ): - return converter - except TypeError: - # Handle cases where issubclass fails (e.g., with generic types) - continue - - # Handle string-based lookups (for custom types) - type_name = getattr(python_type, "__name__", str(python_type)) - if type_name in self.converters: - return self.converters[type_name] - - return None - - def get_converter_for_struct_type(self, struct_type): - if not pa.types.is_struct(struct_type): - return None - - field_names = {f.name for f in struct_type} - - # Path struct 
detection - if field_names == {"semantic_type", "path"}: - return self.converters[Path] - - # UUID struct detection - if field_names == {"semantic_type", "uuid_str"}: - return self.converters[uuid.UUID] - - # Custom data struct detection - if field_names == {"semantic_type", "data", "metadata"}: - return self.converters["CustomData"] - - return None - - -class MockPathConverter: - """Mock converter for pathlib.Path objects.""" - - @property - def semantic_type_name(self) -> str: - return "path" - - @property - def python_type(self): - return Path - - @property - def arrow_struct_type(self): - return pa.struct([("semantic_type", pa.string()), ("path", pa.large_string())]) - - def python_to_struct_dict(self, value): - if not isinstance(value, Path): - raise TypeError(f"Expected Path, got {type(value)}") - return {"semantic_type": self.semantic_type_name, "path": str(value)} - - def struct_dict_to_python(self, struct_dict): - if struct_dict.get("semantic_type") != self.semantic_type_name: - raise ValueError("Not a path semantic type") - return Path(struct_dict["path"]) - - def can_handle_python_type(self, python_type: type) -> bool: - return python_type is Path - - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: - return pa.types.is_struct(struct_type) and set(f.name for f in struct_type) == { - "semantic_type", - "path", - } - - -class MockUUIDConverter: - """Mock converter for UUID objects.""" - - @property - def semantic_type_name(self) -> str: - return "uuid" - - @property - def python_type(self) -> type: - return uuid.UUID - - @property - def arrow_struct_type(self): - return pa.struct([("semantic_type", pa.string()), ("uuid_str", pa.string())]) - - def python_to_struct_dict(self, value): - if not isinstance(value, uuid.UUID): - raise TypeError(f"Expected UUID, got {type(value)}") - return {"semantic_type": self.semantic_type_name, "uuid_str": str(value)} - - def struct_dict_to_python(self, struct_dict): - if struct_dict.get("semantic_type") != self.semantic_type_name: - raise ValueError("Not a uuid semantic type") - return uuid.UUID(struct_dict["uuid_str"]) - - def can_handle_python_type(self, python_type: type) -> bool: - return python_type is uuid.UUID - - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: - return pa.types.is_struct(struct_type) and set(f.name for f in struct_type) == { - "semantic_type", - "uuid_str", - } - - -class CustomData: - """Custom data class for testing complex semantic types.""" - - def __init__(self, data: dict, metadata: dict | None = None): - self.data = data - self.metadata = metadata or {} - - def __eq__(self, other): - if not isinstance(other, CustomData): - return False - return self.data == other.data and self.metadata == other.metadata - - def __repr__(self): - return f"CustomData(data={self.data}, metadata={self.metadata})" - - -class MockCustomDataConverter: - """Mock converter for CustomData objects.""" - - @property - def semantic_type_name(self) -> str: - return "custom_data" - - @property - def python_type(self) -> type: - return CustomData - - @property - def arrow_struct_type(self): - return pa.struct( - [ - ("semantic_type", pa.string()), - ("data", pa.large_string()), # JSON serialized - ("metadata", pa.large_string()), # JSON serialized - ] - ) - - def python_to_struct_dict(self, value): - if not isinstance(value, CustomData): - raise TypeError(f"Expected CustomData, got {type(value)}") - return { - "semantic_type": self.semantic_type_name, - "data": json.dumps(value.data), - "metadata": 
json.dumps(value.metadata), - } - - def struct_dict_to_python(self, struct_dict): - if struct_dict.get("semantic_type") != self.semantic_type_name: - raise ValueError("Not a custom_data semantic type") - - data = json.loads(struct_dict["data"]) - metadata = json.loads(struct_dict["metadata"]) - return CustomData(data, metadata) - - def can_handle_python_type(self, python_type: type) -> bool: - return python_type is CustomData - - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: - return pa.types.is_struct(struct_type) and set(f.name for f in struct_type) == { - "semantic_type", - "data", - "metadata", - } - - -def run_comprehensive_tests(): - """Run comprehensive test suite for the type converter.""" - - print("🚀 COMPREHENSIVE PYTHON ↔ ARROW TYPE CONVERTER TEST SUITE") - print("=" * 80) - - # Initialize mock semantic registry - # semantic_registry = MockSemanticRegistry() - semantic_registry = SemanticTypeRegistry() - semantic_registry.register_converter(MockPathConverter()) - semantic_registry.register_converter(MockUUIDConverter()) - semantic_registry.register_converter(MockCustomDataConverter()) - - # Test counters - total_tests = 0 - passed_tests = 0 - - def test_case(name: str, test_func): - """Helper to run individual test cases.""" - nonlocal total_tests, passed_tests - total_tests += 1 - - print(f"\n📋 Testing: {name}") - try: - test_func() - print(f" ✅ PASSED") - passed_tests += 1 - except Exception as e: - print(f" ❌ FAILED: {e}") - import traceback - - traceback.print_exc() - - # Test 1: Basic Type Conversions - def test_basic_types(): - basic_tests = [ - (int, pa.int64()), - (str, pa.large_string()), - (float, pa.float64()), - (bool, pa.bool_()), - (bytes, pa.large_binary()), - ] - - for python_type, expected_arrow_type in basic_tests: - result = python_type_to_arrow(python_type) - assert result == expected_arrow_type, ( - f"Expected {expected_arrow_type}, got {result}" - ) - - # Test reverse conversion - recovered_type = arrow_type_to_python(result) - assert recovered_type == python_type, ( - f"Round-trip failed: {python_type} -> {result} -> {recovered_type}" - ) - - test_case("Basic Type Conversions", test_basic_types) - - # Test 2: Complex Nested Structures - def test_complex_nested(): - complex_tests = [ - # Nested dictionaries - dict[str, dict[str, int]], - # Mixed tuples with complex types (remove set[float] as it gets converted to list[float]) - tuple[dict[str, int], list[str]], - # Deep nesting - list[dict[str, list[tuple[int, str]]]], - # Complex mappings - dict[str, tuple[list[int], list[str]]], # Changed set[str] to list[str] - ] - - for complex_type in complex_tests: - arrow_type = python_type_to_arrow(complex_type) - recovered_type = arrow_type_to_python(arrow_type) - assert recovered_type == complex_type, ( - f"Complex round-trip failed: {complex_type} -> {arrow_type} -> {recovered_type}" - ) - - test_case("Complex Nested Structures", test_complex_nested) - - # Test 3: Set Handling with Deterministic Ordering - def test_set_deterministic_ordering(): - # Test data with sets that should be sorted deterministically - set_data = [ - {"tags": {3, 1, 4, 1, 5}, "name": "Alice"}, # Duplicate should be removed - {"tags": {9, 2, 6, 5, 3}, "name": "Bob"}, - {"tags": {"python", "arrow", "data"}, "name": "Charlie"}, # String set - ] - - # Test with numeric sets - numeric_schema = {"tags": set[int], "name": str} - table1 = python_dicts_to_arrow_table(set_data[:2], numeric_schema) - result1 = arrow_table_to_python_dicts(table1) - - # Verify deterministic 
ordering (should be sorted) - assert result1[0]["tags"] == [1, 3, 4, 5], ( - f"Expected sorted [1, 3, 4, 5], got {result1[0]['tags']}" - ) - assert result1[1]["tags"] == [2, 3, 5, 6, 9], ( - f"Expected sorted [2, 3, 5, 6, 9], got {result1[1]['tags']}" - ) - - # Test with string sets - string_schema = {"tags": set[str], "name": str} - table2 = python_dicts_to_arrow_table([set_data[2]], string_schema) - result2 = arrow_table_to_python_dicts(table2) - - # Verify alphabetical ordering - note: sets become lists in round-trip - expected_sorted = sorted(["python", "arrow", "data"]) - assert result2[0]["tags"] == expected_sorted, ( - f"Expected {expected_sorted}, got {result2[0]['tags']}" - ) - - # Test that we can convert back to set if needed - recovered_set = set(result2[0]["tags"]) - original_set = {"python", "arrow", "data"} - assert recovered_set == original_set, "Set contents should be preserved" - - test_case("Set Deterministic Ordering", test_set_deterministic_ordering) - - # Test 4: Abstract Collection Types - def test_abstract_collections(): - abstract_tests = [ - (Collection[int], pa.large_list(pa.int64())), - (Sequence[str], pa.large_list(pa.large_string())), - (Set[float], pa.large_list(pa.float64())), - ( - Mapping[str, int], - pa.large_list( - pa.struct([("key", pa.large_string()), ("value", pa.int64())]) - ), - ), - ] - - for python_type, expected_arrow_type in abstract_tests: - result = python_type_to_arrow(python_type) - assert result == expected_arrow_type, ( - f"Abstract type conversion failed for {python_type}" - ) - - test_case("Abstract Collection Types", test_abstract_collections) - - # Test 5: Semantic Type Integration - def test_semantic_types(): - # Create test data with various semantic types - test_uuid = uuid.uuid4() - custom_data = CustomData( - data={"key": "value", "count": 42}, - metadata={"created": "2024-01-01", "version": 1}, - ) - - semantic_data = [ - { - "id": 1, - "name": "Alice", - "file_path": Path("/home/alice/data.csv"), - "unique_id": test_uuid, - "custom": custom_data, - "tags": ["analysis", "data"], - }, - { - "id": 2, - "name": "Bob", - "file_path": Path("/home/bob/results.json"), - "unique_id": uuid.uuid4(), - "custom": CustomData({"type": "result"}, {"source": "experiment"}), - "tags": ["results", "ml"], - }, - ] - - semantic_schema = { - "id": int, - "name": str, - "file_path": Path, - "unique_id": uuid.UUID, # Use uuid.UUID directly - "custom": CustomData, - "tags": list[str], - } - - # Convert to Arrow table with semantic types - table = python_dicts_to_arrow_table( - semantic_data, semantic_schema, semantic_registry - ) - - # Verify schema contains semantic struct types - schema_types = {field.name: field.type for field in table.schema} - assert pa.types.is_struct(schema_types["file_path"]), ( - "Path should be converted to struct" - ) - assert pa.types.is_struct(schema_types["unique_id"]), ( - "UUID should be converted to struct" - ) - assert pa.types.is_struct(schema_types["custom"]), ( - "CustomData should be converted to struct" - ) - - # Test round-trip conversion - recovered_data = arrow_table_to_python_dicts(table, semantic_registry) - - # Verify semantic types were properly reconstructed - assert isinstance(recovered_data[0]["file_path"], Path), ( - "Path not properly recovered" - ) - assert isinstance(recovered_data[0]["unique_id"], uuid.UUID), ( - "UUID not properly recovered" - ) - assert isinstance(recovered_data[0]["custom"], CustomData), ( - "CustomData not properly recovered" - ) - - # Verify values are correct - assert 
str(recovered_data[0]["file_path"]) == "/home/alice/data.csv" - assert recovered_data[0]["unique_id"] == test_uuid - assert recovered_data[0]["custom"] == custom_data - - test_case("Semantic Type Integration", test_semantic_types) - - # Test 6: Schema Inference - def test_schema_inference(): - # Test data with mixed types for inference - make sure data matches what we expect - inference_data = [ - { - "name": "Alice", - "age": 25, - "scores": [95, 87, 92], - "active": True, - "metadata": {"role": "admin", "level": "5"}, # Make level a string - "tags": {"python", "data", "ml"}, - }, - { - "name": "Bob", - "age": 30, - "scores": [78, 85], - "active": False, - "metadata": {"role": "user", "level": "2"}, # Make level a string - "tags": {"javascript", "web"}, - }, - ] - - # Test inference without semantic types - inferred_schema = infer_schema_from_data(inference_data) - print(f"Inferred schema: {inferred_schema}") - - expected_types = { - "name": str, - "age": int, - "scores": list[int], - "active": bool, - "metadata": dict[str, str], # Now all values are strings - "tags": set[str], - } - - for field, expected_type in expected_types.items(): - assert field in inferred_schema, f"Field {field} not in inferred schema" - # For complex types, just check the origin matches - if hasattr(expected_type, "__origin__"): - assert inferred_schema[field].__origin__ == expected_type.__origin__, ( - f"Field {field}: expected {expected_type.__origin__}, got {inferred_schema[field].__origin__}" - ) - else: - assert inferred_schema[field] == expected_type, ( - f"Field {field}: expected {expected_type}, got {inferred_schema[field]}" - ) - - # Test table creation with inferred schema - table = python_dicts_to_arrow_table(inference_data) # No explicit schema - recovered = arrow_table_to_python_dicts(table) - - # Verify basic round-trip works - assert len(recovered) == 2 - assert recovered[0]["name"] == "Alice" - assert recovered[0]["age"] == 25 - assert recovered[0]["metadata"]["role"] == "admin" - - test_case("Schema Inference", test_schema_inference) - - # Test 7: Optional and Union Types - def test_optional_union_types(): - # Test Optional types - optional_data = [ - {"name": "Alice", "middle_name": "Marie", "age": 25}, - {"name": "Bob", "middle_name": None, "age": 30}, # None value - ] - - optional_schema = { - "name": str, - "middle_name": Optional[str], # Should handle None values - "age": int, - } - - table = python_dicts_to_arrow_table(optional_data, optional_schema) - recovered = arrow_table_to_python_dicts(table) - - assert recovered[0]["middle_name"] == "Marie" - assert recovered[1]["middle_name"] is None # None should be preserved - - test_case("Optional and Union Types", test_optional_union_types) - - # Test 8: Error Handling and Edge Cases - def test_error_handling(): - # Test with empty data - try: - python_dicts_to_arrow_table([]) - assert False, "Should raise error for empty data" - except ValueError as e: - assert "empty data list" in str(e) - - # Test with unsupported type - try: - python_type_to_arrow(complex) # complex numbers not supported - assert False, "Should raise error for unsupported type" - except ValueError: - pass # Expected - - # Test with mismatched data and schema - this should fail gracefully - mismatch_data = [{"name": "Alice", "age": "twenty-five"}] # age as string - mismatch_schema = {"name": str, "age": int} # expects int - - # This should raise an error due to type mismatch - try: - table = python_dicts_to_arrow_table(mismatch_data, mismatch_schema) - assert False, "Should 
raise error for type mismatch" - except (ValueError, pa.ArrowInvalid) as e: - # Expected - conversion should fail for incompatible types - assert "convert" in str(e).lower() or "invalid" in str(e).lower() - - test_case("Error Handling and Edge Cases", test_error_handling) - - # Test 9: Large Data Performance Test - def test_large_data_performance(): - import time - - # Generate larger dataset - large_data = [] - for i in range(1000): - large_data.append( - { - "id": i, - "name": f"User_{i}", - "scores": [i % 100, (i * 2) % 100, (i * 3) % 100], - "metadata": { - "group": str(i % 10), - "active": str(i % 2 == 0), - }, # Convert to strings - "tags": {f"tag_{i % 5}", f"category_{i % 3}"}, - } - ) - - schema = { - "id": int, - "name": str, - "scores": list[int], - "metadata": dict[str, str], # Change from Any to str - "tags": set[str], - } - - # Time the conversion - start_time = time.time() - table = python_dicts_to_arrow_table(large_data, schema) - conversion_time = time.time() - start_time - - # Time the round-trip - start_time = time.time() - recovered = arrow_table_to_python_dicts(table) - recovery_time = time.time() - start_time - - print(f" 📊 Performance: {len(large_data)} records") - print(f" Conversion: {conversion_time:.3f}s") - print(f" Recovery: {recovery_time:.3f}s") - - # Verify correctness on sample - assert len(recovered) == 1000 - assert recovered[0]["id"] == 0 - assert recovered[999]["id"] == 999 - assert isinstance(recovered[0]["tags"], list) # Sets become lists - - test_case("Large Data Performance", test_large_data_performance) - - # Test 10: File I/O Round-trip Test - def test_file_io_roundtrip(): - # Test saving to and loading from Parquet file - test_data = [ - { - "name": "Alice", - "path": Path("/tmp/alice.txt"), - "scores": {"math": 95, "english": 87}, - "tags": {"student", "honor_roll"}, - }, - { - "name": "Bob", - "path": Path("/tmp/bob.txt"), - "scores": {"math": 78, "english": 92}, - "tags": {"student", "debate_team"}, - }, - ] - - schema = {"name": str, "path": Path, "scores": dict[str, int], "tags": set[str]} - - # Convert to Arrow table - table = python_dicts_to_arrow_table(test_data, schema, semantic_registry) - - # Write to temporary Parquet file - with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f: - temp_path = f.name - - try: - # Write to Parquet - import pyarrow.parquet as pq - - pq.write_table(table, temp_path) - - # Read back from Parquet - loaded_table = pq.read_table(temp_path) - - # Convert back to Python - recovered_data = arrow_table_to_python_dicts( - loaded_table, semantic_registry - ) - - # Verify data integrity - assert len(recovered_data) == 2 - assert isinstance(recovered_data[0]["path"], Path) - assert str(recovered_data[0]["path"]) == "/tmp/alice.txt" - assert recovered_data[0]["scores"]["math"] == 95 - - print(f" 💾 Successfully wrote and read {temp_path}") - - finally: - # Clean up - import os - - if os.path.exists(temp_path): - os.unlink(temp_path) - - test_case("File I/O Round-trip", test_file_io_roundtrip) - - # Print final results - print("\n" + "=" * 80) - print(f"🏁 TEST RESULTS: {passed_tests}/{total_tests} tests passed") - - if passed_tests == total_tests: - print("🎉 ALL TESTS PASSED! The converter is working perfectly.") - else: - failed = total_tests - passed_tests - print(f"⚠️ {failed} test(s) failed. 
Please review the failures above.") - - print("=" * 80) - - return passed_tests == total_tests - - -if __name__ == "__main__": - success = run_comprehensive_tests() - exit(0 if success else 1) diff --git a/src/orcapod/semantic_types/unused/python_arrow_types.py b/src/orcapod/semantic_types/unused/python_arrow_types.py deleted file mode 100644 index e85e98f..0000000 --- a/src/orcapod/semantic_types/unused/python_arrow_types.py +++ /dev/null @@ -1,1235 +0,0 @@ -import pyarrow as pa -from typing import get_origin, get_args, Any -import typing -from collections.abc import Collection, Sequence, Mapping, Iterable, Set -import sys - -# Basic type mapping for Python -> Arrow conversion -_PYTHON_TO_ARROW_MAP = { - # Python built-ins - int: pa.int64(), - float: pa.float64(), - str: pa.large_string(), # Use large_string by default for Polars compatibility - bool: pa.bool_(), - bytes: pa.large_binary(), # Use large_binary by default for Polars compatibility - # String representations (for when we get type names as strings) - "int": pa.int64(), - "float": pa.float64(), - "str": pa.large_string(), - "bool": pa.bool_(), - "bytes": pa.large_binary(), - # Specific integer types - "int8": pa.int8(), - "int16": pa.int16(), - "int32": pa.int32(), - "int64": pa.int64(), - "uint8": pa.uint8(), - "uint16": pa.uint16(), - "uint32": pa.uint32(), - "uint64": pa.uint64(), - # Specific float types - "float32": pa.float32(), - "float64": pa.float64(), - # Date/time types - "date": pa.date32(), - "datetime": pa.timestamp("us"), - "timestamp": pa.timestamp("us"), -} - -# Reverse mapping for Arrow -> Python conversion (handles both regular and large variants) -_ARROW_TO_PYTHON_MAP = { - # Integer types - pa.int8(): int, - pa.int16(): int, - pa.int32(): int, - pa.int64(): int, - pa.uint8(): int, - pa.uint16(): int, - pa.uint32(): int, - pa.uint64(): int, - # Float types - pa.float32(): float, - pa.float64(): float, - # String types (both regular and large) - pa.string(): str, - pa.large_string(): str, - # Boolean - pa.bool_(): bool, - # Binary types (both regular and large) - pa.binary(): bytes, - pa.large_binary(): bytes, -} - -# Add numpy types if available -try: - import numpy as np - - _PYTHON_TO_ARROW_MAP.update( - { - np.int8: pa.int8(), - np.int16: pa.int16(), - np.int32: pa.int32(), - np.int64: pa.int64(), - np.uint8: pa.uint8(), - np.uint16: pa.uint16(), - np.uint32: pa.uint32(), - np.uint64: pa.uint64(), - np.float32: pa.float32(), - np.float64: pa.float64(), - np.bool_: pa.bool_(), - } - ) -except ImportError: - pass - - -def python_type_to_arrow(type_hint, semantic_registry=None) -> pa.DataType: - """ - Convert Python type hints to PyArrow data types. 
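# [Editor's note: illustrative sketch, not part of this patch.] The conventions
# documented in this module can be reproduced directly with pyarrow: str maps to
# large_string, bytes to large_binary, and dict[str, int] to a large_list of
# key/value structs (large_* variants are preferred for Polars compatibility).
import pyarrow as pa

dict_repr = pa.large_list(pa.struct([("key", pa.large_string()), ("value", pa.int64())]))
assert pa.types.is_large_list(dict_repr) and pa.types.is_struct(dict_repr.value_type)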
- - Args: - type_hint: Python type hint to convert - semantic_registry: Optional semantic type registry to check for semantic types - - Examples: - list[int] -> pa.large_list(pa.int64()) - tuple[int, int] -> pa.list_(pa.int64(), 2) - tuple[int, str] -> pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) - dict[str, int] -> pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) - """ - - # Handle basic types first - if type_hint in _PYTHON_TO_ARROW_MAP: - return _PYTHON_TO_ARROW_MAP[type_hint] - - # Check if this is a registered semantic type - if semantic_registry and hasattr( - semantic_registry, "get_converter_for_python_type" - ): - converter = semantic_registry.get_converter_for_python_type(type_hint) - if converter: - return converter.arrow_struct_type - - # Get the origin (e.g., list, tuple, dict) and args (e.g., int, str) - origin = get_origin(type_hint) - args = get_args(type_hint) - - if origin is None: - # Handle non-generic types that might not be in basic map - if hasattr(type_hint, "__name__"): - type_name = type_hint.__name__ - if type_name in _PYTHON_TO_ARROW_MAP: - return _PYTHON_TO_ARROW_MAP[type_name] - raise ValueError(f"Unsupported type: {type_hint}") - - # Handle list types - if origin is list: - if len(args) != 1: - raise ValueError( - f"list type must have exactly one type argument, got: {args}" - ) - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.large_list(element_type) # Use large_list for Polars compatibility - - # Handle tuple types - elif origin is tuple: - if len(args) == 0: - raise ValueError("Empty tuple type not supported") - - # Check if all elements are the same type - if len(set(args)) == 1: - # Homogeneous tuple: tuple[int, int, int] -> fixed-size list - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.list_( - element_type, len(args) - ) # Fixed-size lists are always regular lists - else: - # Heterogeneous tuple: tuple[int, str] -> struct with indexed fields - fields = [] - for i, arg_type in enumerate(args): - field_type = python_type_to_arrow(arg_type, semantic_registry) - fields.append((f"f{i}", field_type)) - return pa.struct(fields) - - # Handle dict types - elif origin is dict: - if len(args) != 2: - raise ValueError( - f"dict type must have exactly two type arguments, got: {args}" - ) - key_type = python_type_to_arrow(args[0], semantic_registry) - value_type = python_type_to_arrow(args[1], semantic_registry) - - # Use large_list> representation for better compatibility - # This works reliably across Arrow, Polars, Parquet, etc. 
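# [Editor's note: sketch only, not part of this patch.] Under the key/value
# representation described in the comment above, a plain dict is flattened into
# a list of {"key": ..., "value": ...} structs before the Arrow array is built:
import pyarrow as pa

kv_type = pa.large_list(pa.struct([("key", pa.large_string()), ("value", pa.int64())]))
scores = {"math": 95, "english": 87}
arr = pa.array([[{"key": k, "value": v} for k, v in scores.items()]], type=kv_type)
assert arr.to_pylist()[0][0] == {"key": "math", "value": 95}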
- key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) - return pa.large_list(key_value_struct) - - # Handle abstract base classes and collections - elif origin in {Collection, Sequence, Iterable}: - # Treat as list - most common concrete implementation - if len(args) != 1: - raise ValueError( - f"{origin.__name__} type must have exactly one type argument, got: {args}" - ) - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.large_list(element_type) - - elif origin is Set or origin is set: - # Sets -> lists (Arrow doesn't have native set type) - if len(args) != 1: - raise ValueError( - f"set type must have exactly one type argument, got: {args}" - ) - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.large_list(element_type) - - elif origin is Mapping: - # Mapping -> dict representation - if len(args) != 2: - raise ValueError( - f"Mapping type must have exactly two type arguments, got: {args}" - ) - key_type = python_type_to_arrow(args[0], semantic_registry) - value_type = python_type_to_arrow(args[1], semantic_registry) - key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) - return pa.large_list(key_value_struct) - elif origin is typing.Union: - # Handle Optional[T] which is Union[T, NoneType] - if len(args) == 2 and type(None) in args: - # This is Optional[T] - non_none_type = args[0] if args[1] is type(None) else args[1] - base_type = python_type_to_arrow(non_none_type, semantic_registry) - # PyArrow handles nullability at the field level, so we just return the base type - return base_type - else: - # Complex unions - convert to a union type - union_types = [python_type_to_arrow(arg, semantic_registry) for arg in args] - # PyArrow union types are complex - for now, just use the first type as fallback - # TODO: Implement proper union support when needed - return union_types[0] # Simplified - take first type - - else: - raise ValueError(f"Unsupported generic type: {origin}") - - -# TODO: change back the return type to `type` -def arrow_type_to_python(arrow_type: pa.DataType) -> Any: - """ - Convert PyArrow data types back to Python type hints. 
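# [Editor's note: minimal sketch, not part of this patch.] The reverse direction
# implemented below hinges on recognising that key/value pattern; a standalone
# check (the function name here is ours, not the module's) looks like this:
import pyarrow as pa

def looks_like_dict(t: pa.DataType) -> bool:
    # list/large_list whose elements are structs with exactly 'key' and 'value' fields
    if not (pa.types.is_list(t) or pa.types.is_large_list(t)):
        return False
    elem = t.value_type
    return pa.types.is_struct(elem) and {f.name for f in elem} == {"key", "value"}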
- - Args: - arrow_type: PyArrow data type to convert - - Returns: - Python type annotation - - Examples: - pa.int64() -> int - pa.large_list(pa.large_string()) -> list[str] - pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) -> dict[str, int] - pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) -> tuple[int, str] - - Raises: - TypeError: If the Arrow type cannot be converted to a Python type - """ - - # Handle basic types - if arrow_type in _ARROW_TO_PYTHON_MAP: - return _ARROW_TO_PYTHON_MAP[arrow_type] - - # Check by Arrow type categories - if pa.types.is_integer(arrow_type): - return int - elif pa.types.is_floating(arrow_type): - return float - elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): - return str - elif pa.types.is_boolean(arrow_type): - return bool - elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): - return bytes - - # Handle complex types - elif ( - pa.types.is_list(arrow_type) - or pa.types.is_large_list(arrow_type) - or pa.types.is_fixed_size_list(arrow_type) - ): - element_type = arrow_type.value_type - - # Check if this is a dict representation: list> - if pa.types.is_struct(element_type): - field_names = [field.name for field in element_type] - - # Dict pattern: must have exactly 'key' and 'value' fields - if set(field_names) == {"key", "value"}: - # Find key and value types - key_field = next(f for f in element_type if f.name == "key") - value_field = next(f for f in element_type if f.name == "value") - - key_python_type = arrow_type_to_python(key_field.type) - value_python_type = arrow_type_to_python(value_field.type) - - return dict[key_python_type, value_python_type] - - # Regular list - element_python_type = arrow_type_to_python(element_type) - - # Check if this is a fixed-size list (homogeneous tuple representation) - if pa.types.is_fixed_size_list(arrow_type): - # Fixed-size list -> homogeneous tuple - size = arrow_type.list_size - return tuple[tuple(element_python_type for _ in range(size))] - else: - # Variable-size list -> list - return list[element_python_type] - - elif pa.types.is_struct(arrow_type): - # Check if this is a heterogeneous tuple representation - field_names = [field.name for field in arrow_type] - - # Tuple pattern: fields named f0, f1, f2, etc. - if all(name.startswith("f") and name[1:].isdigit() for name in field_names): - # Sort by field index to maintain order - sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) - field_types = [arrow_type_to_python(field.type) for field in sorted_fields] - return tuple[tuple(field_types)] - else: - # TODO: Could support NamedTuple or dataclass conversion here - raise TypeError( - f"Cannot convert struct type to Python type hint. " - f"Struct has fields: {field_names}. " - f"Only tuple-like structs (f0, f1, ...) are supported." - ) - - elif pa.types.is_map(arrow_type): - # Handle pa.map_ types (though we prefer list representation) - key_python_type = arrow_type_to_python(arrow_type.key_type) - value_python_type = arrow_type_to_python(arrow_type.item_type) - return dict[key_python_type, value_python_type] - - elif pa.types.is_union(arrow_type): - # Handle union types -> Union[T1, T2, ...] 
- import typing - - # Get the child types from the union - child_types = [] - for i in range(arrow_type.num_fields): - child_field = arrow_type[i] - child_types.append(arrow_type_to_python(child_field.type)) - - if len(child_types) == 2 and type(None) in child_types: - # This is Optional[T] - non_none_type = next(t for t in child_types if t is not type(None)) - return typing.Optional[non_none_type] - else: - return typing.Union[tuple(child_types)] - - else: - raise TypeError( - f"Cannot convert Arrow type '{arrow_type}' to Python type hint. " - f"Supported types: int, float, str, bool, bytes, list, large_list, fixed_size_list, tuple, dict, struct, map, union. " - f"Arrow type category: {arrow_type}" - ) - - -def parse_type_string(type_string: str): - """ - Parse a type hint from a string representation. - Useful when you have type hints as strings. - - Example: - parse_type_string("list[int]") -> pa.large_list(pa.int64()) - """ - # This is a simplified version - for production use, consider using ast.literal_eval - # or a proper type hint parser - try: - # Try to evaluate the string as a type hint - # Note: This uses eval which can be dangerous - use with trusted input only - import typing - - namespace = { - "list": list, - "tuple": tuple, - "dict": dict, - "int": int, - "str": str, - "float": float, - "bool": bool, - "bytes": bytes, - "Optional": typing.Optional, - "Union": typing.Union, - } - type_hint = eval(type_string, {"__builtins__": {}}, namespace) - return python_type_to_arrow(type_hint) - except Exception as e: - raise ValueError(f"Could not parse type string '{type_string}': {e}") - - -def infer_schema_from_data(data: list[dict]) -> dict[str, type]: - """ - Infer schema from sample data (best effort). - - Args: - data: List of sample dictionaries - - Returns: - Dictionary mapping field names to inferred Python types - - Note: This is best-effort inference and may not handle all edge cases. - For production use, explicit schemas are recommended. 
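# [Editor's note: illustrative only, not part of this patch.] The best-effort
# inference sketched in this docstring boils down to inspecting the value types
# observed for each field, e.g. for a list-valued column:
sample = [{"name": "Alice", "scores": [95, 87]}, {"name": "Bob", "scores": [78]}]
elem_types = {type(v) for row in sample for v in row["scores"]}
inferred = list[next(iter(elem_types))] if len(elem_types) == 1 else list
assert inferred == list[int]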
- """ - if not data: - return {} - - schema = {} - - # Get all possible field names - all_fields = set() - for record in data: - all_fields.update(record.keys()) - - # Infer type for each field - for field_name in all_fields: - field_values = [ - record.get(field_name) - for record in data - if field_name in record and record[field_name] is not None - ] - - if not field_values: - schema[field_name] = Any # No non-null values found - continue - - # Get types of all values - value_types = {type(v) for v in field_values} - - if len(value_types) == 1: - # All values have same type - value_type = next(iter(value_types)) - - # For containers, try to infer element types - if value_type is list and field_values: - # Infer list element type from first non-empty list - for lst in field_values: - if lst: # non-empty list - element_types = {type(elem) for elem in lst} - if len(element_types) == 1: - element_type = next(iter(element_types)) - schema[field_name] = list[element_type] - else: - schema[field_name] = list[Any] # Mixed types - break - else: - schema[field_name] = list[Any] # All lists empty - - elif value_type in {set, frozenset} and field_values: - # Infer set element type from first non-empty set - for s in field_values: - if s: # non-empty set - element_types = {type(elem) for elem in s} - if len(element_types) == 1: - element_type = next(iter(element_types)) - schema[field_name] = set[element_type] - else: - schema[field_name] = set[Any] # Mixed types - break - else: - schema[field_name] = set[Any] # All sets empty - - elif value_type is dict and field_values: - # Infer dict types from first non-empty dict - for d in field_values: - if d: # non-empty dict - key_types = {type(k) for k in d.keys()} - value_types = {type(v) for v in d.values()} - - if len(key_types) == 1 and len(value_types) == 1: - key_type = next(iter(key_types)) - val_type = next(iter(value_types)) - schema[field_name] = dict[key_type, val_type] - else: - schema[field_name] = dict[Any, Any] # Mixed types - break - else: - schema[field_name] = dict[Any, Any] # All dicts empty - - else: - schema[field_name] = value_type - - else: - # Mixed types - use Union or Any - schema[field_name] = Any - - return schema - - -def arrow_list_to_set(lst: list) -> set: - """Convert Arrow list back to Python set (removes duplicates).""" - return set(lst) if lst is not None else set() - - -def dict_to_arrow_list(d: dict) -> list[dict]: - """Convert Python dict to Arrow-compatible list of key-value structs.""" - return [{"key": k, "value": v} for k, v in d.items()] - - -def arrow_list_to_dict(lst: list[dict]) -> dict: - """Convert Arrow list of key-value structs back to Python dict.""" - return {item["key"]: item["value"] for item in lst if item is not None} - - -def python_dicts_to_arrow_table( - data: list[dict], schema: dict[str, type] | None = None -) -> pa.Table: - """ - Convert list of Python dictionaries to PyArrow table with proper type conversion. 
- - Args: - data: List of Python dictionaries - schema: Dictionary mapping field names to Python type hints - - Returns: - PyArrow table with proper types - - Examples: - data = [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] - schema = {"x": int, "y": list[int]} - -> PyArrow table with x: int64, y: large_list - - data = [{"name": "Alice", "scores": {"math": 95, "english": 87}}] - schema = {"name": str, "scores": dict[str, int]} - -> PyArrow table with name: large_string, scores: large_list> - """ - if not data: - raise ValueError("Cannot create table from empty data list") - - if not schema: - raise ValueError("Schema cannot be empty") - - # Convert schema to Arrow schema - arrow_fields = [] - for field_name, python_type in schema.items(): - arrow_type = python_type_to_arrow(python_type) - arrow_fields.append(pa.field(field_name, arrow_type)) - - arrow_schema = pa.schema(arrow_fields) - - # Convert data with proper type transformations - converted_data = [] - for record in data: - converted_record = {} - for field_name, python_type in schema.items(): - value = record.get(field_name) - if value is not None: - converted_value = _convert_python_value_for_arrow(value, python_type) - converted_record[field_name] = converted_value - else: - converted_record[field_name] = None - converted_data.append(converted_record) - - # Create table with explicit schema - try: - table = pa.table(converted_data, schema=arrow_schema) - return table - except Exception as e: - # Fallback: create each column separately - arrays = [] - for field in arrow_schema: - field_name = field.name - field_type = field.type - - # Extract column data - column_data = [record.get(field_name) for record in converted_data] - - # Create array with explicit type - array = pa.array(column_data, type=field_type) - arrays.append(array) - - return pa.table(arrays, schema=arrow_schema) - - -def _convert_python_value_for_arrow(value, python_type): - """ - Convert a Python value to Arrow-compatible format based on expected type. 
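# [Editor's note: illustrative, not part of this patch.] The most consequential
# conversion performed here is turning sets into deterministically ordered lists,
# since Arrow has no native set type (mirroring the ordering tests earlier in
# this patch):
tags = {3, 1, 4, 1, 5}
assert sorted(tags) == [1, 3, 4, 5]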
- - Args: - value: Python value to convert - python_type: Expected Python type hint - - Returns: - Value in Arrow-compatible format - """ - origin = get_origin(python_type) - args = get_args(python_type) - - # Handle basic types - no conversion needed - if python_type in {int, float, str, bool, bytes} or origin is None: - return value - - # Handle Optional types - if origin is typing.Union and len(args) == 2 and type(None) in args: - if value is None: - return None - non_none_type = args[0] if args[1] is type(None) else args[1] - return _convert_python_value_for_arrow(value, non_none_type) - - # Handle abstract collections - elif origin is list or origin in {Collection, Sequence, Iterable}: - if not isinstance(value, (list, tuple)): - raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") - element_type = args[0] if args else Any - return [_convert_python_value_for_arrow(item, element_type) for item in value] - - # Handle set types - elif origin is set or origin is Set: - if not isinstance(value, (set, frozenset, list, tuple)): - raise TypeError( - f"Expected set/list/tuple for {python_type}, got {type(value)}" - ) - element_type = args[0] if args else Any - - # Convert set to sorted list for deterministic ordering - if isinstance(value, (set, frozenset)): - try: - # Sort if elements are comparable - value_list = sorted(value) - except TypeError: - # If elements aren't comparable (e.g., mixed types), convert to list as-is - # This maintains some order but isn't guaranteed to be deterministic - value_list = list(value) - else: - # Already a list/tuple, keep as-is - value_list = list(value) - - return [ - _convert_python_value_for_arrow(item, element_type) for item in value_list - ] - - # Handle mapping types - elif origin is dict or origin is Mapping: - if not isinstance(value, dict): - raise TypeError(f"Expected dict for {python_type}, got {type(value)}") - - key_type, value_type = (args[0], args[1]) if len(args) >= 2 else (Any, Any) - # Convert dict to list of key-value structs - key_value_list = [] - for k, v in value.items(): - converted_key = _convert_python_value_for_arrow(k, key_type) - converted_value = _convert_python_value_for_arrow(v, value_type) - key_value_list.append({"key": converted_key, "value": converted_value}) - return key_value_list - - # Handle tuple types - elif origin is tuple: - if not isinstance(value, (list, tuple)): - raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") - - if len(set(args)) == 1: - # Homogeneous tuple - convert to list - element_type = args[0] - return [ - _convert_python_value_for_arrow(item, element_type) for item in value - ] - else: - # Heterogeneous tuple - convert to struct dict - if len(value) != len(args): - raise ValueError( - f"Tuple length mismatch: expected {len(args)}, got {len(value)}" - ) - struct_dict = {} - for i, (item, item_type) in enumerate(zip(value, args)): - struct_dict[f"f{i}"] = _convert_python_value_for_arrow(item, item_type) - return struct_dict - - # Handle dict types - elif origin is dict: - if not isinstance(value, dict): - raise TypeError(f"Expected dict for {python_type}, got {type(value)}") - - key_type, value_type = args - # Convert dict to list of key-value structs - key_value_list = [] - for k, v in value.items(): - converted_key = _convert_python_value_for_arrow(k, key_type) - converted_value = _convert_python_value_for_arrow(v, value_type) - key_value_list.append({"key": converted_key, "value": converted_value}) - return key_value_list - - else: - # For 
unsupported types, return as-is and let Arrow handle it - return value - - -def arrow_table_to_python_dicts(table: pa.Table) -> list[dict]: - """ - Convert PyArrow table back to list of Python dictionaries with proper type conversion. - - Args: - table: PyArrow table to convert - - Returns: - List of Python dictionaries with proper Python types - - Examples: - Arrow table with x: int64, y: large_list - -> [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] - - Arrow table with scores: large_list> - -> [{"name": "Alice", "scores": {"math": 95, "english": 87}}] - """ - # Convert table to list of raw dictionaries - raw_dicts = table.to_pylist() - - # Convert each dictionary with proper type transformations - converted_dicts = [] - for raw_dict in raw_dicts: - converted_dict = {} - for field_name, value in raw_dict.items(): - if value is not None: - # Get the Arrow field type - field = table.schema.field(field_name) - arrow_type = field.type - - # Convert based on Arrow type - converted_value = _convert_arrow_value_to_python(value, arrow_type) - converted_dict[field_name] = converted_value - else: - converted_dict[field_name] = None - converted_dicts.append(converted_dict) - - return converted_dicts - - -def _convert_arrow_value_to_python(value, arrow_type): - """ - Convert Arrow value back to proper Python type. - - Args: - value: Value from Arrow table (as returned by to_pylist()) - arrow_type: PyArrow type of the field - - Returns: - Value converted to proper Python type - """ - # Handle basic types - no conversion needed - if ( - pa.types.is_integer(arrow_type) - or pa.types.is_floating(arrow_type) - or pa.types.is_boolean(arrow_type) - or pa.types.is_string(arrow_type) - or pa.types.is_large_string(arrow_type) - or pa.types.is_binary(arrow_type) - or pa.types.is_large_binary(arrow_type) - ): - return value - - # Handle list types (including large_list and fixed_size_list) - elif ( - pa.types.is_list(arrow_type) - or pa.types.is_large_list(arrow_type) - or pa.types.is_fixed_size_list(arrow_type) - ): - if value is None: - return None - - element_type = arrow_type.value_type - - # Check if this is a dict representation: list> - if pa.types.is_struct(element_type): - field_names = [field.name for field in element_type] - if set(field_names) == {"key", "value"}: - # This is a dict - convert list of key-value structs to dict - result_dict = {} - for item in value: - if item is not None: - key_field = element_type.field("key") - value_field = element_type.field("value") - - converted_key = _convert_arrow_value_to_python( - item["key"], key_field.type - ) - converted_value = _convert_arrow_value_to_python( - item["value"], value_field.type - ) - result_dict[converted_key] = converted_value - return result_dict - - # Regular list - convert each element - converted_list = [] - for item in value: - converted_item = _convert_arrow_value_to_python(item, element_type) - converted_list.append(converted_item) - - # For fixed-size lists, convert to tuple if all elements are same type - if pa.types.is_fixed_size_list(arrow_type): - return tuple(converted_list) - else: - return converted_list - - # Handle struct types - elif pa.types.is_struct(arrow_type): - if value is None: - return None - - field_names = [field.name for field in arrow_type] - - # Check if this is a tuple representation (f0, f1, f2, ...) 
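# [Editor's note: sketch only, not part of this patch.] A heterogeneous tuple such
# as (1, "a") round-trips through a struct with positional field names f0, f1, ...:
import pyarrow as pa

tuple_type = pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])
arr = pa.array([{"f0": 1, "f1": "a"}], type=tuple_type)
assert arr.to_pylist()[0] == {"f0": 1, "f1": "a"}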
- if all(name.startswith("f") and name[1:].isdigit() for name in field_names): - # Convert struct to tuple - sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) - tuple_values = [] - for field in sorted_fields: - field_value = value.get(field.name) - converted_value = _convert_arrow_value_to_python( - field_value, field.type - ) - tuple_values.append(converted_value) - return tuple(tuple_values) - else: - # Regular struct - convert each field - converted_struct = {} - for field in arrow_type: - field_name = field.name - field_value = value.get(field_name) - converted_value = _convert_arrow_value_to_python( - field_value, field.type - ) - converted_struct[field_name] = converted_value - return converted_struct - - # Handle map types - elif pa.types.is_map(arrow_type): - if value is None: - return None - - # Maps are returned as list of {'key': k, 'value': v} dicts - result_dict = {} - key_type = arrow_type.key_type - item_type = arrow_type.item_type - - for item in value: - if item is not None: - converted_key = _convert_arrow_value_to_python(item["key"], key_type) - converted_value = _convert_arrow_value_to_python( - item["value"], item_type - ) - result_dict[converted_key] = converted_value - return result_dict - - else: - # For unsupported types, return as-is - return value - - -if __name__ == "__main__": - print("=== Complete Python Type Hint ↔ PyArrow Type Converter ===\n") - - # Test basic functionality first - print("Testing basic round-trip:") - try: - # Simple test - python_type = dict[str, int] - arrow_type = python_type_to_arrow(python_type) - recovered_type = arrow_type_to_python(arrow_type) - print(f"✓ {python_type} -> {arrow_type} -> {recovered_type}") - print(f" Match: {recovered_type == python_type}") - except Exception as e: - print(f"✗ Basic test failed: {e}") - - print("\n" + "=" * 60) - print("Testing complex nested structures:") - - complex_nested_tests = [ - # Nested dictionaries - ( - dict[str, dict[str, int]], - pa.large_list( - pa.struct( - [ - ("key", pa.large_string()), - ( - "value", - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.int64())] - ) - ), - ), - ] - ) - ), - ), - # Mixed complex types in tuples - ( - tuple[dict[str, int], list[str]], - pa.struct( - [ - ( - "f0", - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.int64())] - ) - ), - ), - ("f1", pa.large_list(pa.large_string())), - ] - ), - ), - # Complex value types in dicts - ( - dict[str, list[int]], - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.large_list(pa.int64()))] - ) - ), - ), - # Triple nesting - ( - list[dict[str, list[int]]], - pa.large_list( - pa.large_list( - pa.struct( - [ - ("key", pa.large_string()), - ("value", pa.large_list(pa.int64())), - ] - ) - ) - ), - ), - # Complex tuple with nested structures - ( - tuple[list[int], dict[str, float], str], - pa.struct( - [ - ("f0", pa.large_list(pa.int64())), - ( - "f1", - pa.large_list( - pa.struct( - [("key", pa.large_string()), ("value", pa.float64())] - ) - ), - ), - ("f2", pa.large_string()), - ] - ), - ), - ] - - for python_type, expected_arrow_type in complex_nested_tests: - try: - result = python_type_to_arrow(python_type) - success = result == expected_arrow_type - status = "✓" if success else "✗" - print(f"{status} {python_type}") - print(f" -> {result}") - if not success: - print(f" Expected: {expected_arrow_type}") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing complex nested 
round-trips:") - - complex_round_trip_tests = [ - dict[str, dict[str, int]], - tuple[dict[str, int], list[str]], - dict[str, list[int]], - list[dict[str, list[int]]], - tuple[list[int], dict[str, float], str], - dict[str, tuple[int, str]], - list[tuple[dict[str, int], list[str]]], - ] - - for python_type in complex_round_trip_tests: - try: - # Python -> Arrow -> Python - arrow_type = python_type_to_arrow(python_type) - recovered_python_type = arrow_type_to_python(arrow_type) - success = recovered_python_type == python_type - status = "✓" if success else "✗" - print(f"{status} {python_type}") - print(f" -> {arrow_type}") - print(f" -> {recovered_python_type}") - if not success: - print(f" Round-trip failed!") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing Python -> Arrow conversion:") - - # Test cases for Python -> Arrow - python_to_arrow_tests = [ - # Basic types - (int, pa.int64()), - (str, pa.large_string()), - (float, pa.float64()), - (bool, pa.bool_()), - # Lists (both regular and large) - (list[int], pa.large_list(pa.int64())), - (list[str], pa.large_list(pa.large_string())), - (list[float], pa.large_list(pa.float64())), - # Homogeneous tuples (always use regular fixed-size lists) - (tuple[int, int], pa.list_(pa.int64(), 2)), - (tuple[str, str, str], pa.list_(pa.large_string(), 3)), - # Heterogeneous tuples - (tuple[int, str], pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), - ( - tuple[int, str, float], - pa.struct( - [("f0", pa.int64()), ("f1", pa.large_string()), ("f2", pa.float64())] - ), - ), - # Dict types - using large_list> for Polars compatibility - ( - dict[str, int], - pa.large_list( - pa.struct([("key", pa.large_string()), ("value", pa.int64())]) - ), - ), - ( - dict[int, str], - pa.large_list( - pa.struct([("key", pa.int64()), ("value", pa.large_string())]) - ), - ), - # Nested types - (list[list[int]], pa.large_list(pa.large_list(pa.int64()))), - ( - list[tuple[int, str]], - pa.large_list(pa.struct([("f0", pa.int64()), ("f1", pa.large_string())])), - ), - ] - - for python_type, expected_arrow_type in python_to_arrow_tests: - try: - result = python_type_to_arrow(python_type) - success = result == expected_arrow_type - status = "✓" if success else "✗" - print(f"{status} {python_type} -> {result}") - if not success: - print(f" Expected: {expected_arrow_type}") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing Arrow -> Python type conversion:") - - arrow_to_python_tests = [ - # Basic types (both regular and large variants) - (pa.int64(), int), - (pa.string(), str), - (pa.large_string(), str), - (pa.float64(), float), - (pa.bool_(), bool), - (pa.binary(), bytes), - (pa.large_binary(), bytes), - # Lists (both regular and large) - (pa.list_(pa.int64(), -1), list[int]), - (pa.large_list(pa.int64()), list[int]), - (pa.list_(pa.string(), -1), list[str]), - (pa.large_list(pa.large_string()), list[str]), - # Fixed-size lists (homogeneous tuples) - (pa.list_(pa.int64(), 3), tuple[int, int, int]), - (pa.list_(pa.large_string(), 2), tuple[str, str]), - # Dict representation: both regular and large list variants - ( - pa.list_(pa.struct([("key", pa.string()), ("value", pa.int64())]), -1), - dict[str, int], - ), - ( - pa.large_list( - pa.struct([("key", pa.large_string()), ("value", pa.int64())]) - ), - dict[str, int], - ), - ( - pa.list_(pa.struct([("key", pa.int64()), ("value", pa.string())]), -1), - dict[int, str], - ), - ( - pa.large_list( - 
pa.struct([("key", pa.int64()), ("value", pa.large_string())]) - ), - dict[int, str], - ), - # Heterogeneous tuples: struct - (pa.struct([("f0", pa.int64()), ("f1", pa.string())]), tuple[int, str]), - (pa.struct([("f0", pa.int64()), ("f1", pa.large_string())]), tuple[int, str]), - ( - pa.struct([("f0", pa.int64()), ("f1", pa.string()), ("f2", pa.float64())]), - tuple[int, str, float], - ), - # Maps (if encountered) - (pa.map_(pa.string(), pa.int64()), dict[str, int]), - (pa.map_(pa.large_string(), pa.int64()), dict[str, int]), - # Nested structures - (pa.list_(pa.list_(pa.int64(), -1), -1), list[list[int]]), - (pa.large_list(pa.large_list(pa.int64())), list[list[int]]), - ] - - for arrow_type, expected_python_type in arrow_to_python_tests: - try: - result = arrow_type_to_python(arrow_type) - success = result == expected_python_type - status = "✓" if success else "✗" - print(f"{status} {arrow_type} -> {result}") - if not success: - print(f" Expected: {expected_python_type}") - except Exception as e: - print(f"✗ {arrow_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing round-trip conversion:") - - round_trip_tests = [ - dict[str, int], - list[int], - tuple[int, str], - tuple[str, str, str], - list[dict[str, int]], - list[list[str]], - tuple[int, float, bool], - ] - - for python_type in round_trip_tests: - try: - # Python -> Arrow -> Python - arrow_type = python_type_to_arrow(python_type) - recovered_python_type = arrow_type_to_python(arrow_type) - success = recovered_python_type == python_type - status = "✓" if success else "✗" - print(f"{status} {python_type} -> {arrow_type} -> {recovered_python_type}") - if not success: - print(f" Round-trip failed!") - except Exception as e: - print(f"✗ {python_type} -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing string parsing:") - - string_tests = [ - "list[int]", - "tuple[int, str]", - "dict[str, int]", - "list[dict[str, float]]", - ] - - for type_str in string_tests: - try: - result = parse_type_string(type_str) - print(f"✓ '{type_str}' -> {result}") - except Exception as e: - print(f"✗ '{type_str}' -> ERROR: {e}") - - print("\n" + "=" * 60) - print("Testing practical data conversion:") - - # Test actual data conversion - try: - # Create some test data - test_data = [ - {"name": "Alice", "scores": {"math": 95, "english": 87}}, - {"name": "Bob", "scores": {"math": 78, "english": 92}}, - ] - - # Create schema with nested dict using large_list representation - dict_type = python_type_to_arrow(dict[str, int]) - schema = pa.schema([("name", pa.large_string()), ("scores", dict_type)]) - - print(f"Dict type representation: {dict_type}") - - # Convert Python dicts to the expected list format - converted_data = [] - for record in test_data: - converted_record = record.copy() - if "scores" in converted_record: - # Convert dict to list of key-value structs - scores_dict = converted_record["scores"] - converted_record["scores"] = dict_to_arrow_list(scores_dict) - converted_data.append(converted_record) - - # Create Arrow table - need to handle the conversion properly - try: - table = pa.table(converted_data, schema=schema) - except Exception as table_error: - # If direct conversion fails, convert each column separately - print(f" Direct table creation failed: {table_error}") - print(" Trying column-by-column conversion...") - - # Convert each field separately - arrays = [] - for field in schema: - field_name = field.name - field_type = field.type - - # Extract column data - column_data = [record.get(field_name) for record in 
converted_data] - - # Create array with explicit type - array = pa.array(column_data, type=field_type) - arrays.append(array) - - # Create table from arrays - table = pa.table(arrays, schema=schema) - print(f"✓ Created PyArrow table with large_list representation") - - # Convert back to Python and reconstruct dicts - result_data = table.to_pylist() - for record in result_data: - if "scores" in record and record["scores"]: - # Convert list of key-value structs back to dict - record["scores"] = arrow_list_to_dict(record["scores"]) - - print(f"✓ Round-trip successful: {result_data[0]['scores']}") - - except Exception as e: - print(f"✗ Practical conversion test failed: {e}") - - print("Testing edge cases and limitations:") - - edge_case_tests = [ - # Complex key types - these are challenging but let's see what happens - "dict[tuple[str, int], str]", # tuple keys - "dict[str, dict[int, list[str]]]", # deeply nested - "Optional[dict[str, int]]", # optional complex types - ] - - for type_str in edge_case_tests: - try: - # Parse and convert - namespace = { - "list": list, - "tuple": tuple, - "dict": dict, - "int": int, - "str": str, - "float": float, - "bool": bool, - "bytes": bytes, - "Optional": typing.Optional, - "Union": typing.Union, - } - python_type = eval(type_str, {"__builtins__": {}}, namespace) - arrow_type = python_type_to_arrow(python_type) - recovered_type = arrow_type_to_python(arrow_type) - - success = recovered_type == python_type - status = "✓" if success else "⚠" - print(f"{status} {type_str}") - print(f" -> {arrow_type}") - print(f" -> {recovered_type}") - if not success: - print(f" Note: Complex key types may have limitations") - - except Exception as e: - print(f"✗ {type_str} -> ERROR: {e}") - - print(f"\n{'=' * 60}") - print("All tests completed!") diff --git a/src/orcapod/semantic_types/unused/schemas.py b/src/orcapod/semantic_types/unused/schemas.py deleted file mode 100644 index a028608..0000000 --- a/src/orcapod/semantic_types/unused/schemas.py +++ /dev/null @@ -1,357 +0,0 @@ -# from typing import Self -# from orcapod.types.core import DataType, TypeSpec -# from orcapod.types.semantic_types import ( -# SemanticType, -# SemanticTypeRegistry, -# PythonArrowConverter, -# ) -# import pyarrow as pa -# import datetime - -# # This mapping is expected to be stable -# # Be sure to test this assumption holds true -# DEFAULT_ARROW_TYPE_LUT = { -# int: pa.int64(), -# float: pa.float64(), -# str: pa.large_string(), -# bool: pa.bool_(), -# } - - -# def python_to_arrow_type(python_type: type) -> pa.DataType: -# if python_type in DEFAULT_ARROW_TYPE_LUT: -# return DEFAULT_ARROW_TYPE_LUT[python_type] -# raise TypeError(f"Converstion of python type {python_type} is not supported yet") - - -# def arrow_to_python_type(arrow_type: pa.DataType) -> type: -# if pa.types.is_integer(arrow_type): -# return int -# elif pa.types.is_floating(arrow_type): -# return float -# elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): -# return str -# elif pa.types.is_boolean(arrow_type): -# return bool -# elif pa.types.is_date(arrow_type): -# return datetime.date -# elif pa.types.is_timestamp(arrow_type): -# return datetime.datetime -# elif pa.types.is_binary(arrow_type): -# return bytes -# else: -# raise TypeError(f"Conversion of arrow type {arrow_type} is not supported") - - -# class PythonSchema(dict[str, DataType]): -# """ -# A schema for Python data types, mapping string keys to Python types. - -# This is used to define the expected structure of data packets in OrcaPod. 
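# [Editor's note: illustrative sketch, not part of this patch; the mapping mirrors
# DEFAULT_ARROW_TYPE_LUT above.] The commented-out PythonSchema is essentially a
# dict[str, type]; converting one to an Arrow schema amounts to:
import pyarrow as pa

lut = {int: pa.int64(), float: pa.float64(), str: pa.large_string(), bool: pa.bool_()}
py_schema = {"name": str, "age": int}
arrow_schema = pa.schema([pa.field(k, lut[v]) for k, v in py_schema.items()])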
- -# Attributes -# ---------- -# keys : str -# The keys of the schema. -# values : type -# The types corresponding to each key. - -# Examples -# -------- -# >>> schema = PythonSchema(name=str, age=int) -# >>> print(schema) -# {'name': , 'age': } -# """ - -# def copy(self) -> "PythonSchema": -# return PythonSchema(self) - -# def to_semantic_schema( -# self, semantic_type_registry: SemanticTypeRegistry -# ) -> "SemanticSchema": -# """ -# Convert the Python schema to a semantic schema using the provided semantic type registry. - -# Parameters -# ---------- -# semantic_type_registry : SemanticTypeRegistry -# The registry containing semantic type information. - -# Returns -# ------- -# SemanticSchema -# A new schema mapping keys to tuples of Python types and optional semantic type identifiers. - -# Examples -# -------- -# >>> python_schema = PythonSchema(name=str, age=int) -# >>> semantic_schema = python_schema.to_semantic_schema(registry) -# >>> print(semantic_schema) -# {'name': (str, None), 'age': (int, None)} -# """ -# return SemanticSchema.from_typespec(self, semantic_type_registry) - -# def to_arrow_schema( -# self, -# semantic_type_registry: SemanticTypeRegistry | None = None, -# converters: dict[str, PythonArrowConverter] | None = None, -# ) -> pa.Schema: -# """ -# Convert the Python schema to an Arrow schema. -# If converters are provided, they are used to convert the schema. Note that -# no validation is performed on the converters, so they must be compatible with the schema. -# """ -# if converters is not None: -# # If converters are provided, use them to convert the schema -# fields = [] -# for field_name, python_type in self.items(): -# if field_name in converters: -# converter = converters[field_name] -# arrow_type = converter.arrow_type -# metadata = None -# if converter.semantic_type_name is not None: -# metadata = { -# b"semantic_type": converter.semantic_type_name.encode( -# "utf-8" -# ) -# } -# else: -# arrow_type = python_to_arrow_type(python_type) -# metadata = None -# fields.append(pa.field(field_name, arrow_type, metadata=metadata)) -# return pa.schema(fields) - -# if semantic_type_registry is None: -# raise ValueError( -# "semantic_type_registry must be provided if converters are not" -# ) -# # Otherwise, convert using the semantic type registry -# return self.to_semantic_schema(semantic_type_registry).to_arrow_schema() - -# @classmethod -# def from_semantic_schema(cls, semantic_schema: "SemanticSchema") -> Self: -# """ -# Create a PythonSchema from a SemanticSchema. - -# Parameters -# ---------- -# semantic_schema : SemanticSchema -# The semantic schema to convert. - -# Returns -# ------- -# PythonSchema -# A new schema mapping keys to Python types. -# """ -# return cls(semantic_schema.get_python_types()) - -# @classmethod -# def from_arrow_schema( -# cls, -# arrow_schema: pa.Schema, -# semantic_type_registry: SemanticTypeRegistry | None = None, -# converters: dict[str, PythonArrowConverter] | None = None, -# ) -> Self: -# """ -# Create a PythonSchema from an Arrow schema. - -# Parameters -# ---------- -# arrow_schema : pa.Schema -# The Arrow schema to convert. -# semantic_type_registry : SemanticTypeRegistry -# The registry containing semantic type information. -# skip_system_columns : bool, optional -# Whether to skip system columns (default is True). -# converters : dict[str, PythonArrowConverter], optional -# A dictionary of converters to use for converting the schema. If provided, the schema will be -# converted using the converters. 
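# [Editor's note: sketch only, not part of this patch.] The semantic-type tag
# handled here travels as Arrow field metadata, which can be attached and read
# back like so:
import pyarrow as pa

field = pa.field("image", pa.large_string(), metadata={b"semantic_type": b"path"})
assert (field.metadata or {}).get(b"semantic_type") == b"path"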
If not provided, the schema will be converted using the semantic type -# registry. - -# Returns -# ------- -# PythonSchema -# A new schema mapping keys to Python types. -# """ -# if converters is not None: -# # If converters are provided, use them to convert the schema -# python_types = {} -# for field in arrow_schema: -# # TODO: consider performing validation of semantic type -# if field.name in converters: -# converter = converters[field.name] -# python_types[field.name] = converter.python_type -# else: -# python_types[field.name] = arrow_to_python_type(field.type) -# return cls(python_types) - -# if semantic_type_registry is None: -# raise ValueError( -# "semantic_type_registry must be provided if converters are not" -# ) -# semantic_schema = SemanticSchema.from_arrow_schema( -# arrow_schema, -# semantic_type_registry, -# ) -# return cls(semantic_schema.get_python_types()) - - -# class SemanticSchema(dict[str, type | SemanticType]): -# """ -# A schema for semantic types, mapping string keys to tuples of Python types and optional metadata. - -# This is used to define the expected structure of data packets with semantic types in OrcaPod. - -# Attributes -# ---------- -# keys : str -# The keys of the schema. -# values : type | SemanticType -# Either type for simple fields or SemanticType for semantic fields. - -# Examples -# -------- -# >>> schema = SemanticSchema(image=SemanticType('path'), age=int) -# >>> print(schema) -# {"image": SemanticType(name='path'), "age": })} -# """ - -# def get_semantic_fields(self) -> dict[str, SemanticType]: -# """ -# Get a dictionary of semantic fields in the schema. - -# Returns -# ------- -# dict[str, SemanticType] -# A dictionary mapping keys to their corresponding SemanticType. -# """ -# return {k: v for k, v in self.items() if isinstance(v, SemanticType)} - -# def get_python_types(self) -> dict[str, type]: -# """ -# Get the Python types for all keys in the schema. - -# Returns -# ------- -# dict[str, type] -# A dictionary mapping keys to their corresponding Python types. -# """ -# return { -# k: v.get_default_python_type() if isinstance(v, SemanticType) else v -# for k, v in self.items() -# } - -# def get_arrow_types(self) -> dict[str, tuple[pa.DataType, str | None]]: -# """ -# Get the Arrow types for all keys in the schema. - -# Returns -# ------- -# dict[str, tuple[pa.DataType, str|None]] -# A dictionary mapping keys to tuples of Arrow types. If the field has a semantic type, -# the second element of the tuple is the semantic type name; otherwise, it is None. -# """ -# return { -# k: (v.get_default_arrow_type(), v.name) -# if isinstance(v, SemanticType) -# else (python_to_arrow_type(v), None) -# for k, v in self.items() -# } - -# def to_arrow_schema(self) -> pa.Schema: -# """ -# Get the Arrow schema, which is a PythonSchema representation of the semantic schema. - -# Returns -# ------- -# PythonSchema -# A new schema mapping keys to Python types. -# """ -# fields = [] -# for k, (arrow_type, semantic_type_name) in self.get_arrow_types().items(): -# if semantic_type_name is not None: -# field = pa.field( -# k, -# arrow_type, -# metadata={b"semantic_type": semantic_type_name.encode("utf-8")}, -# ) -# else: -# field = pa.field(k, arrow_type) -# fields.append(field) - -# return pa.schema(fields) - -# def to_python_schema(self) -> PythonSchema: -# """ -# Get the Python schema, which is a PythonSchema representation of the semantic schema. - -# Returns -# ------- -# PythonSchema -# A new schema mapping keys to Python types. 
-# """ -# return PythonSchema.from_semantic_schema(self) - -# @classmethod -# def from_arrow_schema( -# cls, -# arrow_schema: pa.Schema, -# semantic_type_registry: SemanticTypeRegistry, -# ) -> Self: -# """ -# Create a SemanticSchema from an Arrow schema. - -# Parameters -# ---------- -# arrow_schema : pa.Schema -# The Arrow schema to convert. - -# Returns -# ------- -# SemanticSchema -# A new schema mapping keys to tuples of Python types and optional semantic type identifiers. -# """ - -# semantic_schema = {} -# for field in arrow_schema: -# field_type = None -# if field.metadata is not None: -# semantic_type_name = field.metadata.get(b"semantic_type", b"").decode() -# if semantic_type_name: -# semantic_type = semantic_type_registry.get_semantic_type( -# semantic_type_name -# ) -# if semantic_type is None: -# raise ValueError( -# f"Semantic type '{semantic_type_name}' not found in registry" -# ) -# if not semantic_type.supports_arrow_type(field.type): -# raise ValueError( -# f"Semantic type '{semantic_type.name}' does not support Arrow field of type '{field.type}'" -# ) -# field_type = semantic_type - -# if ( -# field_type is None -# ): # was not set to semantic type, so fallback to simple conversion -# field_type = arrow_to_python_type(field.type) - -# semantic_schema[field.name] = field_type -# return cls(semantic_schema) - -# @classmethod -# def from_typespec( -# cls, -# typespec: TypeSpec, -# semantic_type_registry: SemanticTypeRegistry, -# ) -> Self: -# semantic_schema = {} -# for key, python_type in typespec.items(): -# semantic_type = semantic_type_registry.get_semantic_type_for_python_type( -# python_type -# ) -# if semantic_type is not None: -# semantic_schema[key] = semantic_type -# else: -# semantic_schema[key] = python_type -# return cls(semantic_schema) diff --git a/src/orcapod/semantic_types/unused/semantic_converters.py b/src/orcapod/semantic_types/unused/semantic_converters.py deleted file mode 100644 index 62fd2ec..0000000 --- a/src/orcapod/semantic_types/unused/semantic_converters.py +++ /dev/null @@ -1,1005 +0,0 @@ -import pyarrow as pa -from typing import get_origin, get_args, Any -import typing -from collections.abc import Collection, Sequence, Mapping, Iterable, Set -from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry -from orcapod.types import PythonSchema - - -# Basic type mapping for Python -> Arrow conversion -_PYTHON_TO_ARROW_MAP = { - # Python built-ins - int: pa.int64(), - float: pa.float64(), - str: pa.large_string(), # Use large_string by default for Polars compatibility - bool: pa.bool_(), - bytes: pa.large_binary(), # Use large_binary by default for Polars compatibility - # String representations (for when we get type names as strings) - "int": pa.int64(), - "float": pa.float64(), - "str": pa.large_string(), - "bool": pa.bool_(), - "bytes": pa.large_binary(), - # Specific integer types - "int8": pa.int8(), - "int16": pa.int16(), - "int32": pa.int32(), - "int64": pa.int64(), - "uint8": pa.uint8(), - "uint16": pa.uint16(), - "uint32": pa.uint32(), - "uint64": pa.uint64(), - # Specific float types - "float32": pa.float32(), - "float64": pa.float64(), - # Date/time types - "date": pa.date32(), - "datetime": pa.timestamp("us"), - "timestamp": pa.timestamp("us"), -} - -# Reverse mapping for Arrow -> Python conversion (handles both regular and large variants) -_ARROW_TO_PYTHON_MAP = { - # Integer types - pa.int8(): int, - pa.int16(): int, - pa.int32(): int, - pa.int64(): int, - pa.uint8(): int, - pa.uint16(): int, - pa.uint32(): int, - 
pa.uint64(): int, - # Float types - pa.float32(): float, - pa.float64(): float, - # String types (both regular and large) - pa.string(): str, - pa.large_string(): str, - # Boolean - pa.bool_(): bool, - # Binary types (both regular and large) - pa.binary(): bytes, - pa.large_binary(): bytes, -} - -# Add numpy types if available -try: - import numpy as np - - _PYTHON_TO_ARROW_MAP.update( - { - np.int8: pa.int8(), - np.int16: pa.int16(), - np.int32: pa.int32(), - np.int64: pa.int64(), - np.uint8: pa.uint8(), - np.uint16: pa.uint16(), - np.uint32: pa.uint32(), - np.uint64: pa.uint64(), - np.float32: pa.float32(), - np.float64: pa.float64(), - np.bool_: pa.bool_(), - } - ) -except ImportError: - pass - - -def python_type_to_arrow( - type_hint, semantic_registry: SemanticTypeRegistry | None = None -) -> pa.DataType: - """ - Convert Python type hints to PyArrow data types. - - Args: - type_hint: Python type hint to convert - semantic_registry: Optional semantic type registry to check for semantic types - - Examples: - list[int] -> pa.large_list(pa.int64()) - tuple[int, int] -> pa.list_(pa.int64(), 2) - tuple[int, str] -> pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) - dict[str, int] -> pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) - Path -> pa.struct([('path', pa.large_string())]) # if registered in semantic registry - """ - - # Handle basic types first - if type_hint in _PYTHON_TO_ARROW_MAP: - return _PYTHON_TO_ARROW_MAP[type_hint] - - # Check if this is a registered semantic type - if semantic_registry is not None: - converter = semantic_registry.get_converter_for_python_type(type_hint) - if converter: - return converter.arrow_struct_type - - # Get the origin (e.g., list, tuple, dict) and args (e.g., int, str) - origin = get_origin(type_hint) - args = get_args(type_hint) - - if origin is None: - # Handle non-generic types that might not be in basic map - if hasattr(type_hint, "__name__"): - type_name = type_hint.__name__ - if type_name in _PYTHON_TO_ARROW_MAP: - return _PYTHON_TO_ARROW_MAP[type_name] - raise ValueError(f"Unsupported type: {type_hint}") - - # Handle list types - if origin is list: - if len(args) != 1: - raise ValueError( - f"list type must have exactly one type argument, got: {args}" - ) - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.large_list(element_type) # Use large_list for Polars compatibility - - # Handle tuple types - elif origin is tuple: - if len(args) == 0: - raise ValueError("Empty tuple type not supported") - - # Check if all elements are the same type - if len(set(args)) == 1: - # Homogeneous tuple: tuple[int, int, int] -> fixed-size list - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.list_( - element_type, len(args) - ) # Fixed-size lists are always regular lists - else: - # Heterogeneous tuple: tuple[int, str] -> struct with indexed fields - fields = [] - for i, arg_type in enumerate(args): - field_type = python_type_to_arrow(arg_type, semantic_registry) - fields.append((f"f{i}", field_type)) - return pa.struct(fields) - - # Handle dict types - elif origin is dict: - if len(args) != 2: - raise ValueError( - f"dict type must have exactly two type arguments, got: {args}" - ) - key_type = python_type_to_arrow(args[0], semantic_registry) - value_type = python_type_to_arrow(args[1], semantic_registry) - - # Use large_list> representation for better compatibility - # This works reliably across Arrow, Polars, Parquet, etc. 
- key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) - return pa.large_list(key_value_struct) - - # Handle abstract base classes and collections - elif origin in {Collection, Sequence, Iterable}: - # Treat as list - most common concrete implementation - if len(args) != 1: - raise ValueError( - f"{origin.__name__} type must have exactly one type argument, got: {args}" - ) - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.large_list(element_type) - - elif origin is Set or origin is set: - # Sets -> lists (Arrow doesn't have native set type) - if len(args) != 1: - raise ValueError( - f"set type must have exactly one type argument, got: {args}" - ) - element_type = python_type_to_arrow(args[0], semantic_registry) - return pa.large_list(element_type) - - elif origin is Mapping: - # Mapping -> dict representation - if len(args) != 2: - raise ValueError( - f"Mapping type must have exactly two type arguments, got: {args}" - ) - key_type = python_type_to_arrow(args[0], semantic_registry) - value_type = python_type_to_arrow(args[1], semantic_registry) - key_value_struct = pa.struct([("key", key_type), ("value", value_type)]) - return pa.large_list(key_value_struct) - elif origin is typing.Union: - # Handle Optional[T] which is Union[T, NoneType] - if len(args) == 2 and type(None) in args: - # This is Optional[T] - non_none_type = args[0] if args[1] is type(None) else args[1] - base_type = python_type_to_arrow(non_none_type, semantic_registry) - # PyArrow handles nullability at the field level, so we just return the base type - return base_type - else: - # Complex unions - convert to a union type - union_types = [python_type_to_arrow(arg, semantic_registry) for arg in args] - # PyArrow union types are complex - for now, just use the first type as fallback - # TODO: Implement proper union support when needed - return union_types[0] # Simplified - take first type - - else: - raise ValueError(f"Unsupported generic type: {origin}") - - -def python_schema_to_arrow( - python_schema: PythonSchema, semantic_registry: SemanticTypeRegistry | None = None -) -> pa.Schema: - """ - Convert a Python schema (TypeSpec) to a PyArrow schema. - - Args: - python_schema: TypeSpec representing the Python schema - semantic_registry: Optional semantic type registry to check for semantic types - - Returns: - PyArrow Schema object - - Raises: - ValueError: If the Python schema cannot be converted to Arrow schema - """ - - arrow_fields = [] - for field_name, field_type in python_schema.items(): - arrow_type = python_type_to_arrow( - field_type, semantic_registry=semantic_registry - ) - arrow_fields.append(pa.field(field_name, arrow_type)) - return pa.schema(arrow_fields) - - -def arrow_type_to_python( - arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None = None -) -> type: - """ - Convert PyArrow data types back to Python type hints. 
- - Args: - arrow_type: PyArrow data type to convert - semantic_registry: Optional semantic type registry for semantic types - - Returns: - Python type annotation - - Examples: - pa.int64() -> int - pa.large_list(pa.large_string()) -> list[str] - pa.large_list(pa.struct([('key', pa.large_string()), ('value', pa.int64())])) -> dict[str, int] - pa.struct([('f0', pa.int64()), ('f1', pa.large_string())]) -> tuple[int, str] - pa.struct([('path', pa.large_string())]) -> Path # if registered in semantic registry - - Raises: - TypeError: If the Arrow type cannot be converted to a Python type - """ - - # Handle basic types - if arrow_type in _ARROW_TO_PYTHON_MAP: - return _ARROW_TO_PYTHON_MAP[arrow_type] - - # Check by Arrow type categories - if pa.types.is_integer(arrow_type): - return int - elif pa.types.is_floating(arrow_type): - return float - elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): - return str - elif pa.types.is_boolean(arrow_type): - return bool - elif pa.types.is_binary(arrow_type) or pa.types.is_large_binary(arrow_type): - return bytes - - # Handle complex types - elif ( - pa.types.is_list(arrow_type) - or pa.types.is_large_list(arrow_type) - or pa.types.is_fixed_size_list(arrow_type) - ): - element_type = arrow_type.value_type - - # Check if this is a dict representation: list> - if pa.types.is_struct(element_type): - field_names = [field.name for field in element_type] - - # Dict pattern: must have exactly 'key' and 'value' fields - if set(field_names) == {"key", "value"}: - # Find key and value types - key_field = next(f for f in element_type if f.name == "key") - value_field = next(f for f in element_type if f.name == "value") - - key_python_type = arrow_type_to_python( - key_field.type, semantic_registry - ) - value_python_type = arrow_type_to_python( - value_field.type, semantic_registry - ) - - return dict[key_python_type, value_python_type] - - # Regular list - element_python_type = arrow_type_to_python(element_type, semantic_registry) - - # Check if this is a fixed-size list (homogeneous tuple representation) - if pa.types.is_fixed_size_list(arrow_type): - # Fixed-size list -> homogeneous tuple - size = arrow_type.list_size - return tuple[tuple(element_python_type for _ in range(size))] - else: - # Variable-size list -> list - return list[element_python_type] - - elif pa.types.is_struct(arrow_type): - # First check if this is a semantic type using struct signature recognition - if semantic_registry: - python_type = semantic_registry.get_python_type_for_struct_signature( - arrow_type - ) - if python_type: - return python_type - - # Check if this is a heterogeneous tuple representation - field_names = [field.name for field in arrow_type] - - # Tuple pattern: fields named f0, f1, f2, etc. - if all(name.startswith("f") and name[1:].isdigit() for name in field_names): - # Sort by field index to maintain order - sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) - field_types = [ - arrow_type_to_python(field.type, semantic_registry) - for field in sorted_fields - ] - return tuple[tuple(field_types)] - else: - # Unknown struct type - cannot convert - raise TypeError( - f"Cannot convert struct type to Python type hint. " - f"Struct has fields: {field_names}. " - f"Only tuple-like structs (f0, f1, ...) or registered semantic structs are supported." 
- ) - - elif pa.types.is_map(arrow_type): - # Handle pa.map_ types (though we prefer list representation) - key_python_type = arrow_type_to_python(arrow_type.key_type, semantic_registry) - value_python_type = arrow_type_to_python( - arrow_type.item_type, semantic_registry - ) - return dict[key_python_type, value_python_type] - - elif pa.types.is_union(arrow_type): - # Handle union types -> Union[T1, T2, ...] - import typing - - # Get the child types from the union - child_types = [] - for i in range(arrow_type.num_fields): - child_field = arrow_type[i] - child_types.append( - arrow_type_to_python(child_field.type, semantic_registry) - ) - - if len(child_types) == 2 and type(None) in child_types: - # This is Optional[T] - non_none_type = next(t for t in child_types if t is not type(None)) - return typing.Optional[non_none_type] # type: ignore - else: - return typing.Union[tuple(child_types)] # type: ignore - - else: - raise TypeError( - f"Cannot convert Arrow type '{arrow_type}' to Python type hint. " - f"Supported types: int, float, str, bool, bytes, list, large_list, fixed_size_list, tuple, dict, struct, map, union. " - f"Arrow type category: {arrow_type}" - ) - - -def arrow_schema_to_python( - arrow_schema: pa.Schema, semantic_registry: SemanticTypeRegistry | None = None -) -> PythonSchema: - """ - Convert a PyArrow schema to a Python schema (TypeSpec). - - Args: - arrow_schema: PyArrow Schema object - semantic_registry: Optional semantic type registry for semantic types - - Returns: - TypeSpec representing the Python schema - - Raises: - TypeError: If the Arrow schema cannot be converted to Python schema - """ - return { - field.name: arrow_type_to_python(field.type, semantic_registry) - for field in arrow_schema - } - - -def parse_type_string( - type_string: str, semantic_registry: SemanticTypeRegistry | None = None -): - """ - Parse a type hint from a string representation. - Useful when you have type hints as strings. - - Example: - parse_type_string("list[int]") -> pa.large_list(pa.int64()) - """ - # This is a simplified version - for production use, consider using ast.literal_eval - # or a proper type hint parser - try: - # Try to evaluate the string as a type hint - # Note: This uses eval which can be dangerous - use with trusted input only - import typing - - namespace = { - "list": list, - "tuple": tuple, - "dict": dict, - "int": int, - "str": str, - "float": float, - "bool": bool, - "bytes": bytes, - "Optional": typing.Optional, - "Union": typing.Union, - } - type_hint = eval(type_string, {"__builtins__": {}}, namespace) - return python_type_to_arrow(type_hint, semantic_registry) - except Exception as e: - raise ValueError(f"Could not parse type string '{type_string}': {e}") - - -def infer_schema_from_data( - data: list[dict], semantic_registry: SemanticTypeRegistry | None = None -) -> dict[str, type]: - """ - Infer schema from sample data (best effort). - - Args: - data: List of sample dictionaries - semantic_registry: Optional semantic type registry for detecting semantic types - - Returns: - Dictionary mapping field names to inferred Python types - - Note: This is best-effort inference and may not handle all edge cases. - For production use, explicit schemas are recommended. 
- """ - if not data: - return {} - - schema = {} - - # Get all possible field names - all_fields = set() - for record in data: - all_fields.update(record.keys()) - - # Infer type for each field - for field_name in all_fields: - field_values = [ - record.get(field_name) - for record in data - if field_name in record and record[field_name] is not None - ] - - if not field_values: - schema[field_name] = str # Default fallback instead of Any - continue - - # Get types of all values - value_types = {type(v) for v in field_values} - - if len(value_types) == 1: - # All values have same type - value_type = next(iter(value_types)) - - # Check if this is a semantic type first - if semantic_registry: - converter = semantic_registry.get_converter_for_python_type(value_type) - if converter: - schema[field_name] = value_type - continue - - # For containers, try to infer element types - if value_type is list and field_values: - # Infer list element type from first non-empty list - for lst in field_values: - if lst: # non-empty list - element_types = {type(elem) for elem in lst} - if len(element_types) == 1: - element_type = next(iter(element_types)) - schema[field_name] = list[element_type] - else: - # Mixed types - use str as fallback instead of Any - schema[field_name] = list[str] - break - else: - schema[field_name] = list[str] # Default fallback instead of Any - - elif value_type in {set, frozenset} and field_values: - # Infer set element type from first non-empty set - for s in field_values: - if s: # non-empty set - element_types = {type(elem) for elem in s} - if len(element_types) == 1: - element_type = next(iter(element_types)) - schema[field_name] = set[element_type] - else: - schema[field_name] = set[ - str - ] # Mixed types - fallback to str - break - else: - schema[field_name] = set[str] # All sets empty - fallback to str - - elif value_type is dict and field_values: - # Infer dict types from first non-empty dict - for d in field_values: - if d: # non-empty dict - key_types = {type(k) for k in d.keys()} - value_types = {type(v) for v in d.values()} - - if len(key_types) == 1 and len(value_types) == 1: - key_type = next(iter(key_types)) - val_type = next(iter(value_types)) - schema[field_name] = dict[key_type, val_type] - else: - # Mixed types - use most common types or fallback to str - key_type = ( - str if str in key_types else next(iter(key_types)) - ) - val_type = ( - str if str in value_types else next(iter(value_types)) - ) - schema[field_name] = dict[key_type, val_type] - break - else: - schema[field_name] = dict[ - str, str - ] # Default fallback instead of Any - - else: - schema[field_name] = value_type - - else: - # Mixed types - use str as fallback instead of Any - schema[field_name] = str - - return schema - - -def arrow_list_to_set(lst: list) -> set: - """Convert Arrow list back to Python set (removes duplicates).""" - return set(lst) if lst is not None else set() - - -def dict_to_arrow_list(d: dict) -> list[dict]: - """Convert Python dict to Arrow-compatible list of key-value structs.""" - return [{"key": k, "value": v} for k, v in d.items()] - - -def arrow_list_to_dict(lst: list[dict]) -> dict: - """Convert Arrow list of key-value structs back to Python dict.""" - return {item["key"]: item["value"] for item in lst if item is not None} - - -def python_dicts_to_arrow_table( - data: list[dict], - schema: dict[str, type] | None = None, - semantic_registry: SemanticTypeRegistry | None = None, -) -> pa.Table: - """ - Convert list of Python dictionaries to PyArrow table with proper 
type conversion. - - Args: - data: List of Python dictionaries - schema: Dictionary mapping field names to Python type hints (optional) - semantic_registry: Optional semantic type registry for complex Python objects - - Returns: - PyArrow table with proper types - - Examples: - # Basic usage - data = [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] - schema = {"x": int, "y": list[int]} - - # With semantic types - from pathlib import Path - data = [{"name": "Alice", "file": Path("/home/alice/data.csv")}] - schema = {"name": str, "file": Path} - table = python_dicts_to_arrow_table(data, schema, semantic_registry) - """ - if not data: - raise ValueError("Cannot create table from empty data list") - - # Auto-infer schema if not provided - if schema is None: - schema = infer_schema_from_data(data, semantic_registry) - print(f"Auto-inferred schema: {schema}") - - if not schema: - raise ValueError("Schema cannot be empty (and could not be inferred)") - - # Convert schema to Arrow schema (with semantic type support) - arrow_fields = [] - for field_name, python_type in schema.items(): - arrow_type = python_type_to_arrow(python_type, semantic_registry) - arrow_fields.append(pa.field(field_name, arrow_type)) - - arrow_schema = pa.schema(arrow_fields) - - # Convert data with proper type transformations (with semantic type support) - converted_data = [] - for record in data: - converted_record = {} - for field_name, python_type in schema.items(): - value = record.get(field_name) - if value is not None: - converted_value = _convert_python_value_for_arrow( - value, python_type, semantic_registry - ) - converted_record[field_name] = converted_value - else: - converted_record[field_name] = None - converted_data.append(converted_record) - - # Create table with explicit schema - try: - table = pa.table(converted_data, schema=arrow_schema) - return table - except Exception: - # Fallback: create each column separately - arrays = [] - for field in arrow_schema: - field_name = field.name - field_type = field.type - - # Extract column data - column_data = [record.get(field_name) for record in converted_data] - - # Create array with explicit type - array = pa.array(column_data, type=field_type) - arrays.append(array) - - return pa.table(arrays, schema=arrow_schema) - - -def _convert_python_value_for_arrow( - value, python_type, semantic_registry: SemanticTypeRegistry | None = None -): - """ - Convert a Python value to Arrow-compatible format based on expected type. 
- - Args: - value: Python value to convert - python_type: Expected Python type hint - semantic_registry: Optional semantic type registry - - Returns: - Value in Arrow-compatible format - """ - # First, check if this is a semantic type - if semantic_registry: - converter = semantic_registry.get_converter_for_python_type(python_type) - if converter: - # Convert using semantic type converter - return converter.python_to_struct_dict(value) - - # Fall back to standard type conversion - origin = get_origin(python_type) - args = get_args(python_type) - - # Handle basic types - no conversion needed - if python_type in {int, float, str, bool, bytes} or origin is None: - return value - - # Handle Optional types - if origin is typing.Union and len(args) == 2 and type(None) in args: - if value is None: - return None - non_none_type = args[0] if args[1] is type(None) else args[1] - return _convert_python_value_for_arrow(value, non_none_type, semantic_registry) - - # Handle abstract collections - elif origin is list or origin in {Collection, Sequence, Iterable}: - if not isinstance(value, (list, tuple)): - raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") - element_type = args[0] if args else Any - return [ - _convert_python_value_for_arrow(item, element_type, semantic_registry) - for item in value - ] - - # Handle set types - elif origin is set or origin is Set: - if not isinstance(value, (set, frozenset, list, tuple)): - raise TypeError( - f"Expected set/list/tuple for {python_type}, got {type(value)}" - ) - element_type = args[0] if args else Any - - # Convert set to sorted list for deterministic ordering - if isinstance(value, (set, frozenset)): - try: - # Sort if elements are comparable - value_list = sorted(value) - except TypeError: - # If elements aren't comparable (e.g., mixed types), convert to list as-is - # This maintains some order but isn't guaranteed to be deterministic - value_list = list(value) - else: - # Already a list/tuple, keep as-is - value_list = list(value) - - return [ - _convert_python_value_for_arrow(item, element_type, semantic_registry) - for item in value_list - ] - - # Handle mapping types - elif origin is dict or origin is Mapping: - if not isinstance(value, dict): - raise TypeError(f"Expected dict for {python_type}, got {type(value)}") - - key_type, value_type = (args[0], args[1]) if len(args) >= 2 else (Any, Any) - # Convert dict to list of key-value structs - key_value_list = [] - for k, v in value.items(): - converted_key = _convert_python_value_for_arrow( - k, key_type, semantic_registry - ) - converted_value = _convert_python_value_for_arrow( - v, value_type, semantic_registry - ) - key_value_list.append({"key": converted_key, "value": converted_value}) - return key_value_list - - # Handle tuple types - elif origin is tuple: - if not isinstance(value, (list, tuple)): - raise TypeError(f"Expected list/tuple for {python_type}, got {type(value)}") - - if len(set(args)) == 1: - # Homogeneous tuple - convert to list - element_type = args[0] - return [ - _convert_python_value_for_arrow(item, element_type, semantic_registry) - for item in value - ] - else: - # Heterogeneous tuple - convert to struct dict - if len(value) != len(args): - raise ValueError( - f"Tuple length mismatch: expected {len(args)}, got {len(value)}" - ) - struct_dict = {} - for i, (item, item_type) in enumerate(zip(value, args)): - struct_dict[f"f{i}"] = _convert_python_value_for_arrow( - item, item_type, semantic_registry - ) - return struct_dict - - # Handle dict types - 
elif origin is dict: - if not isinstance(value, dict): - raise TypeError(f"Expected dict for {python_type}, got {type(value)}") - - key_type, value_type = args - # Convert dict to list of key-value structs - key_value_list = [] - for k, v in value.items(): - converted_key = _convert_python_value_for_arrow( - k, key_type, semantic_registry - ) - converted_value = _convert_python_value_for_arrow( - v, value_type, semantic_registry - ) - key_value_list.append({"key": converted_key, "value": converted_value}) - return key_value_list - - else: - # For unsupported types, return as-is and let Arrow handle it - return value - - -def arrow_table_to_python_dicts( - table: pa.Table, semantic_registry: SemanticTypeRegistry | None = None -) -> list[dict]: - """ - Convert PyArrow table back to list of Python dictionaries with proper type conversion. - - Args: - table: PyArrow table to convert - semantic_registry: Optional semantic type registry for complex Python objects - - Returns: - List of Python dictionaries with proper Python types - - Examples: - Arrow table with x: int64, y: large_list - -> [{"x": 5, "y": [1, 2, 3]}, {"x": 9, "y": [2, 4]}] - - Arrow table with semantic types (Path stored as struct) - -> [{"name": "Alice", "file": Path("/home/alice/data.csv")}] - """ - # Convert table to list of raw dictionaries - raw_dicts = table.to_pylist() - - # Convert each dictionary with proper type transformations - converted_dicts = [] - for raw_dict in raw_dicts: - converted_dict = {} - for field_name, value in raw_dict.items(): - if value is not None: - # Get the Arrow field type - field = table.schema.field(field_name) - arrow_type = field.type - - # Convert based on Arrow type (with semantic type support) - converted_value = _convert_arrow_value_to_python( - value, arrow_type, semantic_registry - ) - converted_dict[field_name] = converted_value - else: - converted_dict[field_name] = None - converted_dicts.append(converted_dict) - - return converted_dicts - - -def _convert_arrow_value_to_python( - value, arrow_type, semantic_registry: SemanticTypeRegistry | None = None -): - """ - Convert Arrow value back to proper Python type. 
- - Args: - value: Value from Arrow table (as returned by to_pylist()) - arrow_type: PyArrow type of the field - semantic_registry: Optional semantic type registry - - Returns: - Value converted to proper Python type - """ - # First, check if this is a semantic struct type using signature recognition - if semantic_registry and pa.types.is_struct(arrow_type): - python_type = semantic_registry.get_python_type_for_struct_signature(arrow_type) - if python_type: - converter = semantic_registry.get_converter_for_python_type(python_type) - if converter and isinstance(value, dict): - # Convert using semantic type converter - return converter.struct_dict_to_python(value) - - # Fall back to standard type conversion - # Handle basic types - no conversion needed - if ( - pa.types.is_integer(arrow_type) - or pa.types.is_floating(arrow_type) - or pa.types.is_boolean(arrow_type) - or pa.types.is_string(arrow_type) - or pa.types.is_large_string(arrow_type) - or pa.types.is_binary(arrow_type) - or pa.types.is_large_binary(arrow_type) - ): - return value - - # Handle list types (including large_list and fixed_size_list) - elif ( - pa.types.is_list(arrow_type) - or pa.types.is_large_list(arrow_type) - or pa.types.is_fixed_size_list(arrow_type) - ): - if value is None: - return None - - element_type = arrow_type.value_type - - # Check if this is a dict representation: list> - if pa.types.is_struct(element_type): - field_names = [field.name for field in element_type] - if set(field_names) == {"key", "value"}: - # This is a dict - convert list of key-value structs to dict - result_dict = {} - for item in value: - if item is not None: - key_field = element_type.field("key") - value_field = element_type.field("value") - - converted_key = _convert_arrow_value_to_python( - item["key"], key_field.type, semantic_registry - ) - converted_value = _convert_arrow_value_to_python( - item["value"], value_field.type, semantic_registry - ) - result_dict[converted_key] = converted_value - return result_dict - - # Regular list - convert each element - converted_list = [] - for item in value: - converted_item = _convert_arrow_value_to_python( - item, element_type, semantic_registry - ) - converted_list.append(converted_item) - - # For fixed-size lists, convert to tuple if all elements are same type - if pa.types.is_fixed_size_list(arrow_type): - return tuple(converted_list) - else: - return converted_list - - # Handle struct types - elif pa.types.is_struct(arrow_type): - if value is None: - return None - - field_names = [field.name for field in arrow_type] - - # Check if this is a tuple representation (f0, f1, f2, ...) 
- if all(name.startswith("f") and name[1:].isdigit() for name in field_names): - # Convert struct to tuple - sorted_fields = sorted(arrow_type, key=lambda f: int(f.name[1:])) - tuple_values = [] - for field in sorted_fields: - field_value = value.get(field.name) - converted_value = _convert_arrow_value_to_python( - field_value, field.type, semantic_registry - ) - tuple_values.append(converted_value) - return tuple(tuple_values) - else: - # Regular struct - convert each field (could be semantic type handled above) - converted_struct = {} - for field in arrow_type: - field_name = field.name - field_value = value.get(field_name) - converted_value = _convert_arrow_value_to_python( - field_value, field.type, semantic_registry - ) - converted_struct[field_name] = converted_value - return converted_struct - - # Handle map types - elif pa.types.is_map(arrow_type): - if value is None: - return None - - # Maps are returned as list of {'key': k, 'value': v} dicts - result_dict = {} - key_type = arrow_type.key_type - item_type = arrow_type.item_type - - for item in value: - if item is not None: - converted_key = _convert_arrow_value_to_python( - item["key"], key_type, semantic_registry - ) - converted_value = _convert_arrow_value_to_python( - item["value"], item_type, semantic_registry - ) - result_dict[converted_key] = converted_value - return result_dict - - else: - # For unsupported types, return as-is - return value - - -if __name__ == "__main__": - print("=== Semantic Type System with Struct Signature Recognition ===\n") - - # This system now uses struct signature recognition instead of special marker fields - print("Key improvements:") - print("- Clean, self-documenting struct schemas") - print("- Zero storage overhead (no marker fields)") - print("- Natural field names for user queries") - print("- Struct signature uniquely identifies semantic types") - print("- Registry maps Python types ↔ struct signatures") - - print("\n" + "=" * 60) - print("Example struct signatures:") - print("Path: struct") - print("UUID: struct") - print("Email: struct") - print("GeoLocation: struct") - - print("\n" + "=" * 60) - print("Clean user queries enabled:") - print("SELECT file_info.path FROM my_table") - print("SELECT location.latitude, location.longitude FROM my_table") - print("SELECT user_id.uuid FROM my_table") diff --git a/src/orcapod/semantic_types/unused/struct_types.py b/src/orcapod/semantic_types/unused/struct_types.py deleted file mode 100644 index 3d34588..0000000 --- a/src/orcapod/semantic_types/unused/struct_types.py +++ /dev/null @@ -1,312 +0,0 @@ -""" -Dynamic TypedDict creation for preserving Arrow struct field information in Python type hints. - -This solves the problem of converting Arrow struct types back to Python type hints -that preserve full field name and type information. -""" - -from typing import TypedDict, Dict, Type, Any, get_type_hints -import pyarrow as pa -from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry - - -class StructTypeManager: - """ - Manages dynamic TypedDict creation for Arrow struct types. - - This ensures that Arrow struct types can be converted to Python type hints - that preserve all field information. 
- """ - - def __init__(self): - # Cache created TypedDict classes to avoid duplicates - self._struct_signature_to_typeddict: Dict[pa.StructType, Type] = {} - self._typeddict_to_struct_signature: Dict[Type, pa.StructType] = {} - self._created_type_names: set[str] = set() - - def get_or_create_typeddict_for_struct( - self, - struct_type: pa.StructType, - semantic_registry: SemanticTypeRegistry | None = None, - ) -> Type: - """ - Get or create a TypedDict class that represents the Arrow struct type. - - Args: - struct_type: PyArrow struct type - semantic_registry: Optional semantic registry for nested types - - Returns: - TypedDict class that preserves all field information - """ - # Check cache first - if struct_type in self._struct_signature_to_typeddict: - return self._struct_signature_to_typeddict[struct_type] - - # Create field specifications for TypedDict - field_specs = {} - for field in struct_type: - field_name = field.name - python_type = self._convert_arrow_type_to_python_type( - field.type, semantic_registry - ) - field_specs[field_name] = python_type - - # Generate unique name for the TypedDict - type_name = self._generate_unique_type_name(field_specs) - - # Create TypedDict dynamically - typeddict_class = TypedDict(type_name, field_specs) - - # Cache the mapping - self._struct_signature_to_typeddict[struct_type] = typeddict_class - self._typeddict_to_struct_signature[typeddict_class] = struct_type - - return typeddict_class - - def get_struct_type_for_typeddict( - self, typeddict_class: Type - ) -> pa.StructType | None: - """Get the Arrow struct type for a dynamically created TypedDict.""" - return self._typeddict_to_struct_signature.get(typeddict_class) - - def is_dynamic_typeddict(self, python_type: Type) -> bool: - """Check if a type is one of our dynamically created TypedDicts.""" - return python_type in self._typeddict_to_struct_signature - - def _convert_arrow_type_to_python_type( - self, arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None - ) -> Type: - """Convert Arrow type to Python type, handling nested structs.""" - - # Handle nested struct types recursively - if pa.types.is_struct(arrow_type): - # Check if it's a registered semantic type first - if semantic_registry: - python_type = semantic_registry.get_python_type_for_struct_signature( - arrow_type - ) - if python_type: - return python_type - - # Create dynamic TypedDict for unregistered struct - return self.get_or_create_typeddict_for_struct( - arrow_type, semantic_registry - ) - - # For non-struct types, use standard conversion - from orcapod.semantic_types.semantic_converters import arrow_type_to_python - - return arrow_type_to_python(arrow_type, semantic_registry) - - def _generate_unique_type_name(self, field_specs: Dict[str, Type]) -> str: - """Generate a unique name for the TypedDict based on field specifications.""" - - # Create a descriptive name based on field names - field_names = sorted(field_specs.keys()) - if len(field_names) <= 3: - base_name = "Struct_" + "_".join(field_names) - else: - base_name = f"Struct_{len(field_names)}fields" - - # Ensure uniqueness - counter = 1 - type_name = base_name - while type_name in self._created_type_names: - type_name = f"{base_name}_{counter}" - counter += 1 - - self._created_type_names.add(type_name) - return type_name - - -# Global instance for managing struct types -_struct_type_manager = StructTypeManager() - - -def arrow_struct_to_python_type( - struct_type: pa.StructType, semantic_registry: SemanticTypeRegistry | None = None -) -> Type: - 
""" - Convert Arrow struct type to Python type hint that preserves field information. - - This creates a TypedDict that exactly matches the Arrow struct fields. - - Args: - struct_type: PyArrow struct type to convert - semantic_registry: Optional semantic registry for registered types - - Returns: - TypedDict class that preserves all field names and types - - Example: - struct -> TypedDict with name: str, age: int - """ - # First check if it's a registered semantic type - if semantic_registry: - python_type = semantic_registry.get_python_type_for_struct_signature( - struct_type - ) - if python_type: - return python_type - - # Create dynamic TypedDict for unregistered struct - return _struct_type_manager.get_or_create_typeddict_for_struct( - struct_type, semantic_registry - ) - - -def is_dynamic_struct_type(python_type: Type) -> bool: - """Check if a Python type is a dynamically created struct TypedDict.""" - return _struct_type_manager.is_dynamic_typeddict(python_type) - - -def get_struct_signature_for_dynamic_type(python_type: Type) -> pa.StructType | None: - """Get the Arrow struct signature for a dynamically created TypedDict.""" - return _struct_type_manager.get_struct_type_for_typeddict(python_type) - - -class DynamicStructConverter: - """Converter for dynamically created TypedDict structs.""" - - def __init__(self, typeddict_class: Type, struct_type: pa.StructType): - self.typeddict_class = typeddict_class - self.struct_type = struct_type - self._semantic_type_name = f"dynamic_struct_{typeddict_class.__name__.lower()}" - - @property - def semantic_type_name(self) -> str: - return self._semantic_type_name - - @property - def python_type(self) -> Type: - return self.typeddict_class - - @property - def arrow_struct_type(self) -> pa.StructType: - return self.struct_type - - def python_to_struct_dict(self, value: dict) -> dict: - """Convert TypedDict to Arrow struct dict (no conversion needed).""" - if not isinstance(value, dict): - raise TypeError( - f"Expected dict for {self.typeddict_class}, got {type(value)}" - ) - - # Validate that all required fields are present - type_hints = get_type_hints(self.typeddict_class) - for field_name in type_hints: - if field_name not in value: - raise ValueError( - f"Missing required field '{field_name}' for {self.typeddict_class}" - ) - - return value.copy() - - def struct_dict_to_python(self, struct_dict: dict) -> dict: - """Convert Arrow struct dict to TypedDict (no conversion needed).""" - return struct_dict.copy() - - def can_handle_python_type(self, python_type: Type) -> bool: - return python_type == self.typeddict_class - - -def register_dynamic_struct_converter( - registry: SemanticTypeRegistry, typeddict_class: Type, struct_type: pa.StructType -) -> None: - """Register a converter for a dynamically created TypedDict struct.""" - converter = DynamicStructConverter(typeddict_class, struct_type) - registry.register_converter(converter) - - -# Updated arrow_type_to_python function that preserves struct field information -def enhanced_arrow_type_to_python( - arrow_type: pa.DataType, semantic_registry: SemanticTypeRegistry | None = None -) -> Type: - """ - Enhanced version of arrow_type_to_python that preserves struct field information. - - For struct types, this creates TypedDict classes that preserve all field names and types. 
- """ - - # Handle struct types with full field preservation - if pa.types.is_struct(arrow_type): - return arrow_struct_to_python_type(arrow_type, semantic_registry) - - # For non-struct types, use standard conversion - from orcapod.semantic_types.semantic_converters import arrow_type_to_python - - return arrow_type_to_python(arrow_type, semantic_registry) - - -# Example usage and demonstration -if __name__ == "__main__": - print("=== Dynamic TypedDict Creation for Arrow Structs ===\n") - - from sample_converters import create_standard_semantic_registry - - # Create semantic registry - registry = create_standard_semantic_registry() - - # Test with various Arrow struct types - test_structs = [ - pa.struct([("name", pa.string()), ("age", pa.int64())]), - pa.struct([("x", pa.float64()), ("y", pa.float64()), ("z", pa.float64())]), - pa.struct( - [ - ("person", pa.struct([("name", pa.string()), ("age", pa.int64())])), - ("active", pa.bool_()), - ] - ), - ] - - print("Converting Arrow struct types to Python type hints:") - print("=" * 55) - - created_types = [] - for i, struct_type in enumerate(test_structs): - python_type = arrow_struct_to_python_type(struct_type, registry) - created_types.append(python_type) - - print(f"\nStruct {i + 1}:") - print(f" Arrow: {struct_type}") - print(f" Python: {python_type}") - print(f" Type name: {python_type.__name__}") - - # Show field information - type_hints = get_type_hints(python_type) - print(f" Fields: {type_hints}") - - print(f"\n" + "=" * 55) - print("Testing usage of created TypedDict types:") - - # Test the first created type (name, age) - PersonType = created_types[0] - person_data: PersonType = {"name": "Alice", "age": 30} - print(f"\nPerson data: {person_data}") - print( - f"Type check: {isinstance(person_data, dict)}" - ) # Still a regular dict at runtime - print(f"Field access: name={person_data['name']}, age={person_data['age']}") - - # Test nested struct type - if len(created_types) > 2: - NestedType = created_types[2] - # For nested struct, we need to create the inner struct too - inner_person: PersonType = {"name": "Bob", "age": 25} - nested_data: NestedType = {"person": inner_person, "active": True} - print(f"\nNested data: {nested_data}") - print(f"Nested access: person.name={nested_data['person']['name']}") - - print(f"\n" + "=" * 55) - print("Benefits of this approach:") - print("✓ Full field information preserved in type hints") - print("✓ Arrow struct -> Python type conversion is complete") - print("✓ Type checkers understand the structure") - print("✓ Runtime is still regular dicts (zero overhead)") - print("✓ Perfect round-trip: Python -> Arrow -> Python") - print("✓ Handles nested structs recursively") - - print( - f"\nDynamic TypedDict creation successfully preserves all Arrow struct field information!" - ) diff --git a/src/orcapod/semantic_types/unused/table_converters.py b/src/orcapod/semantic_types/unused/table_converters.py deleted file mode 100644 index d9161ef..0000000 --- a/src/orcapod/semantic_types/unused/table_converters.py +++ /dev/null @@ -1,362 +0,0 @@ -""" -Schema system for struct-based semantic types. - -This replaces the metadata-based schema handling with explicit struct types -in the Arrow schema itself. 
-""" - -from collections.abc import Mapping -from typing import Any, Protocol, Self -import pyarrow as pa - -from orcapod.types import TypeSpec -from .struct_converters import ( - StructConverter, - SemanticTypeRegistry, - SemanticStructConverter, -) - - -class SemanticSchema: - """Schema that handles semantic types as explicit struct fields.""" - - def __init__(self, python_schema: TypeSpec, registry: SemanticTypeRegistry): - """ - Create a semantic schema. - - Args: - schema_dict: Mapping of field names to Python types - registry: Semantic type registry to use - """ - self.python_schema = dict(python_schema) - # TODO: integrate with data context system - self.registry = registry # or DEFAULT_REGISTRY - self.converter = SemanticStructConverter(self.registry) - - def to_arrow_schema(self) -> pa.Schema: - """Convert to Arrow schema with semantic types as structs.""" - fields = [] - - for field_name, python_type in self.python_schema.items(): - # Check if this is a semantic type - converter = self.registry.get_converter_for_python_type(python_type) - - if converter: - # Use the struct type for semantic types - arrow_type = converter.arrow_struct_type - else: - # Use standard Arrow types for regular types - arrow_type = self._python_to_arrow_type(python_type) - - fields.append(pa.field(field_name, arrow_type)) - - return pa.schema(fields) - - def _python_to_arrow_type(self, python_type: type) -> pa.DataType: - """Convert Python type to Arrow type for non-semantic types.""" - type_mapping = { - int: pa.int64(), - float: pa.float64(), - str: pa.large_string(), - bool: pa.bool_(), - bytes: pa.binary(), - } - - if python_type in type_mapping: - return type_mapping[python_type] - else: - raise TypeError(f"Unsupported Python type: {python_type}") - - @classmethod - def from_arrow_schema( - cls, arrow_schema: pa.Schema, registry: SemanticTypeRegistry - ) -> "SemanticSchema": - """Create SemanticSchema from Arrow schema.""" - schema_dict = {} - - for field in arrow_schema: - if pa.types.is_struct(field.type): - # Check if this is a semantic struct - converter = registry.get_converter_for_struct_type(field.type) - if converter: - schema_dict[field.name] = converter.python_type - else: - # Regular struct - not supported yet - # TODO: support by constructing typed dictionary - raise ValueError( - f"Non-semantic struct types not supported: {field.type}" - ) - else: - # Regular Arrow type - schema_dict[field.name] = cls._arrow_to_python_type(field.type) - - return cls(schema_dict, registry) - - @staticmethod - def _arrow_to_python_type(arrow_type: pa.DataType) -> type: - """Convert Arrow type to Python type.""" - if pa.types.is_integer(arrow_type): - return int - elif pa.types.is_floating(arrow_type): - return float - elif pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type): - return str - elif pa.types.is_boolean(arrow_type): - return bool - elif pa.types.is_binary(arrow_type): - return bytes - else: - raise TypeError(f"Unsupported Arrow type: {arrow_type}") - - def get_semantic_fields(self) -> dict[str, type]: - """Get fields that are semantic types.""" - semantic_fields = {} - for field_name, python_type in self.python_schema.items(): - if self.registry.has_python_type(python_type): - semantic_fields[field_name] = python_type - return semantic_fields - - def get_regular_fields(self) -> dict[str, type]: - """Get fields that are regular (non-semantic) types.""" - regular_fields = {} - for field_name, python_type in self.python_schema.items(): - if not 
self.registry.has_python_type(python_type): - regular_fields[field_name] = python_type - return regular_fields - - -class SemanticTableConverter(Protocol): - """Protocol for semantic table converters. - - This defines the interface for converting between Python dicts and Arrow tables - with semantic types. - """ - - def get_struct_converter(self, field: str) -> StructConverter | None: - """Get struct converter for a specific field in table.""" - ... - - def python_dict_to_struct_dict( - self, data_dict: Mapping[str, Any] - ) -> dict[str, Any]: - """Convert Python dict to struct dict for semantic fields.""" - ... - - def struct_dict_to_python_dict( - self, struct_dict: Mapping[str, Any] - ) -> dict[str, Any]: - """Convert struct dict back to Python dict for semantic fields.""" - ... - - def python_dict_to_arrow_table(self, data_dict: dict[str, Any]) -> pa.Table: - """Convert single Python dict to Arrow table.""" - ... - - def python_dicts_to_arrow_table(self, data_dicts: list[dict[str, Any]]) -> pa.Table: - """Convert list of Python dicts to Arrow table with semantic structs.""" - ... - - def arrow_table_to_python_dicts(self, table: pa.Table) -> list[dict[str, Any]]: - """Convert Arrow table back to list of Python dicts.""" - ... - - -class SchemaSemanticTableConverter: - """Schema-specific semantic converter that pre-resolves semantic type converters for efficiency. - - This converter is optimized for batch processing of data with a consistent schema. - It pre-resolves all semantic type converters during initialization to avoid - repeated registry lookups during data conversion. - """ - - def __init__(self, schema: SemanticSchema): - """ - Create converter for a specific schema. - - Args: - schema: Semantic schema defining field types and semantic mappings - """ - self.schema = schema - - # Pre-resolve converters for each semantic field (performance optimization) - self.field_converters: dict[str, StructConverter] = {} - self.semantic_fields = set() - self.regular_fields = set() - - for field_name, python_type in schema.python_schema.items(): - converter = self.schema.registry.get_converter_for_python_type(python_type) - if converter: - self.field_converters[field_name] = converter - self.semantic_fields.add(field_name) - else: - self.regular_fields.add(field_name) - - def get_semantic_fields(self) -> tuple[str, ...]: - """Get names of fields that are semantic types.""" - return tuple(self.field_converters.keys()) - - def get_struct_converter_for_field(self, field: str) -> StructConverter | None: - """Get struct converter for a specific field.""" - return self.field_converters.get(field) - - @classmethod - def from_python_schema( - cls, python_schema: TypeSpec, registry: SemanticTypeRegistry - ) -> Self: - """Factory method to create converter from schema.""" - return cls(SemanticSchema(python_schema, registry)) - - @classmethod - def from_arrow_schema( - cls, arrow_schema: "pa.Schema", registry: SemanticTypeRegistry - ) -> Self: - return cls(SemanticSchema.from_arrow_schema(arrow_schema, registry)) - - def python_dict_to_struct_dict( - self, data_dict: Mapping[str, Any] - ) -> dict[str, Any]: - """Convert Python dict to struct dict for semantic fields.""" - result = dict(data_dict) - - for field_name, converter in self.field_converters.items(): - if field_name in result and result[field_name] is not None: - result[field_name] = converter.python_to_struct_dict(result[field_name]) - - return result - - def struct_dict_to_python_dict( - self, struct_dict: Mapping[str, Any] - ) -> dict[str, 
Any]: - """Convert struct dict back to Python dict for semantic fields.""" - result = dict(struct_dict) - - for field_name, converter in self.field_converters.items(): - if field_name in result and result[field_name] is not None: - if isinstance(result[field_name], dict): - result[field_name] = converter.struct_dict_to_python( - result[field_name] - ) - - return result - - def python_dicts_to_arrow_table(self, data_dicts: list[dict[str, Any]]) -> pa.Table: - """Convert list of Python dicts to Arrow table with semantic structs.""" - if not data_dicts: - raise ValueError("Cannot create table from empty list") - - # Process each field using pre-resolved converters - arrow_data = {} - - for field_name in self.schema.python_schema.keys(): - values = [d.get(field_name) for d in data_dicts] - - if field_name in self.field_converters: - # Semantic field - convert to structs using pre-resolved converter - converter = self.field_converters[field_name] - struct_dicts = [] - for value in values: - if value is not None: - struct_dicts.append(converter.python_to_struct_dict(value)) - else: - struct_dicts.append(None) - arrow_data[field_name] = pa.array( - struct_dicts, type=converter.arrow_struct_type - ) - else: - # Regular field - arrow_data[field_name] = pa.array(values) - - return pa.table(arrow_data, schema=self.schema.to_arrow_schema()) - - def arrow_table_to_python_dicts(self, table: pa.Table) -> list[dict[str, Any]]: - """Convert Arrow table back to list of Python dicts.""" - # Convert table to list of dictionaries - raw_dicts = table.to_pylist() - - # Process each dictionary to convert structs back to Python objects - python_dicts = [] - for raw_dict in raw_dicts: - python_dict = {} - for field_name, value in raw_dict.items(): - if field_name in self.field_converters and isinstance(value, dict): - # Convert semantic struct back to Python object using pre-resolved converter - converter = self.field_converters[field_name] - python_dict[field_name] = converter.struct_dict_to_python(value) - else: - # Regular value - python_dict[field_name] = value - python_dicts.append(python_dict) - - return python_dicts - - def python_dict_to_arrow_table(self, data_dict: dict[str, Any]) -> pa.Table: - """Convert single Python dict to Arrow table.""" - return self.python_dicts_to_arrow_table([data_dict]) - - -class AutoSemanticTableConverter: - """General-purpose converter for working with semantic types without pre-defined schema.""" - - def __init__(self, registry: SemanticTypeRegistry): - self.registry = registry - self.struct_converter = SemanticStructConverter(self.registry) - - def python_dict_to_arrow_table( - self, data_dict: dict[str, Any], schema: SemanticSchema | None = None - ) -> pa.Table: - """Convert dictionary of Python values to Arrow table.""" - if schema is None: - # Infer schema from data - schema_dict = {key: type(value) for key, value in data_dict.items()} - schema = SemanticSchema(schema_dict, self.registry) - - # Use schema-specific converter for efficiency - converter = SchemaSemanticTableConverter(schema) - return converter.python_dict_to_arrow_table(data_dict) - - def arrow_table_to_python_dicts(self, table: pa.Table) -> list[dict[str, Any]]: - """Convert Arrow table back to list of Python dictionaries.""" - # Infer schema from Arrow table - schema = SemanticSchema.from_arrow_schema(table.schema, self.registry) - - # Use schema-specific converter for efficiency - converter = SchemaSemanticTableConverter(schema) - return converter.arrow_table_to_python_dicts(table) - - def 
python_dicts_to_arrow_table( - self, dicts: list[dict[str, Any]], schema: SemanticSchema | None = None - ) -> pa.Table: - """Convert list of Python dictionaries to Arrow table.""" - if not dicts: - raise ValueError("Cannot create table from empty list") - - if schema is None: - # Infer schema from first dictionary - schema_dict = {key: type(value) for key, value in dicts[0].items()} - schema = SemanticSchema(schema_dict, self.registry) - - # Use schema-specific converter for efficiency - converter = SchemaSemanticTableConverter(schema) - return converter.python_dicts_to_arrow_table(dicts) - - -# Utility functions for working with semantic tables -def create_semantic_table( - data: dict[str, Any] | list[dict[str, Any]], - registry: SemanticTypeRegistry, -) -> pa.Table: - """Convenience function to create Arrow table with semantic types.""" - converter = SemanticTableConverter(registry) - - if isinstance(data, dict): - return converter.python_dict_to_arrow_table(data) - else: - return converter.python_dicts_to_arrow_table(data) - - -def extract_python_data( - table: pa.Table, registry: SemanticTypeRegistry -) -> list[dict[str, Any]]: - """Convenience function to extract Python data from semantic table.""" - converter = SemanticTableConverter(registry) - return converter.arrow_table_to_python_dicts(table) From c575c80c589dc2d700009723b7b5280218f6b635 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 19:38:53 -0700 Subject: [PATCH 205/224] type: handle input PythonSchema case --- src/orcapod/data/datagrams/arrow_datagram.py | 2 +- src/orcapod/data/datagrams/dict_datagram.py | 4 ++-- src/orcapod/data/operators/base.py | 14 +++++++------- src/orcapod/data/streams.py | 2 +- src/orcapod/protocols/semantic_types_protocols.py | 9 +++++---- src/orcapod/types.py | 4 +++- 6 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/data/datagrams/arrow_datagram.py index a6cc85a..2ff463e 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/data/datagrams/arrow_datagram.py @@ -215,7 +215,7 @@ def types( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> dict[str, type]: + ) -> PythonSchema: """ Return Python schema for the datagram. diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index dcee19d..6912f57 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -7,7 +7,7 @@ from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.semantic_types import infer_python_schema_from_pylist_data -from orcapod.types import DataValue +from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.protocols.hashing_protocols import ContentHash @@ -215,7 +215,7 @@ def types( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> dict[str, type]: + ) -> PythonSchema: """ Return Python schema for the datagram. 
diff --git a/src/orcapod/data/operators/base.py b/src/orcapod/data/operators/base.py index 7cf5bf4..1d5f07c 100644 --- a/src/orcapod/data/operators/base.py +++ b/src/orcapod/data/operators/base.py @@ -1,7 +1,7 @@ from ast import Not from orcapod.data.kernels import TrackedKernelBase from orcapod.protocols import data_protocols as dp -from orcapod.types import TypeSpec +from orcapod.types import PythonSchema from abc import abstractmethod from typing import Any from collections.abc import Collection @@ -63,7 +63,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: stream = streams[0] return self.op_output_types(stream, include_system_tags=include_system_tags) @@ -98,7 +98,7 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: @abstractmethod def op_output_types( self, stream: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes two streams as input and returns a tuple of typespecs. @@ -145,7 +145,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: left_stream, right_stream = streams return self.op_output_types( left_stream, right_stream, include_system_tags=include_system_tags @@ -187,7 +187,7 @@ def op_output_types( left_stream: dp.Stream, right_stream: dp.Stream, include_system_tags: bool = False, - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes two streams as input and returns a tuple of typespecs. @@ -240,7 +240,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: return self.op_output_types(*streams, include_system_tags=include_system_tags) def kernel_identity_structure( @@ -271,7 +271,7 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream: @abstractmethod def op_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[TypeSpec, TypeSpec]: + ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes at least one stream as input and returns a tuple of typespecs. diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index d928b51..43365c7 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -687,7 +687,7 @@ def keys( def types( self, include_system_tags: bool = False - ) -> tuple[dict[str, type], dict[str, type]]: + ) -> tuple[PythonSchema, PythonSchema]: """ Returns the types of the tag and packet columns in the stream. This is useful for accessing the types of the columns in the stream. 
diff --git a/src/orcapod/protocols/semantic_types_protocols.py b/src/orcapod/protocols/semantic_types_protocols.py index 52a78cf..855f8a0 100644 --- a/src/orcapod/protocols/semantic_types_protocols.py +++ b/src/orcapod/protocols/semantic_types_protocols.py @@ -1,5 +1,6 @@ from typing import TYPE_CHECKING, Any, Protocol from collections.abc import Callable +from orcapod.types import PythonSchema, PythonSchemaLike if TYPE_CHECKING: @@ -10,19 +11,19 @@ class TypeConverter(Protocol): def python_type_to_arrow_type(self, python_type: type) -> "pa.DataType": ... def python_schema_to_arrow_schema( - self, python_schema: dict[str, type] + self, python_schema: PythonSchemaLike ) -> "pa.Schema": ... def arrow_type_to_python_type(self, arrow_type: "pa.DataType") -> type: ... def arrow_schema_to_python_schema( self, arrow_schema: "pa.Schema" - ) -> dict[str, type]: ... + ) -> PythonSchema: ... def python_dicts_to_struct_dicts( self, python_dicts: list[dict[str, Any]], - python_schema: dict[str, type] | None = None, + python_schema: PythonSchemaLike | None = None, ) -> list[dict[str, Any]]: ... def struct_dicts_to_python_dicts( @@ -34,7 +35,7 @@ def struct_dicts_to_python_dicts( def python_dicts_to_arrow_table( self, python_dicts: list[dict[str, Any]], - python_schema: dict[str, type] | None = None, + python_schema: PythonSchemaLike | None = None, arrow_schema: "pa.Schema | None" = None, ) -> "pa.Table": ... diff --git a/src/orcapod/types.py b/src/orcapod/types.py index 32b87df..745568e 100644 --- a/src/orcapod/types.py +++ b/src/orcapod/types.py @@ -9,7 +9,9 @@ DataType: TypeAlias = type | UnionType -PythonSchema: TypeAlias = dict[ +PythonSchema: TypeAlias = dict[str, DataType] # dict of parameter names to their types + +PythonSchemaLike: TypeAlias = Mapping[ str, DataType ] # Mapping of parameter names to their types From 4acd1574e05cac97510ebeb7313aaf85e1a3d540 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Tue, 26 Aug 2025 19:43:18 -0700 Subject: [PATCH 206/224] type: further fix on python schema --- src/orcapod/data/datagrams/arrow_tag_packet.py | 4 ++-- src/orcapod/data/datagrams/dict_datagram.py | 4 ++-- src/orcapod/data/datagrams/dict_tag_packet.py | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/data/datagrams/arrow_tag_packet.py index d57e906..e0be151 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/data/datagrams/arrow_tag_packet.py @@ -92,7 +92,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_system_tags: bool = False, - ) -> dict[str, type]: + ) -> PythonSchema: """Return copy of the Python schema.""" schema = super().types( include_all_info=include_all_info, @@ -303,7 +303,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> dict[str, type]: + ) -> PythonSchema: """Return copy of the Python schema.""" schema = super().types( include_all_info=include_all_info, diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/data/datagrams/dict_datagram.py index 6912f57..a56bf1a 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/data/datagrams/dict_datagram.py @@ -7,7 +7,7 @@ from orcapod import contexts from orcapod.data.datagrams.base import BaseDatagram from orcapod.semantic_types import infer_python_schema_from_pylist_data -from orcapod.types import DataValue, PythonSchema +from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import arrow_utils from orcapod.protocols.hashing_protocols import ContentHash @@ -57,7 +57,7 @@ class DictDatagram(BaseDatagram): def __init__( self, data: Mapping[str, DataValue], - python_schema: dict[str, type] | None = None, + python_schema: PythonSchemaLike | None = None, meta_info: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, ) -> None: diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/data/datagrams/dict_tag_packet.py index 64575d3..be6160b 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/data/datagrams/dict_tag_packet.py @@ -9,7 +9,7 @@ from orcapod.data.datagrams.dict_datagram import DictDatagram from orcapod.utils import arrow_utils from orcapod.semantic_types import infer_python_schema_from_pylist_data -from orcapod.types import DataValue +from orcapod.types import DataValue, PythonSchema, PythonSchemaLike logger = logging.getLogger(__name__) @@ -54,7 +54,7 @@ def __init__( ) self._system_tags = {**extracted_system_tags, **(system_tags or {})} - self._system_tags_python_schema: dict[str, type] = ( + self._system_tags_python_schema: PythonSchema = ( infer_python_schema_from_pylist_data([self._system_tags]) ) self._cached_system_tags_table: pa.Table | None = None @@ -141,7 +141,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_system_tags: bool = False, - ) -> dict[str, type]: + ) -> PythonSchema: """Return copy of the Python schema.""" schema = super().types( include_all_info=include_all_info, @@ -266,7 +266,7 @@ def __init__( data: Mapping[str, DataValue], meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, - python_schema: dict[str, type] | None = None, + python_schema: PythonSchemaLike | None = None, 
data_context: str | contexts.DataContext | None = None, ) -> None: # normalize the data content and remove any source info keys @@ -391,7 +391,7 @@ def types( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> dict[str, type]: + ) -> PythonSchema: """Return copy of the Python schema.""" schema = super().types( include_all_info=include_all_info, From d5c53401b463826b5a51f0720f1d481feda4d12d Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 20:37:02 -0700 Subject: [PATCH 207/224] feat: add support for system tags and source id info --- src/orcapod/data/arrow_data_utils.py | 60 +++++++++++++ .../data/sources/arrow_table_source.py | 48 ++++++++--- src/orcapod/data/sources/dict_source.py | 86 ++++--------------- 3 files changed, 113 insertions(+), 81 deletions(-) create mode 100644 src/orcapod/data/arrow_data_utils.py diff --git a/src/orcapod/data/arrow_data_utils.py b/src/orcapod/data/arrow_data_utils.py new file mode 100644 index 0000000..1c47599 --- /dev/null +++ b/src/orcapod/data/arrow_data_utils.py @@ -0,0 +1,60 @@ +# Collection of functions to work with Arrow table data that underlies streams and/or datagrams +from orcapod.utils.lazy_module import LazyModule +from typing import TYPE_CHECKING +from orcapod.data.system_constants import constants +from collections.abc import Collection + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +def drop_columns_with_prefix( + table: "pa.Table", + prefix: str | tuple[str, ...], + exclude_columns: Collection[str] = (), +) -> "pa.Table": + """Drop columns with a specific prefix from an Arrow table.""" + columns_to_drop = [ + col + for col in table.column_names + if col.startswith(prefix) and col not in exclude_columns + ] + return table.drop(columns=columns_to_drop) + + +def drop_system_columns( + table, + system_column_prefix: tuple[str, ...] 
= ( + constants.META_PREFIX, + constants.DATAGRAM_PREFIX, + ), +) -> "pa.Table": + return drop_columns_with_prefix(table, system_column_prefix) + + +def add_source_info( + table: "pa.Table", + source_info: str | None, + exclude_prefixes: Collection[str] = ( + constants.META_PREFIX, + constants.DATAGRAM_PREFIX, + ), + exclude_columns: Collection[str] = (), +) -> "pa.Table": + """Add source information to an Arrow table.""" + # Create a new column with the source information + source_column = pa.array([source_info] * table.num_rows) + + # identify columns for which source columns should be created + + for col in table.column_names: + if col.startswith(tuple(exclude_prefixes)) or col in exclude_columns: + continue + source_column = pa.array( + [f"{source_info}:{col}"] * table.num_rows, type=pa.large_string() + ) + table = table.append_column(f"{constants.SOURCE_PREFIX}{col}", source_column) + + return table diff --git a/src/orcapod/data/sources/arrow_table_source.py b/src/orcapod/data/sources/arrow_table_source.py index fc98109..8e71b0d 100644 --- a/src/orcapod/data/sources/arrow_table_source.py +++ b/src/orcapod/data/sources/arrow_table_source.py @@ -1,14 +1,13 @@ -from collections.abc import Collection, Mapping +from collections.abc import Collection from typing import TYPE_CHECKING, Any from orcapod.data.streams import TableStream from orcapod.protocols import data_protocols as dp -from orcapod.types import DataValue -from orcapod.utils import arrow_utils +from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants -from orcapod.semantic_types import infer_python_schema_from_pylist_data +from orcapod.data import arrow_data_utils if TYPE_CHECKING: import pyarrow as pa @@ -25,26 +24,47 @@ def __init__( self, table: "pa.Table", tag_columns: Collection[str] = (), - system_tag_columns: Collection[str] = (), - source_info: dict[str, str | None] | None = None, + source_info: str | None = None, **kwargs, ): super().__init__(**kwargs) - self.table = table - self.tag_columns = tag_columns - self.system_tag_columns = system_tag_columns + + # clean the table, dropping any system columns + # TODO: consider special treatment of system columns if provided + table = arrow_data_utils.drop_system_columns(table) + self.table_hash = self.data_context.arrow_hasher.hash_table(table) + + if source_info is None: + source_info = f"arrow_table:{self.table_hash.to_hex(char_count=16)}" self.source_info = source_info - self.table_hash = self.data_context.arrow_hasher.hash_table(self.table) + + self.tag_columns = [col for col in tag_columns if col in table.column_names] + + # add system tag column, indexing into the array + system_tag_column = pa.array(list(range(table.num_rows)), pa.int64()) + + table = table.add_column( + 0, f"{constants.SYSTEM_TAG_PREFIX}{self.source_info}", system_tag_column + ) + + # add source info + self._table = arrow_data_utils.add_source_info( + table, self.source_info, exclude_columns=tag_columns + ) + self._table_stream = TableStream( - table=self.table, + table=self._table, tag_columns=self.tag_columns, - system_tag_columns=self.system_tag_columns, source=self, upstreams=(), ) + @property + def table(self) -> "pa.Table": + return self._table + def source_identity_structure(self) -> Any: - return (self.__class__.__name__, self.table_hash) + return (self.__class__.__name__, self.source_info, self.table_hash) def get_all_records( self, include_system_columns: bool = False @@ -61,6 +81,6 @@ def forward(self, 
*streams: dp.Stream) -> dp.Stream: def source_output_types( self, include_system_tags: bool = False - ) -> tuple[dict[str, type], dict[str, type]]: + ) -> tuple[PythonSchema, PythonSchema]: """Return tag and packet types based on provided typespecs.""" return self._table_stream.types(include_system_tags=include_system_tags) diff --git a/src/orcapod/data/sources/dict_source.py b/src/orcapod/data/sources/dict_source.py index 42456e8..bd5d114 100644 --- a/src/orcapod/data/sources/dict_source.py +++ b/src/orcapod/data/sources/dict_source.py @@ -1,14 +1,12 @@ -from collections.abc import Collection, Mapping +from collections.abc import Collection from typing import TYPE_CHECKING, Any -from orcapod.data.streams import TableStream from orcapod.protocols import data_protocols as dp -from orcapod.types import DataValue -from orcapod.utils import arrow_utils +from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants -from orcapod.semantic_types import infer_python_schema_from_pylist_data +from orcapod.data.sources.arrow_table_source import ArrowTableSource if TYPE_CHECKING: import pyarrow as pa @@ -65,49 +63,28 @@ class DictSource(SourceBase): def __init__( self, data: Collection[dict[str, DataValue]], - tag_columns: Collection[str], - tag_schema: Mapping[str, type] | None = None, - packet_schema: Mapping[str, type] | None = None, + tag_columns: Collection[str] = (), + system_tag_columns: Collection[str] = (), + data_schema: PythonSchemaLike | None = None, **kwargs, ): super().__init__(**kwargs) - data = list(data) - tags = [] - packets = [] - for item in data: - tags.append({k: item[k] for k in tag_columns}) - packets.append({k: item[k] for k in item if k not in tag_columns}) - - # TODO: visit source info logic - source_info = ":".join(self.kernel_id) - - raw_data, system_data = split_system_columns(data) - - self.tags = tags - self.packets = [add_source_field(packet, source_info) for packet in packets] - - self.tag_schema = ( - dict(tag_schema) - if tag_schema - else infer_python_schema_from_pylist_data(self.tags) + arrow_table = self.data_context.type_converter.python_dicts_to_arrow_table( + list(data), python_schema=data_schema ) - self.packet_schema = ( - dict(packet_schema) - if packet_schema - else infer_python_schema_from_pylist_data(self.packets) + self._table_source = ArrowTableSource( + arrow_table, tag_columns=tag_columns, system_tag_columns=system_tag_columns ) def source_identity_structure(self) -> Any: - return ( - self.__class__.__name__, - tuple(self.tag_schema.items()), - tuple(self.packet_schema.items()), - ) + return self._table_source.source_identity_structure() def get_all_records( self, include_system_columns: bool = False ) -> "pa.Table | None": - return self().as_table(include_source=include_system_columns) + return self._table_source.get_all_records( + include_system_columns=include_system_columns + ) def forward(self, *streams: dp.Stream) -> dp.Stream: """ @@ -115,38 +92,13 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: This is called by forward() and creates a fresh snapshot each time. 
""" - tag_arrow_schema = ( - self._data_context.type_converter.python_schema_to_arrow_schema( - self.tag_schema - ) - ) - packet_arrow_schema = ( - self._data_context.type_converter.python_schema_to_arrow_schema( - self.packet_schema - ) - ) - - joined_data = [ - {**tag, **packet} for tag, packet in zip(self.tags, self.packets) - ] - - table = pa.Table.from_pylist( - joined_data, - schema=arrow_utils.join_arrow_schemas( - tag_arrow_schema, packet_arrow_schema - ), - ) - - return TableStream( - table=table, - tag_columns=self.tag_keys, - source=self, - upstreams=(), - ) + return self._table_source.forward(*streams) def source_output_types( self, include_system_tags: bool = False - ) -> tuple[dict[str, type], dict[str, type]]: + ) -> tuple[PythonSchema, PythonSchema]: """Return tag and packet types based on provided typespecs.""" # TODO: add system tag - return self.tag_schema, self.packet_schema + return self._table_source.source_output_types( + include_system_tags=include_system_tags + ) From 42308f9821d5287fd705097d35cafbd882ae904c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Tue, 26 Aug 2025 20:38:22 -0700 Subject: [PATCH 208/224] build: ignore test notebooks --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1e38613..76b840b 100644 --- a/.gitignore +++ b/.gitignore @@ -18,8 +18,9 @@ notebooks/**/*.db # Ignore profiler output *.prof -# Ignore any notebook that starts with an underscore +# Ignore any notebook that starts with an underscore or test notebooks/**/_*.ipynb +notebooks/**/test*.ipynb # Ignore vscode settings .vscode/ From e60a6e7ab3565828c391881f9f5bb878f312499c Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 27 Aug 2025 15:48:08 -0700 Subject: [PATCH 209/224] refactor: rename stores to databases --- src/orcapod/__init__.py | 4 +-- src/orcapod/data/pods.py | 30 +++++++++---------- src/orcapod/{stores => databases}/__init__.py | 2 +- .../delta_lake_databases.py} | 6 ++-- .../{stores => databases}/file_utils.py | 0 .../legacy/delta_table_arrow_data_store.py | 0 .../legacy/dict_data_stores.py | 2 +- .../legacy/dict_transfer_data_store.py | 2 +- .../legacy/legacy_arrow_data_stores.py | 2 +- .../legacy/safe_dir_data_store.py | 0 .../{stores => databases}/legacy/types.py | 0 src/orcapod/pipeline/graph.py | 30 +++++++++---------- src/orcapod/pipeline/nodes.py | 30 +++++++++---------- ...ore_protocols.py => database_protocols.py} | 6 ++-- tests/test_store/test_dir_data_store.py | 2 +- tests/test_store/test_integration.py | 2 +- tests/test_store/test_noop_data_store.py | 4 +-- tests/test_store/test_transfer_data_store.py | 4 +-- 18 files changed, 62 insertions(+), 64 deletions(-) rename src/orcapod/{stores => databases}/__init__.py (89%) rename src/orcapod/{stores/delta_lake_stores.py => databases/delta_lake_databases.py} (99%) rename src/orcapod/{stores => databases}/file_utils.py (100%) rename src/orcapod/{stores => databases}/legacy/delta_table_arrow_data_store.py (100%) rename src/orcapod/{stores => databases}/legacy/dict_data_stores.py (99%) rename src/orcapod/{stores => databases}/legacy/dict_transfer_data_store.py (97%) rename src/orcapod/{stores => databases}/legacy/legacy_arrow_data_stores.py (99%) rename src/orcapod/{stores => databases}/legacy/safe_dir_data_store.py (100%) rename src/orcapod/{stores => databases}/legacy/types.py (100%) rename src/orcapod/protocols/{store_protocols.py => database_protocols.py} (91%) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 
cd9f09a..2a83782 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -2,7 +2,7 @@ from .data.pods import function_pod, FunctionPod, CachedPod from .data import streams from .data import operators -from . import stores +from . import databases from .pipeline import Pipeline @@ -15,7 +15,7 @@ "FunctionPod", "CachedPod", "streams", - "stores", + "databases", "operators", "Pipeline", ] diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index bd41151..bc7eb49 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -16,7 +16,7 @@ from orcapod.data.system_constants import constants from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp -from orcapod.protocols.store_protocols import ArrowDataStore +from orcapod.protocols.database_protocols import ArrowDatabase from orcapod.types import DataValue, PythonSchema from orcapod.utils import types_utils from orcapod.utils.lazy_module import LazyModule @@ -341,7 +341,7 @@ def tiered_pod_id(self) -> dict[str, str]: } @property - def kernel_id(self) -> tuple[str, ...]: + def reference(self) -> tuple[str, ...]: return ( self.function_name, self._output_packet_type_hash, @@ -360,14 +360,14 @@ def get_record_id( prefix_hasher_id=True, ) - def input_packet_types(self) -> dict[str, type]: + def input_packet_types(self) -> PythonSchema: """ Return the input typespec for the function pod. This is used to validate the input streams. """ return self._input_packet_schema.copy() - def output_packet_types(self) -> dict[str, type]: + def output_packet_types(self) -> PythonSchema: """ Return the output typespec for the function pod. This is used to validate the output streams. @@ -420,7 +420,7 @@ def call( # if record_id is not provided, generate it from the packet record_id = self.get_record_id(packet, execution_engine_hash) source_info = { - k: ":".join(self.kernel_id + (record_id, k)) for k in output_data + k: ":".join(self.reference + (record_id, k)) for k in output_data } output_packet = DictPacket( @@ -470,7 +470,7 @@ async def async_call( # if record_id is not provided, generate it from the packet record_id = self.get_record_id(packet, execution_engine_hash) source_info = { - k: ":".join(self.kernel_id + (record_id, k)) for k in output_data + k: ":".join(self.reference + (record_id, k)) for k in output_data } output_packet = DictPacket( @@ -504,7 +504,7 @@ def process_function_output(self, values: Any) -> dict[str, DataValue]: def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None ) -> Any: - id_struct = (self.__class__.__name__,) + self.kernel_id + id_struct = (self.__class__.__name__,) + self.reference # if streams are provided, perform pre-processing step, validate, and add the # resulting single stream to the identity structure if streams is not None and len(streams) != 0: @@ -538,12 +538,12 @@ def __init__( self.pod = pod @property - def kernel_id(self) -> tuple[str, ...]: + def reference(self) -> tuple[str, ...]: """ Return the pod ID, which is the function name of the wrapped pod. This is used to identify the pod in the system. """ - return self.pod.kernel_id + return self.pod.reference def get_record_id(self, packet: dp.Packet, execution_engine_hash: str) -> str: return self.pod.get_record_id(packet, execution_engine_hash) @@ -621,7 +621,7 @@ class CachedPod(WrappedPod): def __init__( self, pod: dp.Pod, - result_store: ArrowDataStore, + result_database: ArrowDatabase, record_path_prefix: tuple[str, ...] 
= (), match_tier: str | None = None, retrieval_mode: Literal["latest", "most_specific"] = "latest", @@ -629,7 +629,7 @@ def __init__( ): super().__init__(pod, **kwargs) self.record_path_prefix = record_path_prefix - self.result_store = result_store + self.result_database = result_database self.match_tier = match_tier self.retrieval_mode = retrieval_mode @@ -643,7 +643,7 @@ def record_path(self) -> tuple[str, ...]: Return the path to the record in the result store. This is used to store the results of the pod. """ - return self.record_path_prefix + self.kernel_id + return self.record_path_prefix + self.reference def call( self, @@ -754,7 +754,7 @@ def record_packet( input_packet, execution_engine_hash=execution_engine_hash ) - self.result_store.add_record( + self.result_database.add_record( self.record_path, record_id, data_table, @@ -788,7 +788,7 @@ def get_cached_output_for_packet(self, input_packet: dp.Packet) -> dp.Packet | N self.pod.tiered_pod_id[self.match_tier] ) - result_table = self.result_store.get_records_with_column_value( + result_table = self.result_database.get_records_with_column_value( self.record_path, constraints, ) @@ -848,7 +848,7 @@ def get_all_cached_outputs( record_id_column = ( constants.PACKET_RECORD_ID if include_system_columns else None ) - result_table = self.result_store.get_all_records( + result_table = self.result_database.get_all_records( self.record_path, record_id_column=record_id_column ) if result_table is None or result_table.num_rows == 0: diff --git a/src/orcapod/stores/__init__.py b/src/orcapod/databases/__init__.py similarity index 89% rename from src/orcapod/stores/__init__.py rename to src/orcapod/databases/__init__.py index cd9ae0f..f47c734 100644 --- a/src/orcapod/stores/__init__.py +++ b/src/orcapod/databases/__init__.py @@ -13,4 +13,4 @@ # "SimpleParquetDataStore", # ] -from .delta_lake_stores import DeltaTableStore +from .delta_lake_databases import DeltaTableDatabase diff --git a/src/orcapod/stores/delta_lake_stores.py b/src/orcapod/databases/delta_lake_databases.py similarity index 99% rename from src/orcapod/stores/delta_lake_stores.py rename to src/orcapod/databases/delta_lake_databases.py index 8f2f48b..4457f0f 100644 --- a/src/orcapod/stores/delta_lake_stores.py +++ b/src/orcapod/databases/delta_lake_databases.py @@ -1,4 +1,3 @@ -from multiprocessing import Value from pathlib import Path from typing import Any, Literal, TYPE_CHECKING, cast import logging @@ -7,7 +6,6 @@ from collections import defaultdict from collections.abc import Collection, Mapping -from pyarrow import Table from orcapod.data import constants from orcapod.utils.lazy_module import LazyModule @@ -24,9 +22,9 @@ logger = logging.getLogger(__name__) -class DeltaTableStore: +class DeltaTableDatabase: """ - A Delta table store with clear insert vs update semantics. + A Delta table database with clear insert vs update semantics. - insert(): Never overwrites existing records by default. Can skip duplicates if requested. Can be batched for performance. Supports composite keys. 
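For illustration only (not part of the patch): the pods.py hunks above rename the backing store but leave the record-path and provenance scheme unchanged. A minimal sketch of how the record path and per-column source info are composed from a pod's reference, following the expressions in the diff; the tuple components and hash strings are made-up placeholders:

    record_path_prefix = ("results",)
    reference = ("add_numbers", "f3a9c1d2e4b5a6c7d8e9", "v0.0")  # placeholder components
    record_id = "0123456789abcdef0123"                           # placeholder packet hash

    record_path = record_path_prefix + reference   # where the cached pod stores its records
    source_info = {k: ":".join(reference + (record_id, k)) for k in ("total",)}
    # {'total': 'add_numbers:f3a9c1d2e4b5a6c7d8e9:v0.0:0123456789abcdef0123:total'}
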
diff --git a/src/orcapod/stores/file_utils.py b/src/orcapod/databases/file_utils.py similarity index 100% rename from src/orcapod/stores/file_utils.py rename to src/orcapod/databases/file_utils.py diff --git a/src/orcapod/stores/legacy/delta_table_arrow_data_store.py b/src/orcapod/databases/legacy/delta_table_arrow_data_store.py similarity index 100% rename from src/orcapod/stores/legacy/delta_table_arrow_data_store.py rename to src/orcapod/databases/legacy/delta_table_arrow_data_store.py diff --git a/src/orcapod/stores/legacy/dict_data_stores.py b/src/orcapod/databases/legacy/dict_data_stores.py similarity index 99% rename from src/orcapod/stores/legacy/dict_data_stores.py rename to src/orcapod/databases/legacy/dict_data_stores.py index 718fef0..63d7974 100644 --- a/src/orcapod/stores/legacy/dict_data_stores.py +++ b/src/orcapod/databases/legacy/dict_data_stores.py @@ -7,7 +7,7 @@ from orcapod.hashing.legacy_core import hash_packet from orcapod.hashing.types import LegacyPacketHasher from orcapod.hashing.defaults import get_default_composite_file_hasher -from orcapod.stores.legacy.types import DataStore +from orcapod.databases.legacy.types import DataStore from orcapod.types import Packet, PacketLike logger = logging.getLogger(__name__) diff --git a/src/orcapod/stores/legacy/dict_transfer_data_store.py b/src/orcapod/databases/legacy/dict_transfer_data_store.py similarity index 97% rename from src/orcapod/stores/legacy/dict_transfer_data_store.py rename to src/orcapod/databases/legacy/dict_transfer_data_store.py index fe7a52a..99709e8 100644 --- a/src/orcapod/stores/legacy/dict_transfer_data_store.py +++ b/src/orcapod/databases/legacy/dict_transfer_data_store.py @@ -1,6 +1,6 @@ # Implements transfer data store that lets you transfer memoized packets between data stores. 
-from orcapod.stores.legacy.types import DataStore +from orcapod.databases.legacy.types import DataStore from orcapod.types import PacketLike diff --git a/src/orcapod/stores/legacy/legacy_arrow_data_stores.py b/src/orcapod/databases/legacy/legacy_arrow_data_stores.py similarity index 99% rename from src/orcapod/stores/legacy/legacy_arrow_data_stores.py rename to src/orcapod/databases/legacy/legacy_arrow_data_stores.py index 0a9a7e9..acac198 100644 --- a/src/orcapod/stores/legacy/legacy_arrow_data_stores.py +++ b/src/orcapod/databases/legacy/legacy_arrow_data_stores.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime, timedelta import logging -from orcapod.stores.types import DuplicateError +from orcapod.databases.types import DuplicateError from pathlib import Path # Module-level logger diff --git a/src/orcapod/stores/legacy/safe_dir_data_store.py b/src/orcapod/databases/legacy/safe_dir_data_store.py similarity index 100% rename from src/orcapod/stores/legacy/safe_dir_data_store.py rename to src/orcapod/databases/legacy/safe_dir_data_store.py diff --git a/src/orcapod/stores/legacy/types.py b/src/orcapod/databases/legacy/types.py similarity index 100% rename from src/orcapod/stores/legacy/types.py rename to src/orcapod/databases/legacy/types.py diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 7b7f488..783c2ed 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -3,7 +3,7 @@ from orcapod.protocols.pipeline_protocols import Node from orcapod import contexts from orcapod.protocols import data_protocols as dp -from orcapod.protocols import store_protocols as sp +from orcapod.protocols import database_protocols as dbp from typing import Any from collections.abc import Collection import logging @@ -21,8 +21,8 @@ class Pipeline(GraphTracker): def __init__( self, name: str | tuple[str, ...], - pipeline_store: sp.ArrowDataStore, - results_store: sp.ArrowDataStore | None = None, + pipeline_database: dbp.ArrowDatabase, + results_database: dbp.ArrowDatabase | None = None, tracker_manager: dp.TrackerManager | None = None, data_context: str | contexts.DataContext | None = None, auto_compile: bool = True, @@ -33,15 +33,15 @@ def __init__( self.name = name self.pipeline_store_path_prefix = self.name self.results_store_path_prefix = () - if results_store is None: - if pipeline_store is None: + if results_database is None: + if pipeline_database is None: raise ValueError( - "Either pipeline_store or results_store must be provided" + "Either pipeline_database or results_database must be provided" ) - results_store = pipeline_store + results_database = pipeline_database self.results_store_path_prefix = self.name + ("_results",) - self.pipeline_store = pipeline_store - self.results_store = results_store + self.pipeline_database = pipeline_database + self.results_database = results_database self.nodes: dict[str, Node] = {} self.auto_compile = auto_compile self._dirty = False @@ -56,8 +56,8 @@ def __exit__(self, exc_type=None, exc_value=None, traceback=None): self.compile() def flush(self) -> None: - self.pipeline_store.flush() - self.results_store.flush() + self.pipeline_database.flush() + self.results_database.flush() def record_kernel_invocation( self, @@ -119,9 +119,9 @@ def wrap_invocation( node = PodNode( pod=pod, input_streams=new_input_streams, - result_store=self.results_store, + result_database=self.results_database, record_path_prefix=self.results_store_path_prefix, - pipeline_store=self.pipeline_store, + 
pipeline_database=self.pipeline_database, pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) @@ -130,7 +130,7 @@ def wrap_invocation( node = KernelNode( kernel=source, input_streams=new_input_streams, - pipeline_store=self.pipeline_store, + pipeline_database=self.pipeline_database, pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) @@ -138,7 +138,7 @@ def wrap_invocation( node = KernelNode( kernel=invocation.kernel, input_streams=new_input_streams, - pipeline_store=self.pipeline_store, + pipeline_database=self.pipeline_database, pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, ) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 0ab94ba..e2b2796 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,7 +1,7 @@ from orcapod.data.kernels import KernelStream, WrappedKernel from orcapod.data.sources import SourceBase -from orcapod.data.pods import ArrowDataStore, CachedPod -from orcapod.protocols import data_protocols as dp +from orcapod.data.pods import CachedPod +from orcapod.protocols import data_protocols as dp, database_protocols as dbp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING, Any @@ -29,7 +29,7 @@ class NodeBase( def __init__( self, input_streams: Collection[dp.Stream], - pipeline_store: ArrowDataStore, + pipeline_database: dbp.ArrowDatabase, pipeline_path_prefix: tuple[str, ...] = (), **kwargs, ): @@ -46,7 +46,7 @@ def __init__( self.tag_schema_hash = self.data_context.object_hasher.hash_object( tag_types ).to_string() - self.pipeline_store = pipeline_store + self.pipeline_database = pipeline_database @property def contained_kernel(self) -> dp.Kernel: @@ -63,7 +63,7 @@ def pipeline_path(self) -> tuple[str, ...]: # TODO: include output tag hash! return ( self.pipeline_path_prefix - + self.kernel_id + + self.reference + (self.invocation_hash, self.tag_schema_hash) ) @@ -117,14 +117,14 @@ def __init__( self, kernel: dp.Kernel, input_streams: Collection[dp.Stream], - pipeline_store: ArrowDataStore, + pipeline_database: dbp.ArrowDatabase, pipeline_path_prefix: tuple[str, ...] = (), **kwargs, ) -> None: super().__init__( kernel=kernel, input_streams=input_streams, - pipeline_store=pipeline_store, + pipeline_database=pipeline_database, pipeline_path_prefix=pipeline_path_prefix, **kwargs, ) @@ -153,7 +153,7 @@ def record_pipeline_output(self, output_stream: dp.Stream) -> None: include_source=True, include_content_hash=key_column_name, ) - self.pipeline_store.add_records( + self.pipeline_database.add_records( self.pipeline_path, output_table, record_id_column=key_column_name, @@ -163,7 +163,7 @@ def record_pipeline_output(self, output_stream: dp.Stream) -> None: def get_all_records( self, include_system_columns: bool = False ) -> "pa.Table | None": - results = self.pipeline_store.get_all_records(self.pipeline_path) + results = self.pipeline_database.get_all_records(self.pipeline_path) if results is None: return None @@ -185,22 +185,22 @@ def __init__( self, pod: dp.Pod, input_streams: Collection[dp.Stream], - pipeline_store: ArrowDataStore, - result_store: ArrowDataStore | None = None, + pipeline_database: dbp.ArrowDatabase, + result_database: dbp.ArrowDatabase | None = None, record_path_prefix: tuple[str, ...] = (), pipeline_path_prefix: tuple[str, ...] 
= (), **kwargs, ) -> None: super().__init__( pod=pod, - result_store=result_store, + result_database=result_database, record_path_prefix=record_path_prefix, input_streams=input_streams, - pipeline_store=pipeline_store, + pipeline_database=pipeline_database, pipeline_path_prefix=pipeline_path_prefix, **kwargs, ) - self.pipeline_store = pipeline_store + self.pipeline_store = pipeline_database @property def contained_kernel(self) -> dp.Kernel: @@ -350,7 +350,7 @@ def add_pipeline_record( def get_all_records( self, include_system_columns: bool = False ) -> "pa.Table | None": - results = self.result_store.get_all_records( + results = self.result_database.get_all_records( self.record_path, record_id_column=constants.PACKET_RECORD_ID ) diff --git a/src/orcapod/protocols/store_protocols.py b/src/orcapod/protocols/database_protocols.py similarity index 91% rename from src/orcapod/protocols/store_protocols.py rename to src/orcapod/protocols/database_protocols.py index 3933bf7..dd37903 100644 --- a/src/orcapod/protocols/store_protocols.py +++ b/src/orcapod/protocols/database_protocols.py @@ -5,7 +5,7 @@ import pyarrow as pa -class ArrowDataStore(Protocol): +class ArrowDatabase(Protocol): def add_record( self, record_path: tuple[str, ...], @@ -82,7 +82,7 @@ def validate_metadata( ) -> Collection[str]: ... -class ArrowDataStoreWithMetadata(ArrowDataStore, MetadataCapable, Protocol): - """A protocol that combines ArrowDataStore with metadata capabilities.""" +class ArrowDatabaseWithMetadata(ArrowDatabase, MetadataCapable, Protocol): + """A protocol that combines ArrowDatabase with metadata capabilities.""" pass diff --git a/tests/test_store/test_dir_data_store.py b/tests/test_store/test_dir_data_store.py index d7f6a3c..1e91272 100644 --- a/tests/test_store/test_dir_data_store.py +++ b/tests/test_store/test_dir_data_store.py @@ -13,7 +13,7 @@ LegacyPacketHasher, LegacyPathSetHasher, ) -from orcapod.stores.legacy.dict_data_stores import DirDataStore +from orcapod.databases.legacy.dict_data_stores import DirDataStore class MockFileHasher(LegacyFileHasher): diff --git a/tests/test_store/test_integration.py b/tests/test_store/test_integration.py index 0c50292..88c081b 100644 --- a/tests/test_store/test_integration.py +++ b/tests/test_store/test_integration.py @@ -12,7 +12,7 @@ LegacyDefaultCompositeFileHasher, ) from orcapod.hashing.string_cachers import InMemoryCacher -from orcapod.stores.legacy.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.databases.legacy.dict_data_stores import DirDataStore, NoOpDataStore def test_integration_with_cached_file_hasher(temp_dir, sample_files): diff --git a/tests/test_store/test_noop_data_store.py b/tests/test_store/test_noop_data_store.py index 564b449..4091d7f 100644 --- a/tests/test_store/test_noop_data_store.py +++ b/tests/test_store/test_noop_data_store.py @@ -3,7 +3,7 @@ import pytest -from orcapod.stores.legacy.dict_data_stores import NoOpDataStore +from orcapod.databases.legacy.dict_data_stores import NoOpDataStore def test_noop_data_store_memoize(): @@ -43,7 +43,7 @@ def test_noop_data_store_retrieve_memoized(): def test_noop_data_store_is_data_store_subclass(): """Test that NoOpDataStore is a subclass of DataStore.""" - from orcapod.stores import DataStore + from orcapod.databases import DataStore store = NoOpDataStore() assert isinstance(store, DataStore) diff --git a/tests/test_store/test_transfer_data_store.py b/tests/test_store/test_transfer_data_store.py index f4076d6..036825d 100644 --- a/tests/test_store/test_transfer_data_store.py +++ 
b/tests/test_store/test_transfer_data_store.py @@ -6,8 +6,8 @@ import pytest from orcapod.hashing.types import LegacyPacketHasher -from orcapod.stores.legacy.dict_data_stores import DirDataStore, NoOpDataStore -from orcapod.stores.legacy.dict_transfer_data_store import TransferDataStore +from orcapod.databases.legacy.dict_data_stores import DirDataStore, NoOpDataStore +from orcapod.databases.legacy.dict_transfer_data_store import TransferDataStore class MockPacketHasher(LegacyPacketHasher): From b99844d5bd55a542ee592d65c91a6c4d5c2dd07b Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 27 Aug 2025 15:48:40 -0700 Subject: [PATCH 210/224] feat: set default hash length to 20 characters --- src/orcapod/protocols/hashing_protocols.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/orcapod/protocols/hashing_protocols.py b/src/orcapod/protocols/hashing_protocols.py index e9fb268..c47b322 100644 --- a/src/orcapod/protocols/hashing_protocols.py +++ b/src/orcapod/protocols/hashing_protocols.py @@ -16,12 +16,13 @@ class ContentHash: method: str digest: bytes - def to_hex(self, char_count: int | None = None) -> str: + # TODO: make the default char count configurable + def to_hex(self, char_count: int | None = 20) -> str: """Convert digest to hex string, optionally truncated.""" hex_str = self.digest.hex() return hex_str[:char_count] if char_count else hex_str - def to_int(self, hexdigits: int = 16) -> int: + def to_int(self, hexdigits: int = 20) -> int: """ Convert digest to integer representation. From 56439cbab2e971709392e56bd9eccdb8d4147098 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 27 Aug 2025 15:49:48 -0700 Subject: [PATCH 211/224] refactor: rename kernel_id to reference --- src/orcapod/data/kernels.py | 11 +++++++---- src/orcapod/data/sources/arrow_table_source.py | 11 ++++++----- src/orcapod/data/sources/base.py | 14 +++++++++++--- src/orcapod/data/sources/csv_source.py | 1 - src/orcapod/protocols/data_protocols/datagrams.py | 13 ++++++------- src/orcapod/protocols/data_protocols/kernel.py | 12 ++++++------ 6 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index ad0ecc1..0c74cc8 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -41,12 +41,15 @@ def __init__( self._set_modified_time() @property - def kernel_id(self) -> tuple[str, ...]: + def reference(self) -> tuple[str, ...]: """ Returns a unique identifier for the kernel. This is used to identify the kernel in the computational graph. 
""" - return (f"{self.__class__.__name__}", self.content_hash().to_hex()) + return ( + f"{self.__class__.__name__}", + self.content_hash().to_hex(), + ) @property def last_modified(self) -> datetime | None: @@ -202,8 +205,8 @@ def computed_label(self) -> str | None: return self.kernel.label @property - def kernel_id(self) -> tuple[str, ...]: - return self.kernel.kernel_id + def reference(self) -> tuple[str, ...]: + return self.kernel.reference def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False diff --git a/src/orcapod/data/sources/arrow_table_source.py b/src/orcapod/data/sources/arrow_table_source.py index 8e71b0d..8d0cf46 100644 --- a/src/orcapod/data/sources/arrow_table_source.py +++ b/src/orcapod/data/sources/arrow_table_source.py @@ -20,11 +20,12 @@ class ArrowTableSource(SourceBase): """Construct source from a collection of dictionaries""" + SOURCE_ID = "arrow" + def __init__( self, table: "pa.Table", tag_columns: Collection[str] = (), - source_info: str | None = None, **kwargs, ): super().__init__(**kwargs) @@ -34,10 +35,6 @@ def __init__( table = arrow_data_utils.drop_system_columns(table) self.table_hash = self.data_context.arrow_hasher.hash_table(table) - if source_info is None: - source_info = f"arrow_table:{self.table_hash.to_hex(char_count=16)}" - self.source_info = source_info - self.tag_columns = [col for col in tag_columns if col in table.column_names] # add system tag column, indexing into the array @@ -59,6 +56,10 @@ def __init__( upstreams=(), ) + @property + def reference(self) -> tuple[str, ...]: + return (self.SOURCE_ID, self.table_hash.to_hex()) + @property def table(self) -> "pa.Table": return self._table diff --git a/src/orcapod/data/sources/base.py b/src/orcapod/data/sources/base.py index 3190514..c5551d9 100644 --- a/src/orcapod/data/sources/base.py +++ b/src/orcapod/data/sources/base.py @@ -49,9 +49,15 @@ def kernel_identity_structure( # otherwise, return the identity structure of the stream return self.source_identity_structure() + # @property + # @abstractmethod + # def reference(self) -> tuple[str, ...]: + # """Return the unique identifier for the kernel.""" + # ... 
+ def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False - ) -> tuple[dict[str, type], dict[str, type]]: + ) -> tuple[PythonSchema, PythonSchema]: return self.source_output_types(include_system_tags=include_system_tags) @abstractmethod @@ -116,9 +122,11 @@ def upstreams(self) -> tuple[dp.Stream, ...]: """Sources have no upstream dependencies.""" return () - def keys(self) -> tuple[tuple[str, ...], tuple[str, ...]]: + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: """Delegate to the cached KernelStream.""" - return self().keys() + return self().keys(include_system_tags=include_system_tags) def types( self, include_system_tags: bool = False diff --git a/src/orcapod/data/sources/csv_source.py b/src/orcapod/data/sources/csv_source.py index 2540645..d3b4709 100644 --- a/src/orcapod/data/sources/csv_source.py +++ b/src/orcapod/data/sources/csv_source.py @@ -1,4 +1,3 @@ -from collections.abc import Collection from typing import TYPE_CHECKING, Any diff --git a/src/orcapod/protocols/data_protocols/datagrams.py b/src/orcapod/protocols/data_protocols/datagrams.py index 41cd379..a0f24d8 100644 --- a/src/orcapod/protocols/data_protocols/datagrams.py +++ b/src/orcapod/protocols/data_protocols/datagrams.py @@ -19,15 +19,14 @@ class Datagram(ContentIdentifiable, Protocol): Each datagram contains: - **Data columns**: The primary business data (user_id, name, etc.) - - **Meta columns**: Internal system metadata with {orcapod.META_PREFIX} (typically '__') prefixes (e.g. __processed_at, etc.) - - **Context column**: Data context information ({orcapod.CONTEXT_KEY}) + - **Meta columns**: Internal system metadata with {constants.META_PREFIX} (typically '__') prefixes (e.g. __processed_at, etc.) + - **Context column**: Data context information ({constants.CONTEXT_KEY}) Derivative of datagram (such as Packet or Tag) will also include some specific columns pertinent to the function of the specialized datagram: - - **Source info columns**: Data provenance with {orcapod.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) used in Packet - - **System tags**: Internal tags for system use, typically prefixed with {orcapod.SYSTEM_TAG_PREFIX} ('_system_') (_system_created_at, etc.) used in Tag + - **Source info columns**: Data provenance with {constants.SOURCE_PREFIX} ('_source_') prefixes (_source_user_id, etc.) used in Packet + - **System tags**: Internal tags for system use, typically prefixed with {constants.SYSTEM_TAG_PREFIX} ('_system_') (_system_created_at, etc.) used in Tag - All operations are by design immutable - methods return new datagram instances rather than - modifying existing ones. + All operations are by design immutable - methods return new datagram instances rather than modifying existing ones. Example: >>> datagram = DictDatagram({"user_id": 123, "name": "Alice"}) @@ -55,7 +54,7 @@ def data_context_key(self) -> str: @property def meta_columns(self) -> tuple[str, ...]: - """Return tuple of meta column names (with {orcapod.META_PREFIX} ('__') prefix).""" + """Return tuple of meta column names (with {constants.META_PREFIX} ('__') prefix).""" ... # 2. 
Dict-like Interface (Data Access) diff --git a/src/orcapod/protocols/data_protocols/kernel.py b/src/orcapod/protocols/data_protocols/kernel.py index ee6b029..a4f5ea2 100644 --- a/src/orcapod/protocols/data_protocols/kernel.py +++ b/src/orcapod/protocols/data_protocols/kernel.py @@ -33,16 +33,16 @@ class Kernel(ContentIdentifiable, Labelable, Protocol): """ @property - def kernel_id(self) -> tuple[str, ...]: + def reference(self) -> tuple[str, ...]: """ - Return a unique identifier for this Pod. + Reference to the kernel + + The reference is used for caching/storage and tracking purposes. + As the name indicates, this is how data originating from the kernel will be referred to. - The pod_id is used for caching and tracking purposes. It should - uniquely identify the Pod's computational logic, parameters, and - any relevant metadata that affects its behavior. Returns: - tuple[str, ...]: Unique identifier for this Pod + tuple[str, ...]: Reference for this kernel """ ... From 47bcbc104a0428feb1a6e5ba60cf464a5f6ec104 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Wed, 27 Aug 2025 16:01:41 -0700 Subject: [PATCH 212/224] type: use mapping for input parameters --- src/orcapod/data/pods.py | 6 +++--- src/orcapod/data/sources/base.py | 19 ------------------- src/orcapod/hashing/arrow_hashers.py | 4 +++- src/orcapod/utils/types_utils.py | 6 +++--- 4 files changed, 9 insertions(+), 26 deletions(-) diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index bc7eb49..1045878 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -17,7 +17,7 @@ from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.database_protocols import ArrowDatabase -from orcapod.types import DataValue, PythonSchema +from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import types_utils from orcapod.utils.lazy_module import LazyModule from orcapod.hashing.hash_utils import get_function_signature, get_function_components @@ -274,8 +274,8 @@ def __init__( output_keys: str | Collection[str] | None = None, function_name=None, version: str = "v0.0", - input_python_schema: Mapping[str, type] | None = None, - output_python_schema: Mapping[str, type] | Sequence[type] | None = None, + input_python_schema: PythonSchemaLike | None = None, + output_python_schema: PythonSchemaLike | Sequence[type] | None = None, label: str | None = None, function_info_extractor: hp.FunctionInfoExtractor | None = None, **kwargs, diff --git a/src/orcapod/data/sources/base.py b/src/orcapod/data/sources/base.py index c5551d9..ba3afcf 100644 --- a/src/orcapod/data/sources/base.py +++ b/src/orcapod/data/sources/base.py @@ -214,25 +214,6 @@ def invalidate(self) -> None: # ==================== Source Protocol ==================== - @property - def tag_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the tag in the pipeline run records. - This is used to store the run-associated tag info. - """ - tag_keys, _ = self.keys() - return tag_keys - - @property - def packet_keys(self) -> tuple[str, ...]: - """ - Return the keys used for the packet in the pipeline run records. - This is used to store the run-associated packet info. - """ - # TODO: consider caching this - _, packet_keys = self.keys() - return packet_keys - def reset_cache(self) -> None: """ Clear the cached KernelStream, forcing a fresh one on next access. 
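For illustration only (not part of the patch): patches 211 and 212 above rename kernel_id to reference and tighten the schema annotations. A minimal sketch of what the renamed property returns, mirroring the kernels.py hunk; the class and hash value here are stand-ins, not orcapod code:

    class MyKernel:
        def content_hash_hex(self) -> str:
            return "0f3a9c1d2e4b5a6c7d8e"   # placeholder for the real content hash

        @property
        def reference(self) -> tuple[str, ...]:
            # Mirrors the kernels.py change: class name plus content-hash hex.
            return (self.__class__.__name__, self.content_hash_hex())

    MyKernel().reference   # ('MyKernel', '0f3a9c1d2e4b5a6c7d8e')
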
diff --git a/src/orcapod/hashing/arrow_hashers.py b/src/orcapod/hashing/arrow_hashers.py index 0ef8ab8..ebe797e 100644 --- a/src/orcapod/hashing/arrow_hashers.py +++ b/src/orcapod/hashing/arrow_hashers.py @@ -54,10 +54,10 @@ class SemanticArrowHasher: def __init__( self, + semantic_registry: SemanticTypeRegistry, hasher_id: str | None = None, hash_algorithm: str = "sha256", chunk_size: int = 8192, - semantic_registry: SemanticTypeRegistry | None = None, handle_missing: str = "error", serialization_method: str = "logical", # TODO: consider passing options for serialization method @@ -126,6 +126,8 @@ def _process_table_columns(self, table: pa.Table) -> pa.Table: processed_data.append(processed_value) # Create new Arrow column from processed data + assert new_type is not None, "Failed to infer new column type" + # TODO: revisit this logic new_column = pa.array(processed_data, type=new_type) new_field = pa.field(field.name, new_type) diff --git a/src/orcapod/utils/types_utils.py b/src/orcapod/utils/types_utils.py index eff0fb7..372d15e 100644 --- a/src/orcapod/utils/types_utils.py +++ b/src/orcapod/utils/types_utils.py @@ -2,7 +2,7 @@ from collections.abc import Callable, Collection, Sequence, Mapping from typing import get_origin, get_args, Any -from orcapod.types import PythonSchema +from orcapod.types import PythonSchema, PythonSchemaLike import inspect import logging @@ -55,8 +55,8 @@ def check_typespec_compatibility( def extract_function_typespecs( func: Callable, output_keys: Collection[str], - input_typespec: PythonSchema | None = None, - output_typespec: PythonSchema | Sequence[type] | None = None, + input_typespec: PythonSchemaLike | None = None, + output_typespec: PythonSchemaLike | Sequence[type] | None = None, ) -> tuple[PythonSchema, PythonSchema]: """ Extract input and output data types from a function signature. From b54aab444f866cffa38ba3392e26a94d3b27f681 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Fri, 29 Aug 2025 21:51:18 -0700 Subject: [PATCH 213/224] feat: fix bugs and add proper save path handling --- .../01_introduction_to_orcapod.ipynb | 126 ++- pyproject.toml | 5 + src/orcapod/__init__.py | 2 + src/orcapod/data/arrow_data_utils.py | 14 +- src/orcapod/data/base.py | 214 +---- src/orcapod/data/kernels.py | 38 +- src/orcapod/data/operators/batch.py | 2 +- src/orcapod/data/operators/mappers.py | 48 +- src/orcapod/data/pods.py | 30 +- src/orcapod/data/sources/__init__.py | 11 +- .../data/sources/arrow_table_source.py | 55 +- src/orcapod/data/sources/base.py | 19 +- .../data/sources/delta_table_source.py | 198 ++++ src/orcapod/data/sources/dict_source.py | 17 +- src/orcapod/data/sources/source_registry.py | 232 +++++ src/orcapod/data/streams.py | 5 +- src/orcapod/databases/delta_lake_databases.py | 11 +- src/orcapod/databases/file_utils.py | 868 +++++++++--------- src/orcapod/hashing/arrow_utils.py | 7 +- src/orcapod/hashing/object_hashers.py | 13 +- src/orcapod/hashing/semantic_type_hashers.py | 10 +- src/orcapod/hashing/string_cachers.py | 21 +- src/orcapod/pipeline/nodes.py | 67 +- uv.lock | 312 +++++++ 24 files changed, 1526 insertions(+), 799 deletions(-) create mode 100644 src/orcapod/data/sources/delta_table_source.py create mode 100644 src/orcapod/data/sources/source_registry.py diff --git a/notebooks/tutorials/01_introduction_to_orcapod.ipynb b/notebooks/tutorials/01_introduction_to_orcapod.ipynb index b0ffb65..ef11dc8 100644 --- a/notebooks/tutorials/01_introduction_to_orcapod.ipynb +++ b/notebooks/tutorials/01_introduction_to_orcapod.ipynb @@ -302,7 +302,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 5)
[flattened text/html rendering of the 3x5 table (columns a, b, c, d, _content_hash) elided for both the old and new cell output; the only change — the hash prefix separator going from '@' to ':' — is visible in the text/plain diff that follows]
" ], "text/plain": [ "shape: (3, 5)\n", @@ -311,9 +311,9 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", "╞═════╪═════╪═══════╪═════╪═════════════════════════════════╡\n", - "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@dbd5a1efe0a1a306cc2… │\n", - "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@083f8c4d8a4c7608af3… │\n", - "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@d4a11ad88c1d27eba1c… │\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1:dbd5a1efe0a1a306cc2… │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1:083f8c4d8a4c7608af3… │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1:d4a11ad88c1d27eba1c… │\n", "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" ] }, @@ -350,7 +350,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 5)
[flattened text/html rendering of the 3x5 table (columns a, b, c, d, my_hash_values) elided for both the old and new cell output; the same '@' to ':' hash-prefix change is visible in the text/plain diff that follows]
" ], "text/plain": [ "shape: (3, 5)\n", @@ -359,9 +359,9 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", "╞═════╪═════╪═══════╪═════╪═════════════════════════════════╡\n", - "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1@dbd5a1efe0a1a306cc2… │\n", - "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1@083f8c4d8a4c7608af3… │\n", - "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1@d4a11ad88c1d27eba1c… │\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ arrow_v0.1:dbd5a1efe0a1a306cc2… │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ arrow_v0.1:083f8c4d8a4c7608af3… │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ arrow_v0.1:d4a11ad88c1d27eba1c… │\n", "└─────┴─────┴───────┴─────┴─────────────────────────────────┘" ] }, @@ -398,19 +398,19 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 5)
a ┆ b ┆ c ┆ d ┆ _context_key
i64 ┆ str ┆ bool ┆ f64 ┆ str
1 ┆ "x" ┆ true ┆ 1.1 ┆ null
2 ┆ "y" ┆ false ┆ 2.2 ┆ null
3 ┆ "z" ┆ true ┆ 3.3 ┆ null
" + "shape: (3, 5)
a ┆ b ┆ c ┆ d ┆ _context_key
i64 ┆ str ┆ bool ┆ f64 ┆ str
1 ┆ "x" ┆ true ┆ 1.1 ┆ "std:v0.1:default"
2 ┆ "y" ┆ false ┆ 2.2 ┆ "std:v0.1:default"
3 ┆ "z" ┆ true ┆ 3.3 ┆ "std:v0.1:default"
" ], "text/plain": [ "shape: (3, 5)\n", - "┌─────┬─────┬───────┬─────┬──────────────┐\n", - "│ a ┆ b ┆ c ┆ d ┆ _context_key │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", - "╞═════╪═════╪═══════╪═════╪══════════════╡\n", - "│ 1 ┆ x ┆ true ┆ 1.1 ┆ null │\n", - "│ 2 ┆ y ┆ false ┆ 2.2 ┆ null │\n", - "│ 3 ┆ z ┆ true ┆ 3.3 ┆ null │\n", - "└─────┴─────┴───────┴─────┴──────────────┘" + "┌─────┬─────┬───────┬─────┬──────────────────┐\n", + "│ a ┆ b ┆ c ┆ d ┆ _context_key │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ bool ┆ f64 ┆ str │\n", + "╞═════╪═════╪═══════╪═════╪══════════════════╡\n", + "│ 1 ┆ x ┆ true ┆ 1.1 ┆ std:v0.1:default │\n", + "│ 2 ┆ y ┆ false ┆ 2.2 ┆ std:v0.1:default │\n", + "│ 3 ┆ z ┆ true ┆ 3.3 ┆ std:v0.1:default │\n", + "└─────┴─────┴───────┴─────┴──────────────────┘" ] }, "execution_count": 12, @@ -450,7 +450,7 @@ "b: [[\"x\",\"y\",\"z\"]]\n", "c: [[true,false,true]]\n", "d: [[1.1,2.2,3.3]]\n", - "_context_key: [[null,null,null]]" + "_context_key: [[\"std:v0.1:default\",\"std:v0.1:default\",\"std:v0.1:default\"]]" ] }, "execution_count": 13, @@ -930,7 +930,7 @@ { "data": { "text/plain": [ - "'arrow_v0.1@6e1143896d73d370757811b52ceeea8d1d456cd30206416fbf81754e1cea5568'" + "ContentHash(method='arrow_v0.1', digest=b'n\\x11C\\x89ms\\xd3pux\\x11\\xb5,\\xee\\xea\\x8d\\x1dEl\\xd3\\x02\\x06Ao\\xbf\\x81uN\\x1c\\xeaUh')" ] }, "execution_count": 31, @@ -1683,14 +1683,14 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "id": "cb4bc91a", "metadata": {}, "outputs": [], "source": [ - "data_store = op.stores.BatchedDeltaTableArrowStore(base_path=\"./pipeline_data\")\n", + "database = op.databases.DeltaTableDatabase(base_path=\"./pipeline_data\")\n", "\n", - "pipeline = op.Pipeline(name=\"my_pipeline\", pipeline_store=data_store)" + "pipeline = op.Pipeline(name=\"my_pipeline\", pipeline_database=database)" ] }, { @@ -1703,7 +1703,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 55, "id": "f371822b", "metadata": {}, "outputs": [], @@ -1728,7 +1728,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 56, "id": "e132fc93", "metadata": {}, "outputs": [], @@ -1750,7 +1750,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 57, "id": "cca9e0d0", "metadata": {}, "outputs": [ @@ -1760,7 +1760,7 @@ "PodNode(pod=FunctionPod:add_numbers)" ] }, - "execution_count": 56, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -1771,21 +1771,21 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 58, "id": "08add7d9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'TableStream': KernelNode(kernel=),\n", - " 'add_numbers': PodNode(pod=FunctionPod:add_numbers),\n", + "{'TableStream': KernelNode(kernel=StreamSource),\n", " 'multiply_numbers': PodNode(pod=FunctionPod:multiply_numbers),\n", + " 'add_numbers': PodNode(pod=FunctionPod:add_numbers),\n", " 'Join': KernelNode(kernel=Join()),\n", " 'combine_results': PodNode(pod=FunctionPod:combine_results)}" ] }, - "execution_count": 57, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } @@ -1812,10 +1812,42 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "id": "21086f72", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
id ┆ sum
i64 ┆ i64
0 ┆ 11
1 ┆ 22
2 ┆ 33
3 ┆ 44
4 ┆ 55
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────┬─────┐\n", + "│ id ┆ sum │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ i64 │\n", + "╞═════╪═════╡\n", + "│ 0 ┆ 11 │\n", + "│ 1 ┆ 22 │\n", + "│ 2 ┆ 33 │\n", + "│ 3 ┆ 44 │\n", + "│ 4 ┆ 55 │\n", + "└─────┴─────┘" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.add_numbers.as_df()" ] @@ -1830,7 +1862,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "id": "1e741659", "metadata": {}, "outputs": [], @@ -1856,7 +1888,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "id": "c77154ec", "metadata": {}, "outputs": [ @@ -1887,7 +1919,7 @@ "└─────┴─────┘" ] }, - "execution_count": 60, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -1924,19 +1956,19 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "id": "37e65e33", "metadata": {}, "outputs": [], "source": [ - "data_store = op.stores.BatchedDeltaTableArrowStore(base_path=\"./pipeline_data\")\n", + "database = op.databases.DeltaTableDatabase(base_path=\"./pipeline_data\")\n", "\n", - "pipeline2 = op.Pipeline(name=\"my_pipeline\", pipeline_store=data_store)" + "pipeline2 = op.Pipeline(name=\"my_pipeline\", pipeline_database=database)" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "id": "3bad8332", "metadata": {}, "outputs": [], @@ -1952,7 +1984,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "id": "8f146ae7", "metadata": {}, "outputs": [ @@ -1983,7 +2015,7 @@ "└─────┴─────┘" ] }, - "execution_count": 63, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -1994,7 +2026,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "id": "8fd7bf4e", "metadata": {}, "outputs": [ @@ -2025,7 +2057,7 @@ "└─────┴─────────┘" ] }, - "execution_count": 64, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -2036,7 +2068,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "id": "2a918db1", "metadata": {}, "outputs": [ @@ -2067,7 +2099,7 @@ "└─────┴───────────────────────┘" ] }, - "execution_count": 65, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -2101,7 +2133,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.3" + "version": "3.12.10" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index af8ed52..14996e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,15 +58,20 @@ dev = [ "jsonschema>=4.25.0", "minio>=7.2.16", "mkdocs>=1.6.1", + "mkdocs-material>=9.6.18", + "mkdocstrings[python]>=0.30.0", + "pdoc>=15.0.4", "pyarrow-stubs>=20.0.0.20250716", "pygraphviz>=1.14", "pyiceberg>=0.9.1", + "pyright>=1.1.404", "pytest>=8.3.5", "pytest-cov>=6.1.1", "ray[default]==2.48.0", "redis>=6.2.0", "ruff>=0.11.11", "s3fs>=2025.7.0", + "sphinx>=8.2.3", "tqdm>=4.67.1", ] diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 2a83782..3064dd4 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -2,6 +2,7 @@ from .data.pods import function_pod, FunctionPod, CachedPod from .data import streams from .data import operators +from .data import sources from . 
import databases from .pipeline import Pipeline @@ -16,6 +17,7 @@ "CachedPod", "streams", "databases", + "sources", "operators", "Pipeline", ] diff --git a/src/orcapod/data/arrow_data_utils.py b/src/orcapod/data/arrow_data_utils.py index 1c47599..d9e8bf8 100644 --- a/src/orcapod/data/arrow_data_utils.py +++ b/src/orcapod/data/arrow_data_utils.py @@ -36,7 +36,7 @@ def drop_system_columns( def add_source_info( table: "pa.Table", - source_info: str | None, + source_info: str | Collection[str] | None, exclude_prefixes: Collection[str] = ( constants.META_PREFIX, constants.DATAGRAM_PREFIX, @@ -45,7 +45,14 @@ def add_source_info( ) -> "pa.Table": """Add source information to an Arrow table.""" # Create a new column with the source information - source_column = pa.array([source_info] * table.num_rows) + if source_info is None or isinstance(source_info, str): + source_column = [source_info] * table.num_rows + elif isinstance(source_info, Collection): + if len(source_info) != table.num_rows: + raise ValueError( + "Length of source_info collection must match number of rows in the table." + ) + source_column = source_info # identify columns for which source columns should be created @@ -53,7 +60,8 @@ def add_source_info( if col.startswith(tuple(exclude_prefixes)) or col in exclude_columns: continue source_column = pa.array( - [f"{source_info}:{col}"] * table.num_rows, type=pa.large_string() + [f"{source_val}::{col}" for source_val in source_column], + type=pa.large_string(), ) table = table.append_column(f"{constants.SOURCE_PREFIX}{col}", source_column) diff --git a/src/orcapod/data/base.py b/src/orcapod/data/base.py index e1fe272..a71933f 100644 --- a/src/orcapod/data/base.py +++ b/src/orcapod/data/base.py @@ -116,9 +116,9 @@ def content_hash(self) -> hp.ContentHash: """ if self._cached_content_hash is None: structure = self.identity_structure() - processed_structure = process_structure(structure) - self._cached_content_hash = self._data_context.object_hasher.hash_object( - processed_structure + # processed_structure = process_structure(structure) + self._cached_content_hash = self.data_context.object_hasher.hash_object( + structure ) return self._cached_content_hash @@ -131,16 +131,7 @@ def __hash__(self) -> int: int: A hash value based on either content or identity """ # Get the identity structure - if self._cached_int_hash is None: - structure = self.identity_structure() - if structure is None: - # If no identity structure is provided, use the default hash - self._cached_int_hash = super().__hash__() - else: - self._cached_int_hash = self._data_context.object_hasher.hash_object( - structure - ).to_int() - return self._cached_int_hash + return self.content_hash().to_int() def __eq__(self, other: object) -> bool: """ @@ -160,200 +151,3 @@ def __eq__(self, other: object) -> bool: class LabeledContentIdentifiableBase(ContentIdentifiableBase, LablableBase): pass - - -def process_structure( - obj: Any, - visited: set[int] | None = None, - force_hash: bool = True, - function_info_extractor: hp.FunctionInfoExtractor | None = None, -) -> Any: - """ - Recursively process a structure to prepare it for hashing. 
- - Args: - obj: The object or structure to process - visited: Set of object ids already visited (to handle circular references) - function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation - - Returns: - A processed version of the structure suitable for stable hashing - """ - # Initialize the visited set if this is the top-level call - if visited is None: - visited = set() - else: - visited = visited.copy() # Copy to avoid modifying the original set - - # Check for circular references - use object's memory address - # NOTE: While id() is not stable across sessions, we only use it within a session - # to detect circular references, not as part of the final hash - obj_id = id(obj) - if obj_id in visited: - logger.debug( - f"Detected circular reference for object of type {type(obj).__name__}" - ) - return "CircularRef" # Don't include the actual id in hash output - - # For objects that could contain circular references, add to visited - if isinstance(obj, (dict, list, tuple, set)) or not isinstance( - obj, (str, int, float, bool, type(None)) - ): - visited.add(obj_id) - - # Handle None - if obj is None: - return None - - # TODO: currently using runtime_checkable on ContentIdentifiable protocol - # Re-evaluate this strategy to see if a faster / more robust check could be used - if isinstance(obj, hp.ContentIdentifiable): - logger.debug( - f"Processing ContentHashableBase instance of type {type(obj).__name__}" - ) - return obj.content_hash() - - # Handle basic types - if isinstance(obj, (str, int, float, bool)): - return obj - - # Handle bytes and bytearray - if isinstance(obj, (bytes, bytearray)): - logger.debug( - f"Converting bytes/bytearray of length {len(obj)} to hex representation" - ) - return obj.hex() - - # Handle Path objects - if isinstance(obj, Path): - logger.debug(f"Converting Path object to string: {obj}") - raise NotImplementedError( - "Path objects are not supported in this hasher. Please convert to string." - ) - return str(obj) - - # Handle UUID objects - if isinstance(obj, UUID): - logger.debug(f"Converting UUID to string: {obj}") - raise NotImplementedError( - "UUID objects are not supported in this hasher. Please convert to string." 
- ) - return str(obj) - - # Handle named tuples (which are subclasses of tuple) - if hasattr(obj, "_fields") and isinstance(obj, tuple): - logger.debug(f"Processing named tuple of type {type(obj).__name__}") - # For namedtuples, convert to dict and then process - d = {field: getattr(obj, field) for field in obj._fields} # type: ignore - return process_structure(d, visited) - - # Handle mappings (dict-like objects) - if isinstance(obj, Mapping): - # Process both keys and values - processed_items = [ - ( - process_structure(k, visited), - process_structure(v, visited), - ) - for k, v in obj.items() - ] - - # Sort by the processed keys for deterministic order - processed_items.sort(key=lambda x: str(x[0])) - - # Create a new dictionary with string keys based on processed keys - # TODO: consider checking for possibly problematic values in processed_k - # and issue a warning - return { - str(processed_k): processed_v - for processed_k, processed_v in processed_items - } - - # Handle sets and frozensets - if isinstance(obj, (set, frozenset)): - logger.debug( - f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" - ) - # Process each item first, then sort the processed results - processed_items = [process_structure(item, visited) for item in obj] - return sorted(processed_items, key=str) - - # Handle collections (list-like objects) - if isinstance(obj, Collection): - logger.debug( - f"Processing collection of type {type(obj).__name__} with {len(obj)} items" - ) - return [process_structure(item, visited) for item in obj] - - # For functions, use the function_content_hash - if callable(obj) and hasattr(obj, "__code__"): - logger.debug(f"Processing function: {getattr(obj, '__name__')}") - if function_info_extractor is not None: - # Use the extractor to get a stable representation - function_info = function_info_extractor.extract_function_info(obj) - logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") - - # simply return the function info as a stable representation - return function_info - else: - raise ValueError( - f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" - ) - - # handle data types - if isinstance(obj, type): - logger.debug(f"Processing class/type: {obj.__name__}") - return f"type:{obj.__name__}" - - # For other objects, attempt to create deterministic representation only if force_hash=True - class_name = obj.__class__.__name__ - module_name = obj.__class__.__module__ - if force_hash: - try: - import re - - logger.debug( - f"Processing generic object of type {module_name}.{class_name}" - ) - - # Try to get a stable dict representation if possible - if hasattr(obj, "__dict__"): - # Sort attributes to ensure stable order - attrs = sorted( - (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") - ) - # Limit to first 10 attributes to avoid extremely long representations - if len(attrs) > 10: - logger.debug( - f"Object has {len(attrs)} attributes, limiting to first 10" - ) - attrs = attrs[:10] - attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] - obj_repr = f"{{{', '.join(attr_strs)}}}" - else: - # Get basic repr but remove memory addresses - logger.debug( - "Object has no __dict__, using repr() with memory address removal" - ) - obj_repr = repr(obj) - if len(obj_repr) > 1000: - logger.debug( - f"Object repr is {len(obj_repr)} chars, truncating to 1000" - ) - obj_repr = obj_repr[:1000] + "..." 
- # Remove memory addresses which look like '0x7f9a1c2b3d4e' - obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) - - return f"{module_name}.{class_name}:{obj_repr}" - except Exception as e: - # Last resort - use class name only - logger.warning(f"Failed to process object representation: {e}") - try: - return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" - except AttributeError: - logger.error("Could not determine object class, using UnknownObject") - return "UnknownObject" - else: - raise ValueError( - f"Processing of {obj} of type {module_name}.{class_name} is not supported" - ) diff --git a/src/orcapod/data/kernels.py b/src/orcapod/data/kernels.py index 0c74cc8..2dde9c7 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/data/kernels.py @@ -59,6 +59,7 @@ def last_modified(self) -> datetime | None: """ return self._last_modified + # TODO: reconsider making this a public method def _set_modified_time( self, timestamp: datetime | None = None, invalidate: bool = False ) -> None: @@ -97,23 +98,30 @@ def output_types( @abstractmethod def kernel_identity_structure( self, streams: Collection[dp.Stream] | None = None - ) -> Any: ... + ) -> Any: + """ + Identity structure for this kernel. Input stream(s), if present, have already been preprocessed + and validated. + """ + ... def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: - # Default implementation of identity_structure for the kernel only - # concerns the kernel class and the streams if present. Subclasses of - # Kernels should override this method to provide a more meaningful - # representation of the kernel. Note that kernel must provide the notion - # of identity under possibly two distinct contexts: - # 1) identity of the kernel in itself when invoked without any stream - # 2) identity of the specific invocation of the kernel with a collection of streams - # While the latter technically corresponds to the identity of the invocation and not - # the kernel, only kernel can provide meaningful information as to the uniqueness of - # the invocation as only kernel would know if / how the input stream(s) alter the identity - # of the invocation. For example, if the kernel corresponds to an commutative computation - # and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the - # equivalence of the two by returning the same identity structure for both invocations. - # This can be achieved, for example, by returning a set over the streams instead of a tuple. + """ + Default implementation of identity_structure for the kernel only + concerns the kernel class and the streams if present. Subclasses of + Kernels should override this method to provide a more meaningful + representation of the kernel. Note that kernel must provide the notion + of identity under possibly two distinct contexts: + 1) identity of the kernel in itself when invoked without any stream + 2) identity of the specific invocation of the kernel with a collection of streams + While the latter technically corresponds to the identity of the invocation and not + the kernel, only kernel can provide meaningful information as to the uniqueness of + the invocation as only kernel would know if / how the input stream(s) alter the identity + of the invocation. 
For example, if the kernel corresponds to an commutative computation + and therefore kernel K(x, y) == K(y, x), then the identity structure must reflect the + equivalence of the two by returning the same identity structure for both invocations. + This can be achieved, for example, by returning a set over the streams instead of a tuple. + """ if streams is not None: streams = self.pre_kernel_processing(*streams) self.validate_inputs(*streams) diff --git a/src/orcapod/data/operators/batch.py b/src/orcapod/data/operators/batch.py index 3d8e82e..1281ea4 100644 --- a/src/orcapod/data/operators/batch.py +++ b/src/orcapod/data/operators/batch.py @@ -67,7 +67,7 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: next_batch = {} i = 0 - for i, entry in enumerate(data_list): + for entry in data_list: i += 1 for c in entry: next_batch.setdefault(c, []).append(entry[c]) diff --git a/src/orcapod/data/operators/mappers.py b/src/orcapod/data/operators/mappers.py index a32b4ca..c3042c3 100644 --- a/src/orcapod/data/operators/mappers.py +++ b/src/orcapod/data/operators/mappers.py @@ -22,7 +22,7 @@ class MapPackets(UnaryOperator): """ def __init__( - self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs + self, name_map: Mapping[str, str], drop_unmapped: bool = False, **kwargs ): self.name_map = dict(name_map) self.drop_unmapped = drop_unmapped @@ -30,14 +30,26 @@ def __init__( def op_forward(self, stream: dp.Stream) -> dp.Stream: tag_columns, packet_columns = stream.keys() + unmapped_columns = set(packet_columns) - set(self.name_map.keys()) if not any(n in packet_columns for n in self.name_map): # nothing to rename in the packet, return stream as is return stream - table = stream.as_table(include_source=True) + table = stream.as_table( + include_source=True, include_system_tags=True, sort_by_tags=False + ) - name_map = {tc: tc for tc in tag_columns} # no renaming on tag columns + name_map = { + c: c + for c in table.column_names + if c not in packet_columns and not c.startswith("") + } + name_map = { + tc: tc + for tc in table.column_names + if tc not in packet_columns and not tc.startswith(constants.SOURCE_PREFIX) + } # no renaming on tag columns for c in packet_columns: if c in self.name_map: name_map[c] = self.name_map[c] @@ -48,6 +60,10 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: name_map[c] = c renamed_table = table.rename_columns(name_map) + + if self.drop_unmapped and unmapped_columns: + renamed_table = renamed_table.drop_columns(list(unmapped_columns)) + return TableStream( renamed_table, tag_columns=tag_columns, source=self, upstreams=(stream,) ) @@ -79,8 +95,12 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: message += f"overlapping tag columns: {overlapping_tag_columns}." 
raise InputValidationError(message) - def op_output_types(self, stream: dp.Stream) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, packet_typespec = stream.types() + def op_output_types( + self, stream: dp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, packet_typespec = stream.types( + include_system_tags=include_system_tags + ) # Create new packet typespec with renamed keys new_packet_typespec = { @@ -105,7 +125,7 @@ class MapTags(UnaryOperator): """ def __init__( - self, name_map: Mapping[str, str], drop_unmapped: bool = True, **kwargs + self, name_map: Mapping[str, str], drop_unmapped: bool = False, **kwargs ): self.name_map = dict(name_map) self.drop_unmapped = drop_unmapped @@ -113,12 +133,13 @@ def __init__( def op_forward(self, stream: dp.Stream) -> dp.Stream: tag_columns, packet_columns = stream.keys() + missing_tags = set(tag_columns) - set(self.name_map.keys()) if not any(n in tag_columns for n in self.name_map): # nothing to rename in the tags, return stream as is return stream - table = stream.as_table(include_source=True) + table = stream.as_table(include_source=True, include_system_tags=True) name_map = { tc: self.name_map.get(tc, tc) for tc in tag_columns @@ -128,6 +149,11 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: name_map[c] = c # no renaming on packet columns renamed_table = table.rename_columns(name_map) + + if missing_tags and self.drop_unmapped: + # drop any tags that are not in the name map + renamed_table = renamed_table.drop_columns(list(missing_tags)) + return TableStream( renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) ) @@ -157,8 +183,12 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: message += f"overlapping packet columns: {overlapping_packet_columns}." 
raise InputValidationError(message) - def op_output_types(self, stream: dp.Stream) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, packet_typespec = stream.types() + def op_output_types( + self, stream: dp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, packet_typespec = stream.types( + include_system_tags=include_system_tags + ) # Create new packet typespec with renamed keys new_tag_typespec = {self.name_map.get(k, k): v for k, v in tag_typespec.items()} diff --git a/src/orcapod/data/pods.py b/src/orcapod/data/pods.py index 1045878..d29bc64 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/data/pods.py @@ -1,7 +1,8 @@ +import hashlib import logging import sys from abc import abstractmethod -from collections.abc import Callable, Collection, Iterable, Mapping, Sequence +from collections.abc import Callable, Collection, Iterable, Sequence from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Literal @@ -12,20 +13,23 @@ ) from orcapod.data.kernels import KernelStream, TrackedKernelBase from orcapod.data.operators import Join -from orcapod.data.streams import LazyPodResultStream, EfficientPodResultStream +from orcapod.data.streams import EfficientPodResultStream, LazyPodResultStream from orcapod.data.system_constants import constants +from orcapod.hashing.hash_utils import get_function_components, get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.database_protocols import ArrowDatabase from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import types_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.hashing.hash_utils import get_function_signature, get_function_components -import hashlib +# TODO: extract default char count as config def combine_hashes( - *hashes: str, order: bool = False, prefix_hasher_id: bool = False + *hashes: str, + order: bool = False, + prefix_hasher_id: bool = False, + hex_char_count: int | None = 20, ) -> str: """Combine hashes into a single hash string.""" @@ -36,6 +40,8 @@ def combine_hashes( prepared_hashes = list(hashes) combined = "".join(prepared_hashes) combined_hash = hashlib.sha256(combined.encode()).hexdigest() + if hex_char_count is not None: + combined_hash = combined_hash[:hex_char_count] if prefix_hasher_id: return "sha256@" + combined_hash return combined_hash @@ -416,11 +422,16 @@ def call( output_data = self.process_function_output(values) + # TODO: extract out this function + def combine(*components: tuple[str, ...]) -> str: + inner_parsed = [":".join(component) for component in components] + return "::".join(inner_parsed) + if record_id is None: # if record_id is not provided, generate it from the packet record_id = self.get_record_id(packet, execution_engine_hash) source_info = { - k: ":".join(self.reference + (record_id, k)) for k in output_data + k: combine(self.reference, (record_id,), (k,)) for k in output_data } output_packet = DictPacket( @@ -466,11 +477,16 @@ async def async_call( output_data = self.process_function_output(values) + # TODO: extract out this function + def combine(*components: tuple[str, ...]) -> str: + inner_parsed = [":".join(component) for component in components] + return "::".join(inner_parsed) + if record_id is None: # if record_id is not provided, generate it from the packet record_id = self.get_record_id(packet, execution_engine_hash) source_info = { - k: ":".join(self.reference + 
(record_id, k)) for k in output_data + k: combine(self.reference, (record_id,), (k,)) for k in output_data } output_packet = DictPacket( diff --git a/src/orcapod/data/sources/__init__.py b/src/orcapod/data/sources/__init__.py index 6d6a954..65ead00 100644 --- a/src/orcapod/data/sources/__init__.py +++ b/src/orcapod/data/sources/__init__.py @@ -1,5 +1,14 @@ from .base import SourceBase from .arrow_table_source import ArrowTableSource +from .delta_table_source import DeltaTableSource from .dict_source import DictSource +from .source_registry import SourceRegistry, GLOBAL_SOURCE_REGISTRY -__all__ = ["SourceBase", "ArrowTableSource", "DictSource"] +__all__ = [ + "SourceBase", + "ArrowTableSource", + "DeltaTableSource", + "DictSource", + "SourceRegistry", + "GLOBAL_SOURCE_REGISTRY", +] diff --git a/src/orcapod/data/sources/arrow_table_source.py b/src/orcapod/data/sources/arrow_table_source.py index 8d0cf46..801adf3 100644 --- a/src/orcapod/data/sources/arrow_table_source.py +++ b/src/orcapod/data/sources/arrow_table_source.py @@ -1,4 +1,5 @@ from collections.abc import Collection +from re import S from typing import TYPE_CHECKING, Any @@ -8,14 +9,15 @@ from orcapod.utils.lazy_module import LazyModule from orcapod.data.system_constants import constants from orcapod.data import arrow_data_utils +from orcapod.data.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry + +from orcapod.data.sources.base import SourceBase if TYPE_CHECKING: import pyarrow as pa else: pa = LazyModule("pyarrow") -from orcapod.data.sources.base import SourceBase - class ArrowTableSource(SourceBase): """Construct source from a collection of dictionaries""" @@ -24,31 +26,49 @@ class ArrowTableSource(SourceBase): def __init__( self, - table: "pa.Table", + arrow_table: "pa.Table", tag_columns: Collection[str] = (), + source_name: str | None = None, + source_registry: SourceRegistry | None = None, + auto_register: bool = True, + preserve_system_columns: bool = False, **kwargs, ): super().__init__(**kwargs) # clean the table, dropping any system columns # TODO: consider special treatment of system columns if provided - table = arrow_data_utils.drop_system_columns(table) - self.table_hash = self.data_context.arrow_hasher.hash_table(table) + if not preserve_system_columns: + arrow_table = arrow_data_utils.drop_system_columns(arrow_table) - self.tag_columns = [col for col in tag_columns if col in table.column_names] + self.tag_columns = [ + col for col in tag_columns if col in arrow_table.column_names + ] - # add system tag column, indexing into the array - system_tag_column = pa.array(list(range(table.num_rows)), pa.int64()) + self.table_hash = self.data_context.arrow_hasher.hash_table(arrow_table) - table = table.add_column( - 0, f"{constants.SYSTEM_TAG_PREFIX}{self.source_info}", system_tag_column - ) + if source_name is None: + source_name = self.content_hash().to_hex() + + self._source_name = source_name + + row_index = list(range(arrow_table.num_rows)) + + source_info = [f"{self.source_id}::row_{i}" for i in row_index] # add source info - self._table = arrow_data_utils.add_source_info( - table, self.source_info, exclude_columns=tag_columns + arrow_table = arrow_data_utils.add_source_info( + arrow_table, source_info, exclude_columns=tag_columns + ) + + arrow_table = arrow_table.add_column( + 0, + f"{constants.SYSTEM_TAG_PREFIX}{self.source_id}::row_index", + pa.array(row_index, pa.int64()), ) + self._table = arrow_table + self._table_stream = TableStream( table=self._table, tag_columns=self.tag_columns, @@ 
-56,16 +76,21 @@ def __init__( upstreams=(), ) + # Auto-register with global registry + if auto_register: + registry = source_registry or GLOBAL_SOURCE_REGISTRY + registry.register(self.source_id, self) + @property def reference(self) -> tuple[str, ...]: - return (self.SOURCE_ID, self.table_hash.to_hex()) + return ("arrow_table", self._source_name) @property def table(self) -> "pa.Table": return self._table def source_identity_structure(self) -> Any: - return (self.__class__.__name__, self.source_info, self.table_hash) + return (self.__class__.__name__, self.tag_columns, self.table_hash) def get_all_records( self, include_system_columns: bool = False diff --git a/src/orcapod/data/sources/base.py b/src/orcapod/data/sources/base.py index ba3afcf..5b6ddc2 100644 --- a/src/orcapod/data/sources/base.py +++ b/src/orcapod/data/sources/base.py @@ -49,11 +49,16 @@ def kernel_identity_structure( # otherwise, return the identity structure of the stream return self.source_identity_structure() - # @property - # @abstractmethod - # def reference(self) -> tuple[str, ...]: - # """Return the unique identifier for the kernel.""" - # ... + @property + def source_id(self) -> str: + return ":".join(self.reference) + + # Redefine the reference to ensure subclass would provide a concrete implementation + @property + @abstractmethod + def reference(self) -> tuple[str, ...]: + """Return the unique identifier for the kernel.""" + ... def kernel_output_types( self, *streams: dp.Stream, include_system_tags: bool = False @@ -246,8 +251,8 @@ def source_output_types( return self.stream.types(include_system_tags=include_system_tags) @property - def kernel_id(self) -> tuple[str, ...]: - return (self.stream.__class__.__name__,) + def reference(self) -> tuple[str, ...]: + return ("stream", self.stream.content_hash().to_string()) def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: """ diff --git a/src/orcapod/data/sources/delta_table_source.py b/src/orcapod/data/sources/delta_table_source.py new file mode 100644 index 0000000..eb4cd52 --- /dev/null +++ b/src/orcapod/data/sources/delta_table_source.py @@ -0,0 +1,198 @@ +from collections.abc import Collection +from typing import TYPE_CHECKING, Any + + +from orcapod.data.streams import TableStream +from orcapod.protocols import data_protocols as dp +from orcapod.types import PathLike, PythonSchema +from orcapod.utils.lazy_module import LazyModule +from pathlib import Path + + +from orcapod.data.sources.base import SourceBase +from orcapod.data.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry +from deltalake import DeltaTable +from deltalake.exceptions import TableNotFoundError + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +class DeltaTableSource(SourceBase): + """Source that generates streams from a Delta table.""" + + def __init__( + self, + delta_table_path: PathLike, + tag_columns: Collection[str] = (), + source_name: str | None = None, + source_registry: SourceRegistry | None = None, + auto_register: bool = True, + **kwargs, + ): + """ + Initialize DeltaTableSource with a Delta table. 
+ + Args: + delta_table_path: Path to the Delta table + source_name: Name for this source (auto-generated if None) + tag_columns: Column names to use as tags vs packet data + source_registry: Registry to register with (uses global if None) + auto_register: Whether to auto-register this source + """ + super().__init__(**kwargs) + + # Normalize path + self._delta_table_path = Path(delta_table_path).resolve() + + # Try to open the Delta table + try: + self._delta_table = DeltaTable(str(self._delta_table_path)) + except TableNotFoundError: + raise ValueError(f"Delta table not found at {self._delta_table_path}") + + # Generate source name if not provided + if source_name is None: + source_name = self._delta_table_path.name + + self._source_name = source_name + self._tag_columns = tuple(tag_columns) + + # Auto-register with global registry + if auto_register: + registry = source_registry or GLOBAL_SOURCE_REGISTRY + registry.register(self.source_id, self) + + @property + def reference(self) -> tuple[str, ...]: + """Reference tuple for this source.""" + return ("delta_table", self._source_name) + + def source_identity_structure(self) -> Any: + """ + Identity structure for this source - includes path and modification info. + This changes when the underlying Delta table changes. + """ + # Get Delta table version for change detection + table_version = self._delta_table.version() + + return { + "class": self.__class__.__name__, + "path": str(self._delta_table_path), + "version": table_version, + "tag_columns": self._tag_columns, + } + + def validate_inputs(self, *streams: dp.Stream) -> None: + """Delta table sources don't take input streams.""" + if len(streams) > 0: + raise ValueError( + f"DeltaTableSource doesn't accept input streams, got {len(streams)}" + ) + + def source_output_types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + """Return tag and packet types based on Delta table schema.""" + # Create a sample stream to get types + return self.forward().types(include_system_tags=include_system_tags) + + def forward(self, *streams: dp.Stream) -> dp.Stream: + """ + Generate stream from Delta table data. 
+ + Returns: + TableStream containing all data from the Delta table + """ + + # Refresh table to get latest data + self._refresh_table() + + # Load table data + table_data = self._delta_table.to_pyarrow_dataset( + as_large_types=True + ).to_table() + + return TableStream( + table=table_data, + tag_columns=self._tag_columns, + source=self, + ) + + def _refresh_table(self) -> None: + """Refresh the Delta table to get latest version.""" + try: + # Create fresh Delta table instance to get latest data + self._delta_table = DeltaTable(str(self._delta_table_path)) + except Exception as e: + # If refresh fails, log but continue with existing table + import logging + + logger = logging.getLogger(__name__) + logger.warning( + f"Failed to refresh Delta table {self._delta_table_path}: {e}" + ) + + def get_table_info(self) -> dict[str, Any]: + """Get metadata about the Delta table.""" + self._refresh_table() + + schema = self._delta_table.schema() + history = self._delta_table.history() + + return { + "path": str(self._delta_table_path), + "version": self._delta_table.version(), + "schema": schema, + "num_files": len(self._delta_table.files()), + "tag_columns": self._tag_columns, + "latest_commit": history[0] if history else None, + } + + def resolve_field(self, collection_id: str, record_id: str, field_name: str) -> Any: + """ + Resolve a specific field value from source field reference. + + For Delta table sources: + - collection_id: Not used (single table) + - record_id: Row identifier (implementation dependent) + - field_name: Column name + """ + # This is a basic implementation - you might want to add more sophisticated + # record identification based on your needs + + # For now, assume record_id is a row index + try: + row_index = int(record_id) + table_data = self._delta_table.to_pyarrow_dataset( + as_large_types=True + ).to_table() + + if row_index >= table_data.num_rows: + raise ValueError( + f"Record ID {record_id} out of range (table has {table_data.num_rows} rows)" + ) + + if field_name not in table_data.column_names: + raise ValueError( + f"Field '{field_name}' not found in table columns: {table_data.column_names}" + ) + + return table_data[field_name][row_index].as_py() + + except ValueError as e: + if "invalid literal for int()" in str(e): + raise ValueError( + f"Record ID must be numeric for DeltaTableSource, got: {record_id}" + ) + raise + + def __repr__(self) -> str: + return ( + f"DeltaTableSource(path={self._delta_table_path}, name={self._source_name})" + ) + + def __str__(self) -> str: + return f"DeltaTableSource:{self._source_name}" diff --git a/src/orcapod/data/sources/dict_source.py b/src/orcapod/data/sources/dict_source.py index bd5d114..6cc3ae4 100644 --- a/src/orcapod/data/sources/dict_source.py +++ b/src/orcapod/data/sources/dict_source.py @@ -1,4 +1,4 @@ -from collections.abc import Collection +from collections.abc import Collection, Mapping from typing import TYPE_CHECKING, Any @@ -62,20 +62,29 @@ class DictSource(SourceBase): def __init__( self, - data: Collection[dict[str, DataValue]], + data: Collection[Mapping[str, DataValue]], tag_columns: Collection[str] = (), system_tag_columns: Collection[str] = (), + source_name: str | None = None, data_schema: PythonSchemaLike | None = None, **kwargs, ): super().__init__(**kwargs) arrow_table = self.data_context.type_converter.python_dicts_to_arrow_table( - list(data), python_schema=data_schema + [dict(e) for e in data], python_schema=data_schema ) self._table_source = ArrowTableSource( - arrow_table, tag_columns=tag_columns, 
system_tag_columns=system_tag_columns + arrow_table, + tag_columns=tag_columns, + source_name=source_name, + system_tag_columns=system_tag_columns, ) + @property + def reference(self) -> tuple[str, ...]: + # TODO: provide more thorough implementation + return ("dict",) + self._table_source.reference[1:] + def source_identity_structure(self) -> Any: return self._table_source.source_identity_structure() diff --git a/src/orcapod/data/sources/source_registry.py b/src/orcapod/data/sources/source_registry.py new file mode 100644 index 0000000..ea1fe94 --- /dev/null +++ b/src/orcapod/data/sources/source_registry.py @@ -0,0 +1,232 @@ +import logging +from collections.abc import Iterator +from orcapod.protocols.data_protocols import Source + + +logger = logging.getLogger(__name__) + + +class SourceCollisionError(Exception): + """Raised when attempting to register a source ID that already exists.""" + + pass + + +class SourceNotFoundError(Exception): + """Raised when attempting to access a source that doesn't exist.""" + + pass + + +class SourceRegistry: + """ + Registry for managing data sources. + + Provides collision detection, source lookup, and management of source lifecycles. + """ + + def __init__(self): + self._sources: dict[str, Source] = {} + + def register(self, source_id: str, source: Source) -> None: + """ + Register a source with the given ID. + + Args: + source_id: Unique identifier for the source + source: Source instance to register + + Raises: + SourceCollisionError: If source_id already exists + ValueError: If source_id or source is invalid + """ + if not source_id: + raise ValueError("Source ID cannot be empty") + + if not isinstance(source_id, str): + raise ValueError(f"Source ID must be a string, got {type(source_id)}") + + if source is None: + raise ValueError("Source cannot be None") + + if source_id in self._sources: + existing_source = self._sources[source_id] + if existing_source == source: + # Idempotent - same source already registered + logger.debug( + f"Source ID '{source_id}' already registered with the same source instance." + ) + return + raise SourceCollisionError( + f"Source ID '{source_id}' already registered with {type(existing_source).__name__}. " + f"Cannot register {type(source).__name__}. " + f"Choose a different source_id or unregister the existing source first." + ) + + self._sources[source_id] = source + logger.info(f"Registered source: '{source_id}' -> {type(source).__name__}") + + def get(self, source_id: str) -> Source: + """ + Get a source by ID. + + Args: + source_id: Source identifier + + Returns: + Source instance + + Raises: + SourceNotFoundError: If source doesn't exist + """ + if source_id not in self._sources: + available_ids = list(self._sources.keys()) + raise SourceNotFoundError( + f"Source '{source_id}' not found. Available sources: {available_ids}" + ) + + return self._sources[source_id] + + def get_optional(self, source_id: str) -> Source | None: + """ + Get a source by ID, returning None if not found. + + Args: + source_id: Source identifier + + Returns: + Source instance or None if not found + """ + return self._sources.get(source_id) + + def unregister(self, source_id: str) -> Source: + """ + Unregister a source by ID. 
+ + Args: + source_id: Source identifier + + Returns: + The unregistered source instance + + Raises: + SourceNotFoundError: If source doesn't exist + """ + if source_id not in self._sources: + raise SourceNotFoundError(f"Source '{source_id}' not found") + + source = self._sources.pop(source_id) + logger.info(f"Unregistered source: '{source_id}'") + return source + + # TODO: consider just using __contains__ + def contains(self, source_id: str) -> bool: + """Check if a source ID is registered.""" + return source_id in self._sources + + def list_sources(self) -> list[str]: + """Get list of all registered source IDs.""" + return list(self._sources.keys()) + + # TODO: consider removing this + def list_sources_by_type(self, source_type: type) -> list[str]: + """ + Get list of source IDs filtered by source type. + + Args: + source_type: Class type to filter by + + Returns: + List of source IDs that match the type + """ + return [ + source_id + for source_id, source in self._sources.items() + if isinstance(source, source_type) + ] + + def clear(self) -> None: + """Remove all registered sources.""" + count = len(self._sources) + self._sources.clear() + logger.info(f"Cleared {count} sources from registry") + + def replace(self, source_id: str, source: Source) -> Source | None: + """ + Replace an existing source or register a new one. + + Args: + source_id: Source identifier + source: New source instance + + Returns: + Previous source if it existed, None otherwise + """ + old_source = self._sources.get(source_id) + self._sources[source_id] = source + + if old_source: + logger.info(f"Replaced source: '{source_id}' -> {type(source).__name__}") + else: + logger.info( + f"Registered new source: '{source_id}' -> {type(source).__name__}" + ) + + return old_source + + def get_source_info(self, source_id: str) -> dict: + """ + Get information about a registered source. + + Args: + source_id: Source identifier + + Returns: + Dictionary with source information + + Raises: + SourceNotFoundError: If source doesn't exist + """ + source = self.get(source_id) # This handles the not found case + + info = { + "source_id": source_id, + "type": type(source).__name__, + "reference": source.reference if hasattr(source, "reference") else None, + } + info["identity"] = source.identity_structure() + + return info + + def __len__(self) -> int: + """Return number of registered sources.""" + return len(self._sources) + + def __contains__(self, source_id: str) -> bool: + """Support 'in' operator for checking source existence.""" + return source_id in self._sources + + def __iter__(self) -> Iterator[str]: + """Iterate over source IDs.""" + return iter(self._sources) + + def items(self) -> Iterator[tuple[str, Source]]: + """Iterate over (source_id, source) pairs.""" + yield from self._sources.items() + + def __repr__(self) -> str: + return f"SourceRegistry({len(self._sources)} sources)" + + def __str__(self) -> str: + if not self._sources: + return "SourceRegistry(empty)" + + source_summary = [] + for source_id, source in self._sources.items(): + source_summary.append(f" {source_id}: {type(source).__name__}") + + return "SourceRegistry:\n" + "\n".join(source_summary) + + +# Global source registry instance +GLOBAL_SOURCE_REGISTRY = SourceRegistry() diff --git a/src/orcapod/data/streams.py b/src/orcapod/data/streams.py index 43365c7..b2a2909 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/data/streams.py @@ -574,12 +574,13 @@ def __init__( raise ValueError( "Table must contain at least one column to be used as a stream." 
) + table = data_table if data_context_table is None: data_context_table = pa.table( { constants.CONTEXT_KEY: pa.array( - [contexts.get_default_context_key()] * len(data_table), + [contexts.get_default_context_key()] * len(table), pa.large_string(), ) } @@ -1326,6 +1327,8 @@ def iter_packets( # identify all entries in the input stream for which we still have not computed packets target_entries = self.input_stream.as_table( + include_system_tags=True, + include_source=True, include_content_hash=constants.INPUT_PACKET_HASH, execution_engine=execution_engine, ) diff --git a/src/orcapod/databases/delta_lake_databases.py b/src/orcapod/databases/delta_lake_databases.py index 4457f0f..96181ee 100644 --- a/src/orcapod/databases/delta_lake_databases.py +++ b/src/orcapod/databases/delta_lake_databases.py @@ -1,18 +1,19 @@ -from pathlib import Path -from typing import Any, Literal, TYPE_CHECKING, cast import logging -from deltalake import DeltaTable, write_deltalake -from deltalake.exceptions import TableNotFoundError from collections import defaultdict from collections.abc import Collection, Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, cast + +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError from orcapod.data import constants from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: + import polars as pl import pyarrow as pa import pyarrow.compute as pc - import polars as pl else: pa = LazyModule("pyarrow") pl = LazyModule("polars") diff --git a/src/orcapod/databases/file_utils.py b/src/orcapod/databases/file_utils.py index 712aada..8fb6191 100644 --- a/src/orcapod/databases/file_utils.py +++ b/src/orcapod/databases/file_utils.py @@ -1,434 +1,434 @@ -# file_ops.py - Atomic file operations module - -import builtins -import contextlib -import inspect -import logging -import os -from pathlib import Path - -from orcapod.types import PathLike, PathSet, PacketLike -from collections.abc import Collection, Callable - - -logger = logging.getLogger(__name__) - - -def atomic_write(file_path: PathLike, content: str) -> Path: - """ - Atomically write content to a file. - - This function writes content to a temporary file and then atomically - renames it to the target file path, ensuring that other processes never - see a partially-written file. - - Args: - file_path: Target file path - content: Content to write - - Returns: - Path object to the written file - - Raises: - OSError: If the file cannot be written - """ - file_path = Path(file_path) - temp_path = file_path.with_name(f"{file_path.name}.tmp{os.getpid()}") - - # Ensure parent directory exists - file_path.parent.mkdir(parents=True, exist_ok=True) - - try: - # Write content to a temporary file - with open(temp_path, "w") as f: - f.write(content) - f.flush() - os.fsync(f.fileno()) # Force flush to disk - - # Atomic rename - os.rename(temp_path, file_path) - return file_path - except Exception as e: - logger.error(f"Error writing file {file_path}: {str(e)}") - raise - finally: - # Clean up the temporary file if it exists - if temp_path.exists(): - temp_path.unlink(missing_ok=True) - - -def atomic_write_bytes(file_path: PathLike, content: bytes) -> Path: - """ - Atomically write binary content to a file. - - This function writes binary content to a temporary file and then atomically - renames it to the target file path. 
- - Args: - file_path: Target file path - content: Binary content to write - - Returns: - Path object to the written file - - Raises: - OSError: If the file cannot be written - """ - file_path = Path(file_path) - temp_path = file_path.with_name(f"{file_path.name}.tmp{os.getpid()}") - - # Ensure parent directory exists - file_path.parent.mkdir(parents=True, exist_ok=True) - - try: - # Write content to a temporary file - with open(temp_path, "wb") as f: - f.write(content) - f.flush() - os.fsync(f.fileno()) # Force flush to disk - - # Atomic rename - os.rename(temp_path, file_path) - return file_path - except Exception as e: - logger.error(f"Error writing file {file_path}: {str(e)}") - raise - finally: - # Clean up the temporary file if it exists - if temp_path.exists(): - temp_path.unlink(missing_ok=True) - - -def atomic_copy(source_path: PathLike, dest_path: PathLike) -> Path: - """ - Atomically copy a file. - - This function copies a file to a temporary location and then atomically - renames it to the target path, ensuring that other processes never - see a partially-copied file. - - Args: - source_path: Source file path - dest_path: Destination file path - - Returns: - Path object to the copied file - - Raises: - OSError: If the file cannot be copied - FileNotFoundError: If the source file does not exist - """ - import shutil - - source_path = Path(source_path) - dest_path = Path(dest_path) - temp_path = dest_path.with_name(f"{dest_path.name}.tmp{os.getpid()}") - - # Check if source exists - if not source_path.exists(): - raise FileNotFoundError(f"Source file does not exist: {source_path}") - - # Ensure parent directory exists - dest_path.parent.mkdir(parents=True, exist_ok=True) - - try: - # Copy to temporary file - shutil.copy2(source_path, temp_path) - - # Ensure the data is written to disk - with open(temp_path, "a") as f: - os.fsync(f.fileno()) - - # Atomic rename - os.rename(temp_path, dest_path) - return dest_path - except Exception as e: - logger.error(f"Error copying file from {source_path} to {dest_path}: {str(e)}") - raise - finally: - # Clean up the temporary file if it exists - if temp_path.exists(): - temp_path.unlink(missing_ok=True) - - -def atomic_append(file_path: PathLike, content: str) -> Path: - """ - Atomically append content to a file. - - This function reads the existing content, appends the new content, - and then atomically writes the result back to the file. - - Args: - file_path: Target file path - content: Content to append - - Returns: - Path object to the appended file - - Raises: - OSError: If the file cannot be written - """ - file_path = Path(file_path) - - # Read existing content if file exists - existing_content = "" - if file_path.exists(): - try: - with open(file_path, "r") as f: - existing_content = f.read() - except Exception as e: - logger.error(f"Error reading file {file_path} for append: {str(e)}") - raise - - # Write the combined content atomically - return atomic_write(file_path, existing_content + content) - - -def atomic_replace( - file_path: PathLike, pattern: str, replacement: str, count: int = -1 -) -> tuple[Path, int]: - """ - Atomically replace text in a file. - - This function reads the existing content, performs the replacement, - and then atomically writes the result back to the file. 
- - Args: - file_path: Target file path - pattern: Pattern to replace - replacement: Replacement text - count: Maximum number of replacements (default: unlimited) - - Returns: - Tuple of (Path object to the file, number of replacements made) - - Raises: - OSError: If the file cannot be read or written - FileNotFoundError: If the file does not exist - """ - file_path = Path(file_path) - - # Check if file exists - if not file_path.exists(): - raise FileNotFoundError(f"File does not exist: {file_path}") - - # Read existing content - try: - with open(file_path, "r") as f: - existing_content = f.read() - except Exception as e: - logger.error(f"Error reading file {file_path} for replacement: {str(e)}") - raise - - # Perform replacement - new_content, num_replacements = existing_content, 0 - if count == -1: - # Replace all occurrences - new_content = existing_content.replace(pattern, replacement) - num_replacements = existing_content.count(pattern) - else: - # Replace only up to count occurrences - new_content = "" - remaining = existing_content - for _ in range(count): - if pattern not in remaining: - break - pos = remaining.find(pattern) - new_content += remaining[:pos] + replacement - remaining = remaining[pos + len(pattern) :] - num_replacements += 1 - new_content += remaining - - # Write the new content atomically - return atomic_write(file_path, new_content), num_replacements - - -def is_file_locked(file_path: PathLike) -> bool: - """ - Check if a file is locked. - - This function attempts to open the file for writing in non-blocking mode - and checks if it fails with a "resource temporarily unavailable" error. - - Args: - file_path: File path to check - - Returns: - True if the file is locked, False otherwise - """ - import errno - import fcntl - - file_path = Path(file_path) - - # If file doesn't exist, it's not locked - if not file_path.exists(): - return False - - try: - # Try to open the file and get an exclusive lock in non-blocking mode - with open(file_path, "r+") as f: - fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) - # If we get here, the file is not locked - fcntl.flock(f, fcntl.LOCK_UN) - return False - except IOError as e: - if e.errno == errno.EAGAIN: - # Resource temporarily unavailable = the file is locked - return True - # Some other error - assume not locked - return False - except Exception: - # Any other exception - assume not locked - return False - - -@contextlib.contextmanager -def redirect_open( - mapping: dict[str, str] | Callable[[str], str | None], -): - """ - Context manager to intercept file opening operations. - - Args: - mapping: Either a dictionary mapping original paths to their replacements, - or a function that takes a path string and returns a replacement path - (or None to indicate the file should not be opened). - - Raises: - FileNotFoundError: If using a dictionary and the path is not found in it. - """ - # Track all places that might store an open() function - places_to_patch = [] - - # 1. Standard builtins.open - original_builtin_open = builtins.open - places_to_patch.append((builtins, "open", original_builtin_open)) - - # 2. __builtins__ (could be different in some contexts, especially IPython) - if isinstance(__builtins__, dict) and "open" in __builtins__: - places_to_patch.append((__builtins__, "open", __builtins__["open"])) - - # 3. 
Current module's globals (for the calling namespace) - current_frame = inspect.currentframe() - if current_frame is not None: - caller_globals = current_frame.f_back.f_globals if current_frame.f_back else {} - if "open" in caller_globals: - places_to_patch.append((caller_globals, "open", caller_globals["open"])) - - # 4. Check for IPython user namespace - try: - import IPython - - ip = IPython.get_ipython() # type: ignore - if ip and "open" in ip.user_ns: - places_to_patch.append((ip.user_ns, "open", ip.user_ns["open"])) - except (ImportError, AttributeError): - pass - - def patched_open(file, *args, **kwargs): - # Convert PathLike objects to string if needed - if hasattr(file, "__fspath__"): - file_path = os.fspath(file) - else: - file_path = str(file) - - if isinstance(mapping, dict): - if file_path in mapping: - redirected_path = mapping[file_path] - print(f"Redirecting '{file_path}' to '{redirected_path}'") - return original_builtin_open(redirected_path, *args, **kwargs) - else: - raise FileNotFoundError( - f"Path '{file_path}' not found in redirection mapping" - ) - else: # mapping is a function - redirected_path = mapping(file_path) - if redirected_path is not None: - print(f"Redirecting '{file_path}' to '{redirected_path}'") - return original_builtin_open(redirected_path, *args, **kwargs) - else: - raise FileNotFoundError(f"Path '{file_path}' could not be redirected") - - # Apply the patch to all places - for obj, attr, _ in places_to_patch: - if isinstance(obj, dict): - obj[attr] = patched_open - else: - setattr(obj, attr, patched_open) - - try: - yield - finally: - # Restore all original functions - for obj, attr, original in places_to_patch: - if isinstance(obj, dict): - obj[attr] = original - else: - setattr(obj, attr, original) - - -def virtual_mount( - packet: PacketLike, -) -> tuple[PacketLike, dict[str, str], dict[str, str]]: - """ - Visit all pathset within the packet, and convert them to alternative path - representation. By default, full path is mapped to the file name. If two or - more paths have the same file name, the second one is suffixed with "_1", the - third one with "_2", etc. This is useful for creating a virtual mount point - for a set of files, where the original paths are not important, but the file - names can be used to identify the files. - """ - forward_lut = {} # mapping from original path to new path - reverse_lut = {} # mapping from new path to original path - new_packet = {} - - for key, value in packet.items(): - new_packet[key] = convert_pathset(value, forward_lut, reverse_lut) # type: ignore - - return new_packet, forward_lut, reverse_lut - - -# TODO: re-assess the structure of PathSet and consider making it recursive -def convert_pathset(pathset: PathSet, forward_lut, reverse_lut) -> PathSet: - """ - Convert a pathset to a new pathset. forward_lut and reverse_lut are updated - with the new paths. The new paths are created by replacing the original paths - with the new paths in the forward_lut. The reverse_lut is updated with the - original paths. If name already exists, a suffix is added to the new name to avoid - collisions. 
- """ - if isinstance(pathset, (str, bytes)): - new_name = Path(pathset).name - if new_name in reverse_lut: - # if the name already exists, add a suffix - i = 1 - while f"{new_name}_{i}" in reverse_lut: - i += 1 - new_name = f"{new_name}_{i}" - forward_lut[pathset] = new_name - reverse_lut[new_name] = pathset - return new_name - elif isinstance(pathset, Collection): - return [convert_pathset(p, forward_lut, reverse_lut) for p in pathset] # type: ignore - else: - raise ValueError( - f"Unsupported pathset type: {type(pathset)}. Expected str, bytes, or Collection." - ) - - -class WrappedPath: - def __init__(self, path, name=None): - self.path = Path(path) - if name is None: - name = self.path.name - self.name = name - - def __fspath__(self) -> str | bytes: - return self.path.__fspath__() - - def __str__(self) -> str: - return self.name - - def __repr__(self) -> str: - return f"WrappedPath({self.path}): {self.name}" +# # file_ops.py - Atomic file operations module + +# import builtins +# import contextlib +# import inspect +# import logging +# import os +# from pathlib import Path + +# from orcapod.types import PathLike, PathSet, PacketLike +# from collections.abc import Collection, Callable + + +# logger = logging.getLogger(__name__) + + +# def atomic_write(file_path: PathLike, content: str) -> Path: +# """ +# Atomically write content to a file. + +# This function writes content to a temporary file and then atomically +# renames it to the target file path, ensuring that other processes never +# see a partially-written file. + +# Args: +# file_path: Target file path +# content: Content to write + +# Returns: +# Path object to the written file + +# Raises: +# OSError: If the file cannot be written +# """ +# file_path = Path(file_path) +# temp_path = file_path.with_name(f"{file_path.name}.tmp{os.getpid()}") + +# # Ensure parent directory exists +# file_path.parent.mkdir(parents=True, exist_ok=True) + +# try: +# # Write content to a temporary file +# with open(temp_path, "w") as f: +# f.write(content) +# f.flush() +# os.fsync(f.fileno()) # Force flush to disk + +# # Atomic rename +# os.rename(temp_path, file_path) +# return file_path +# except Exception as e: +# logger.error(f"Error writing file {file_path}: {str(e)}") +# raise +# finally: +# # Clean up the temporary file if it exists +# if temp_path.exists(): +# temp_path.unlink(missing_ok=True) + + +# def atomic_write_bytes(file_path: PathLike, content: bytes) -> Path: +# """ +# Atomically write binary content to a file. + +# This function writes binary content to a temporary file and then atomically +# renames it to the target file path. 
+ +# Args: +# file_path: Target file path +# content: Binary content to write + +# Returns: +# Path object to the written file + +# Raises: +# OSError: If the file cannot be written +# """ +# file_path = Path(file_path) +# temp_path = file_path.with_name(f"{file_path.name}.tmp{os.getpid()}") + +# # Ensure parent directory exists +# file_path.parent.mkdir(parents=True, exist_ok=True) + +# try: +# # Write content to a temporary file +# with open(temp_path, "wb") as f: +# f.write(content) +# f.flush() +# os.fsync(f.fileno()) # Force flush to disk + +# # Atomic rename +# os.rename(temp_path, file_path) +# return file_path +# except Exception as e: +# logger.error(f"Error writing file {file_path}: {str(e)}") +# raise +# finally: +# # Clean up the temporary file if it exists +# if temp_path.exists(): +# temp_path.unlink(missing_ok=True) + + +# def atomic_copy(source_path: PathLike, dest_path: PathLike) -> Path: +# """ +# Atomically copy a file. + +# This function copies a file to a temporary location and then atomically +# renames it to the target path, ensuring that other processes never +# see a partially-copied file. + +# Args: +# source_path: Source file path +# dest_path: Destination file path + +# Returns: +# Path object to the copied file + +# Raises: +# OSError: If the file cannot be copied +# FileNotFoundError: If the source file does not exist +# """ +# import shutil + +# source_path = Path(source_path) +# dest_path = Path(dest_path) +# temp_path = dest_path.with_name(f"{dest_path.name}.tmp{os.getpid()}") + +# # Check if source exists +# if not source_path.exists(): +# raise FileNotFoundError(f"Source file does not exist: {source_path}") + +# # Ensure parent directory exists +# dest_path.parent.mkdir(parents=True, exist_ok=True) + +# try: +# # Copy to temporary file +# shutil.copy2(source_path, temp_path) + +# # Ensure the data is written to disk +# with open(temp_path, "a") as f: +# os.fsync(f.fileno()) + +# # Atomic rename +# os.rename(temp_path, dest_path) +# return dest_path +# except Exception as e: +# logger.error(f"Error copying file from {source_path} to {dest_path}: {str(e)}") +# raise +# finally: +# # Clean up the temporary file if it exists +# if temp_path.exists(): +# temp_path.unlink(missing_ok=True) + + +# def atomic_append(file_path: PathLike, content: str) -> Path: +# """ +# Atomically append content to a file. + +# This function reads the existing content, appends the new content, +# and then atomically writes the result back to the file. + +# Args: +# file_path: Target file path +# content: Content to append + +# Returns: +# Path object to the appended file + +# Raises: +# OSError: If the file cannot be written +# """ +# file_path = Path(file_path) + +# # Read existing content if file exists +# existing_content = "" +# if file_path.exists(): +# try: +# with open(file_path, "r") as f: +# existing_content = f.read() +# except Exception as e: +# logger.error(f"Error reading file {file_path} for append: {str(e)}") +# raise + +# # Write the combined content atomically +# return atomic_write(file_path, existing_content + content) + + +# def atomic_replace( +# file_path: PathLike, pattern: str, replacement: str, count: int = -1 +# ) -> tuple[Path, int]: +# """ +# Atomically replace text in a file. + +# This function reads the existing content, performs the replacement, +# and then atomically writes the result back to the file. 
+ +# Args: +# file_path: Target file path +# pattern: Pattern to replace +# replacement: Replacement text +# count: Maximum number of replacements (default: unlimited) + +# Returns: +# Tuple of (Path object to the file, number of replacements made) + +# Raises: +# OSError: If the file cannot be read or written +# FileNotFoundError: If the file does not exist +# """ +# file_path = Path(file_path) + +# # Check if file exists +# if not file_path.exists(): +# raise FileNotFoundError(f"File does not exist: {file_path}") + +# # Read existing content +# try: +# with open(file_path, "r") as f: +# existing_content = f.read() +# except Exception as e: +# logger.error(f"Error reading file {file_path} for replacement: {str(e)}") +# raise + +# # Perform replacement +# new_content, num_replacements = existing_content, 0 +# if count == -1: +# # Replace all occurrences +# new_content = existing_content.replace(pattern, replacement) +# num_replacements = existing_content.count(pattern) +# else: +# # Replace only up to count occurrences +# new_content = "" +# remaining = existing_content +# for _ in range(count): +# if pattern not in remaining: +# break +# pos = remaining.find(pattern) +# new_content += remaining[:pos] + replacement +# remaining = remaining[pos + len(pattern) :] +# num_replacements += 1 +# new_content += remaining + +# # Write the new content atomically +# return atomic_write(file_path, new_content), num_replacements + + +# def is_file_locked(file_path: PathLike) -> bool: +# """ +# Check if a file is locked. + +# This function attempts to open the file for writing in non-blocking mode +# and checks if it fails with a "resource temporarily unavailable" error. + +# Args: +# file_path: File path to check + +# Returns: +# True if the file is locked, False otherwise +# """ +# import errno +# import fcntl + +# file_path = Path(file_path) + +# # If file doesn't exist, it's not locked +# if not file_path.exists(): +# return False + +# try: +# # Try to open the file and get an exclusive lock in non-blocking mode +# with open(file_path, "r+") as f: +# fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) +# # If we get here, the file is not locked +# fcntl.flock(f, fcntl.LOCK_UN) +# return False +# except IOError as e: +# if e.errno == errno.EAGAIN: +# # Resource temporarily unavailable = the file is locked +# return True +# # Some other error - assume not locked +# return False +# except Exception: +# # Any other exception - assume not locked +# return False + + +# @contextlib.contextmanager +# def redirect_open( +# mapping: dict[str, str] | Callable[[str], str | None], +# ): +# """ +# Context manager to intercept file opening operations. + +# Args: +# mapping: Either a dictionary mapping original paths to their replacements, +# or a function that takes a path string and returns a replacement path +# (or None to indicate the file should not be opened). + +# Raises: +# FileNotFoundError: If using a dictionary and the path is not found in it. +# """ +# # Track all places that might store an open() function +# places_to_patch = [] + +# # 1. Standard builtins.open +# original_builtin_open = builtins.open +# places_to_patch.append((builtins, "open", original_builtin_open)) + +# # 2. __builtins__ (could be different in some contexts, especially IPython) +# if isinstance(__builtins__, dict) and "open" in __builtins__: +# places_to_patch.append((__builtins__, "open", __builtins__["open"])) + +# # 3. 
Current module's globals (for the calling namespace) +# current_frame = inspect.currentframe() +# if current_frame is not None: +# caller_globals = current_frame.f_back.f_globals if current_frame.f_back else {} +# if "open" in caller_globals: +# places_to_patch.append((caller_globals, "open", caller_globals["open"])) + +# # 4. Check for IPython user namespace +# try: +# import IPython + +# ip = IPython.get_ipython() # type: ignore +# if ip and "open" in ip.user_ns: +# places_to_patch.append((ip.user_ns, "open", ip.user_ns["open"])) +# except (ImportError, AttributeError): +# pass + +# def patched_open(file, *args, **kwargs): +# # Convert PathLike objects to string if needed +# if hasattr(file, "__fspath__"): +# file_path = os.fspath(file) +# else: +# file_path = str(file) + +# if isinstance(mapping, dict): +# if file_path in mapping: +# redirected_path = mapping[file_path] +# print(f"Redirecting '{file_path}' to '{redirected_path}'") +# return original_builtin_open(redirected_path, *args, **kwargs) +# else: +# raise FileNotFoundError( +# f"Path '{file_path}' not found in redirection mapping" +# ) +# else: # mapping is a function +# redirected_path = mapping(file_path) +# if redirected_path is not None: +# print(f"Redirecting '{file_path}' to '{redirected_path}'") +# return original_builtin_open(redirected_path, *args, **kwargs) +# else: +# raise FileNotFoundError(f"Path '{file_path}' could not be redirected") + +# # Apply the patch to all places +# for obj, attr, _ in places_to_patch: +# if isinstance(obj, dict): +# obj[attr] = patched_open +# else: +# setattr(obj, attr, patched_open) + +# try: +# yield +# finally: +# # Restore all original functions +# for obj, attr, original in places_to_patch: +# if isinstance(obj, dict): +# obj[attr] = original +# else: +# setattr(obj, attr, original) + + +# def virtual_mount( +# packet: PacketLike, +# ) -> tuple[PacketLike, dict[str, str], dict[str, str]]: +# """ +# Visit all pathset within the packet, and convert them to alternative path +# representation. By default, full path is mapped to the file name. If two or +# more paths have the same file name, the second one is suffixed with "_1", the +# third one with "_2", etc. This is useful for creating a virtual mount point +# for a set of files, where the original paths are not important, but the file +# names can be used to identify the files. +# """ +# forward_lut = {} # mapping from original path to new path +# reverse_lut = {} # mapping from new path to original path +# new_packet = {} + +# for key, value in packet.items(): +# new_packet[key] = convert_pathset(value, forward_lut, reverse_lut) # type: ignore + +# return new_packet, forward_lut, reverse_lut + + +# # TODO: re-assess the structure of PathSet and consider making it recursive +# def convert_pathset(pathset: PathSet, forward_lut, reverse_lut) -> PathSet: +# """ +# Convert a pathset to a new pathset. forward_lut and reverse_lut are updated +# with the new paths. The new paths are created by replacing the original paths +# with the new paths in the forward_lut. The reverse_lut is updated with the +# original paths. If name already exists, a suffix is added to the new name to avoid +# collisions. 
+# """ +# if isinstance(pathset, (str, bytes)): +# new_name = Path(pathset).name +# if new_name in reverse_lut: +# # if the name already exists, add a suffix +# i = 1 +# while f"{new_name}_{i}" in reverse_lut: +# i += 1 +# new_name = f"{new_name}_{i}" +# forward_lut[pathset] = new_name +# reverse_lut[new_name] = pathset +# return new_name +# elif isinstance(pathset, Collection): +# return [convert_pathset(p, forward_lut, reverse_lut) for p in pathset] # type: ignore +# else: +# raise ValueError( +# f"Unsupported pathset type: {type(pathset)}. Expected str, bytes, or Collection." +# ) + + +# class WrappedPath: +# def __init__(self, path, name=None): +# self.path = Path(path) +# if name is None: +# name = self.path.name +# self.name = name + +# def __fspath__(self) -> str | bytes: +# return self.path.__fspath__() + +# def __str__(self) -> str: +# return self.name + +# def __repr__(self) -> str: +# return f"WrappedPath({self.path}): {self.name}" diff --git a/src/orcapod/hashing/arrow_utils.py b/src/orcapod/hashing/arrow_utils.py index 7dc565e..b9f35d6 100644 --- a/src/orcapod/hashing/arrow_utils.py +++ b/src/orcapod/hashing/arrow_utils.py @@ -1,8 +1,9 @@ -import json +import base64 import hashlib -from typing import Any, TYPE_CHECKING +import json from decimal import Decimal -import base64 +from typing import TYPE_CHECKING, Any + from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: diff --git a/src/orcapod/hashing/object_hashers.py b/src/orcapod/hashing/object_hashers.py index 07fc518..09b01dd 100644 --- a/src/orcapod/hashing/object_hashers.py +++ b/src/orcapod/hashing/object_hashers.py @@ -1,14 +1,15 @@ -from collections.abc import Collection, Mapping import hashlib import json -from orcapod.protocols import hashing_protocols as hp -from typing import Any +import logging import uuid from abc import ABC, abstractmethod -import logging +from collections.abc import Collection, Mapping from pathlib import Path +from typing import Any from uuid import UUID +from orcapod.protocols import hashing_protocols as hp + logger = logging.getLogger(__name__) @@ -111,6 +112,10 @@ def process_structure( ) return "CircularRef" # Don't include the actual id in hash output + # TODO: revisit the hashing of the ContentHash + if isinstance(obj, hp.ContentHash): + return (obj.method, obj.digest.hex()) + # For objects that could contain circular references, add to visited if isinstance(obj, (dict, list, tuple, set)) or not isinstance( obj, (str, int, float, bool, type(None)) diff --git a/src/orcapod/hashing/semantic_type_hashers.py b/src/orcapod/hashing/semantic_type_hashers.py index 7cd279a..712d019 100644 --- a/src/orcapod/hashing/semantic_type_hashers.py +++ b/src/orcapod/hashing/semantic_type_hashers.py @@ -1,11 +1,13 @@ +import hashlib +import os + +import pyarrow as pa + from orcapod.protocols.hashing_protocols import ( - SemanticTypeHasher, FileContentHasher, + SemanticTypeHasher, StringCacher, ) -import os -import hashlib -import pyarrow as pa class PathHasher(SemanticTypeHasher): diff --git a/src/orcapod/hashing/string_cachers.py b/src/orcapod/hashing/string_cachers.py index bb09eff..caa6c93 100644 --- a/src/orcapod/hashing/string_cachers.py +++ b/src/orcapod/hashing/string_cachers.py @@ -4,7 +4,7 @@ import sqlite3 import threading from pathlib import Path -from typing import TYPE_CHECKING, Any, TYPE_CHECKING +from typing import TYPE_CHECKING, Any from orcapod.protocols.hashing_protocols import StringCacher @@ -14,16 +14,6 @@ import redis -def _get_redis(): - """Lazy import for Redis to avoid 
circular dependencies.""" - try: - import redis - - return redis - except ImportError as e: - return None - - class TransferCacher(StringCacher): """ Takes two string cachers as source and destination. Everytime a cached value is retrieved from source, @@ -615,11 +605,14 @@ def __init__( socket_timeout: Socket timeout in seconds """ # TODO: cleanup the redis use pattern - self._redis_module = _get_redis() - if self._redis_module is None: + try: + import redis + except ImportError as e: raise ImportError( "Could not import Redis module. redis package is required for RedisCacher" - ) + ) from e + + self._redis_module = redis self.key_prefix = key_prefix self._connection_failed = False self._lock = threading.RLock() diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index e2b2796..a6e7f0f 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,3 +1,5 @@ +from abc import abstractmethod +from einops import pack from orcapod.data.kernels import KernelStream, WrappedKernel from orcapod.data.sources import SourceBase from orcapod.data.pods import CachedPod @@ -39,13 +41,16 @@ def __init__( self.pipeline_path_prefix = pipeline_path_prefix # compute invocation hash - note that empty () is passed into identity_structure to signify # identity structure of invocation with no input streams - self.invocation_hash = self.data_context.object_hasher.hash_object( + self.pipeline_node_hash = self.data_context.object_hasher.hash_object( self.identity_structure(()) ).to_string() - tag_types, _ = self.types(include_system_tags=True) + tag_types, packet_types = self.types(include_system_tags=True) self.tag_schema_hash = self.data_context.object_hasher.hash_object( tag_types ).to_string() + self.packet_schema_hash = self.data_context.object_hasher.hash_object( + packet_types + ).to_string() self.pipeline_database = pipeline_database @property @@ -55,17 +60,17 @@ def contained_kernel(self) -> dp.Kernel: ) @property + def reference(self) -> tuple[str, ...]: + return self.contained_kernel.reference + + @property + @abstractmethod def pipeline_path(self) -> tuple[str, ...]: """ Return the path to the pipeline run records. This is used to store the run-associated tag info. """ - # TODO: include output tag hash! - return ( - self.pipeline_path_prefix - + self.reference - + (self.invocation_hash, self.tag_schema_hash) - ) + ... def forward(self, *streams: dp.Stream) -> dp.Stream: if len(streams) > 0: @@ -160,6 +165,22 @@ def record_pipeline_output(self, output_stream: dp.Stream) -> None: skip_duplicates=True, ) + @property + def pipeline_path(self) -> tuple[str, ...]: + """ + Return the path to the pipeline run records. + This is used to store the run-associated tag info. + """ + return ( + self.pipeline_path_prefix # pipeline ID + + self.reference # node ID + + ( + self.pipeline_node_hash, # pipeline node ID + self.packet_schema_hash, # packet schema ID + self.tag_schema_hash, # tag schema ID + ) + ) + def get_all_records( self, include_system_columns: bool = False ) -> "pa.Table | None": @@ -200,12 +221,26 @@ def __init__( pipeline_path_prefix=pipeline_path_prefix, **kwargs, ) - self.pipeline_store = pipeline_database @property def contained_kernel(self) -> dp.Kernel: return self.pod + @property + def pipeline_path(self) -> tuple[str, ...]: + """ + Return the path to the pipeline run records. + This is used to store the run-associated tag info. 
+ """ + return ( + self.pipeline_path_prefix # pipeline ID + + self.reference # node ID + + ( + self.pipeline_node_hash, # pipeline node ID + self.tag_schema_hash, # tag schema ID + ) + ) + def __repr__(self): return f"PodNode(pod={self.pod!r})" @@ -302,11 +337,13 @@ def add_pipeline_record( pa.array([input_packet.content_hash().to_string()], type=pa.large_string()), ) + # unique entry ID is determined by the combination of tags, system_tags, and input_packet hash entry_id = self.data_context.arrow_hasher.hash_table(tag_with_hash).to_string() - # FIXME: consider and implement more robust cache lookup logic + + # check presence of an existing entry with the same entry_id existing_record = None if not skip_cache_lookup: - existing_record = self.pipeline_store.get_record_by_id( + existing_record = self.pipeline_database.get_record_by_id( self.pipeline_path, entry_id, ) @@ -340,7 +377,7 @@ def add_pipeline_record( tag.as_table(include_system_tags=True), input_packet_info ) - self.pipeline_store.add_record( + self.pipeline_database.add_record( self.pipeline_path, entry_id, combined_record, @@ -354,11 +391,11 @@ def get_all_records( self.record_path, record_id_column=constants.PACKET_RECORD_ID ) - if self.pipeline_store is None: + if self.pipeline_database is None: raise ValueError( - "Pipeline store is not configured, cannot retrieve tag info" + "Pipeline database is not configured, cannot retrieve tag info" ) - taginfo = self.pipeline_store.get_all_records( + taginfo = self.pipeline_database.get_all_records( self.pipeline_path, ) diff --git a/uv.lock b/uv.lock index ea9b66b..891c272 100644 --- a/uv.lock +++ b/uv.lock @@ -162,6 +162,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] +[[package]] +name = "alabaster" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210, upload-time = "2024-07-26T18:15:03.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, +] + [[package]] name = "annotated-types" version = "0.7.0" @@ -411,6 +420,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/64/63dbfdd83b31200ac58820a7951ddfdeed1fbee9285b0f3eae12d1357155/azure_storage_blob-12.26.0-py3-none-any.whl", hash = "sha256:8c5631b8b22b4f53ec5fff2f3bededf34cfef111e2af613ad42c9e6de00a77fe", size = 412907, upload-time = "2025-07-16T21:34:09.367Z" }, ] +[[package]] +name = "babel" +version = "2.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7d/6b/d52e42361e1aa00709585ecc30b3f9684b3ab62530771402248b1b1d6240/babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d", size = 9951852, upload-time = "2025-02-01T15:17:41.026Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, +] + +[[package]] +name = "backrefs" +version = "5.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/a7/312f673df6a79003279e1f55619abbe7daebbb87c17c976ddc0345c04c7b/backrefs-5.9.tar.gz", hash = "sha256:808548cb708d66b82ee231f962cb36faaf4f2baab032f2fbb783e9c2fdddaa59", size = 5765857, upload-time = "2025-06-22T19:34:13.97Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/19/4d/798dc1f30468134906575156c089c492cf79b5a5fd373f07fe26c4d046bf/backrefs-5.9-py310-none-any.whl", hash = "sha256:db8e8ba0e9de81fcd635f440deab5ae5f2591b54ac1ebe0550a2ca063488cd9f", size = 380267, upload-time = "2025-06-22T19:34:05.252Z" }, + { url = "https://files.pythonhosted.org/packages/55/07/f0b3375bf0d06014e9787797e6b7cc02b38ac9ff9726ccfe834d94e9991e/backrefs-5.9-py311-none-any.whl", hash = "sha256:6907635edebbe9b2dc3de3a2befff44d74f30a4562adbb8b36f21252ea19c5cf", size = 392072, upload-time = "2025-06-22T19:34:06.743Z" }, + { url = "https://files.pythonhosted.org/packages/9d/12/4f345407259dd60a0997107758ba3f221cf89a9b5a0f8ed5b961aef97253/backrefs-5.9-py312-none-any.whl", hash = "sha256:7fdf9771f63e6028d7fee7e0c497c81abda597ea45d6b8f89e8ad76994f5befa", size = 397947, upload-time = "2025-06-22T19:34:08.172Z" }, + { url = "https://files.pythonhosted.org/packages/10/bf/fa31834dc27a7f05e5290eae47c82690edc3a7b37d58f7fb35a1bdbf355b/backrefs-5.9-py313-none-any.whl", hash = "sha256:cc37b19fa219e93ff825ed1fed8879e47b4d89aa7a1884860e2db64ccd7c676b", size = 399843, upload-time = "2025-06-22T19:34:09.68Z" }, + { url = "https://files.pythonhosted.org/packages/fc/24/b29af34b2c9c41645a9f4ff117bae860291780d73880f449e0b5d948c070/backrefs-5.9-py314-none-any.whl", hash = "sha256:df5e169836cc8acb5e440ebae9aad4bf9d15e226d3bad049cf3f6a5c20cc8dc9", size = 411762, upload-time = "2025-06-22T19:34:11.037Z" }, + { url = "https://files.pythonhosted.org/packages/41/ff/392bff89415399a979be4a65357a41d92729ae8580a66073d8ec8d810f98/backrefs-5.9-py39-none-any.whl", hash = "sha256:f48ee18f6252b8f5777a22a00a09a85de0ca931658f1dd96d4406a34f3748c60", size = 380265, upload-time = "2025-06-22T19:34:12.405Z" }, +] + [[package]] name = "beartype" version = "0.21.0" @@ -905,6 +937,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "docutils" +version = "0.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444, upload-time = "2024-04-23T18:57:18.24Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" }, +] + [[package]] name = 
"einops" version = "0.8.1" @@ -1234,6 +1275,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" }, ] +[[package]] +name = "griffe" +version = "1.13.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/b5/23b91f22b7b3a7f8f62223f6664946271c0f5cb4179605a3e6bbae863920/griffe-1.13.0.tar.gz", hash = "sha256:246ea436a5e78f7fbf5f24ca8a727bb4d2a4b442a2959052eea3d0bfe9a076e0", size = 412759, upload-time = "2025-08-26T13:27:11.422Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/8c/b7cfdd8dfe48f6b09f7353323732e1a290c388bd14f216947928dc85f904/griffe-1.13.0-py3-none-any.whl", hash = "sha256:470fde5b735625ac0a36296cd194617f039e9e83e301fcbd493e2b58382d0559", size = 139365, upload-time = "2025-08-26T13:27:09.882Z" }, +] + [[package]] name = "grpcio" version = "1.74.0" @@ -1363,6 +1416,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/bd/b394387b598ed84d8d0fa90611a90bee0adc2021820ad5729f7ced74a8e2/imageio-2.37.0-py3-none-any.whl", hash = "sha256:11efa15b87bc7871b61590326b2d635439acc321cf7f8ce996f812543ce10eed", size = 315796, upload-time = "2025-01-20T02:42:34.931Z" }, ] +[[package]] +name = "imagesize" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026, upload-time = "2022-07-01T12:21:05.687Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769, upload-time = "2022-07-01T12:21:02.467Z" }, +] + [[package]] name = "importlib-metadata" version = "8.7.0" @@ -1823,6 +1885,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" }, ] +[[package]] +name = "mkdocs-autorefs" +version = "1.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/fa/9124cd63d822e2bcbea1450ae68cdc3faf3655c69b455f3a7ed36ce6c628/mkdocs_autorefs-1.4.3.tar.gz", hash = "sha256:beee715b254455c4aa93b6ef3c67579c399ca092259cc41b7d9342573ff1fc75", size = 55425, upload-time = "2025-08-26T14:23:17.223Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/4d/7123b6fa2278000688ebd338e2a06d16870aaf9eceae6ba047ea05f92df1/mkdocs_autorefs-1.4.3-py3-none-any.whl", hash = "sha256:469d85eb3114801d08e9cc55d102b3ba65917a869b893403b8987b601cf55dc9", size = 25034, upload-time = "2025-08-26T14:23:15.906Z" }, +] + [[package]] name = "mkdocs-get-deps" version = "0.2.0" @@ -1837,6 +1913,74 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134", size = 9521, upload-time = "2023-11-20T17:51:08.587Z" }, ] +[[package]] +name = "mkdocs-material" +version = "9.6.18" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "babel" }, + { name = "backrefs" }, + { name = "click" }, + { name = "colorama" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "mkdocs" }, + { name = "mkdocs-material-extensions" }, + { name = "paginate" }, + { name = "pygments" }, + { name = "pymdown-extensions" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e6/46/db0d78add5aac29dfcd0a593bcc6049c86c77ba8a25b3a5b681c190d5e99/mkdocs_material-9.6.18.tar.gz", hash = "sha256:a2eb253bcc8b66f8c6eaf8379c10ed6e9644090c2e2e9d0971c7722dc7211c05", size = 4034856, upload-time = "2025-08-22T08:21:47.575Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/0b/545a4f8d4f9057e77f1d99640eb09aaae40c4f9034707f25636caf716ff9/mkdocs_material-9.6.18-py3-none-any.whl", hash = "sha256:dbc1e146a0ecce951a4d84f97b816a54936cdc9e1edd1667fc6868878ac06701", size = 9232642, upload-time = "2025-08-22T08:21:44.52Z" }, +] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/9b/9b4c96d6593b2a541e1cb8b34899a6d021d208bb357042823d4d2cabdbe7/mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443", size = 11847, upload-time = "2023-11-22T19:09:45.208Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, +] + +[[package]] +name = "mkdocstrings" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mkdocs" }, + { name = "mkdocs-autorefs" }, + { name = "pymdown-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e2/0a/7e4776217d4802009c8238c75c5345e23014a4706a8414a62c0498858183/mkdocstrings-0.30.0.tar.gz", hash = "sha256:5d8019b9c31ddacd780b6784ffcdd6f21c408f34c0bd1103b5351d609d5b4444", size = 106597, upload-time = "2025-07-22T23:48:45.998Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/b4/3c5eac68f31e124a55d255d318c7445840fa1be55e013f507556d6481913/mkdocstrings-0.30.0-py3-none-any.whl", hash = "sha256:ae9e4a0d8c1789697ac776f2e034e2ddd71054ae1cf2c2bb1433ccfd07c226f2", size = 36579, upload-time = "2025-07-22T23:48:44.152Z" }, +] + +[package.optional-dependencies] +python = [ + { name = "mkdocstrings-python" }, +] + +[[package]] +name = "mkdocstrings-python" +version = "1.18.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "griffe" }, + { name = "mkdocs-autorefs" }, + { name = "mkdocstrings" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/13/d4/6327c4e82dda667b0ff83b6f6b6a03e7b81dfd1f28cd5eda50ffe66d546f/mkdocstrings_python-1.18.0.tar.gz", hash = 
"sha256:0b9924b4034fe9ae43604d78fe8e5107ea2c2391620124fc833043a62e83c744", size = 207601, upload-time = "2025-08-26T14:02:30.839Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/96/7ecc71bb9f01ee20f201b2531960b401159c6730aec90ec76a1b74bc81e1/mkdocstrings_python-1.18.0-py3-none-any.whl", hash = "sha256:f5056d8afe9a9683ad0c59001df1ecd9668b51c19b9a6b4dc0ff02cc9b76265a", size = 138182, upload-time = "2025-08-26T14:02:28.076Z" }, +] + [[package]] name = "mmh3" version = "5.1.0" @@ -2061,6 +2205,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, ] +[[package]] +name = "nodeenv" +version = "1.9.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, +] + [[package]] name = "numpy" version = "2.2.6" @@ -2401,15 +2554,20 @@ dev = [ { name = "jsonschema" }, { name = "minio" }, { name = "mkdocs" }, + { name = "mkdocs-material" }, + { name = "mkdocstrings", extra = ["python"] }, + { name = "pdoc" }, { name = "pyarrow-stubs" }, { name = "pygraphviz" }, { name = "pyiceberg" }, + { name = "pyright" }, { name = "pytest" }, { name = "pytest-cov" }, { name = "ray", extra = ["default"] }, { name = "redis" }, { name = "ruff" }, { name = "s3fs" }, + { name = "sphinx" }, { name = "tqdm" }, ] @@ -2449,15 +2607,20 @@ dev = [ { name = "jsonschema", specifier = ">=4.25.0" }, { name = "minio", specifier = ">=7.2.16" }, { name = "mkdocs", specifier = ">=1.6.1" }, + { name = "mkdocs-material", specifier = ">=9.6.18" }, + { name = "mkdocstrings", extras = ["python"], specifier = ">=0.30.0" }, + { name = "pdoc", specifier = ">=15.0.4" }, { name = "pyarrow-stubs", specifier = ">=20.0.0.20250716" }, { name = "pygraphviz", specifier = ">=1.14" }, { name = "pyiceberg", specifier = ">=0.9.1" }, + { name = "pyright", specifier = ">=1.1.404" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-cov", specifier = ">=6.1.1" }, { name = "ray", extras = ["default"], specifier = "==2.48.0" }, { name = "redis", specifier = ">=6.2.0" }, { name = "ruff", specifier = ">=0.11.11" }, { name = "s3fs", specifier = ">=2025.7.0" }, + { name = "sphinx", specifier = ">=8.2.3" }, { name = "tqdm", specifier = ">=4.67.1" }, ] @@ -2494,6 +2657,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "paginate" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" }, +] + [[package]] name = "pandas" version = "2.2.3" @@ -2553,6 +2725,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, ] +[[package]] +name = "pdoc" +version = "15.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jinja2" }, + { name = "markupsafe" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/5c/e94c1ab4aa2f8a9cc29d81e1c513c6216946cb3a90957ef7115b12e9363d/pdoc-15.0.4.tar.gz", hash = "sha256:cf9680f10f5b4863381f44ef084b1903f8f356acb0d4cc6b64576ba9fb712c82", size = 155678, upload-time = "2025-06-04T17:05:49.639Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/2c/87250ac73ca8730b2c4e0185b573585f0b42e09562132e6c29d00b3a9bb9/pdoc-15.0.4-py3-none-any.whl", hash = "sha256:f9028e85e7bb8475b054e69bde1f6d26fc4693d25d9fa1b1ce9009bec7f7a5c4", size = 145978, upload-time = "2025-06-04T17:05:48.473Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -3101,6 +3287,19 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymdown-extensions" +version = "10.16.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/b3/6d2b3f149bc5413b0a29761c2c5832d8ce904a1d7f621e86616d96f505cc/pymdown_extensions-10.16.1.tar.gz", hash = "sha256:aace82bcccba3efc03e25d584e6a22d27a8e17caa3f4dd9f207e49b787aa9a91", size = 853277, upload-time = "2025-07-28T16:19:34.167Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e4/06/43084e6cbd4b3bc0e80f6be743b2e79fbc6eed8de9ad8c629939fa55d972/pymdown_extensions-10.16.1-py3-none-any.whl", hash = "sha256:d6ba157a6c03146a7fb122b2b9a121300056384eafeec9c9f9e584adfdb2a32d", size = 266178, upload-time = "2025-07-28T16:19:31.401Z" }, +] + [[package]] name = "pymysql" version = "1.1.1" @@ -3119,6 +3318,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl", hash = "sha256:a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf", size = 111120, upload-time = "2025-03-25T05:01:24.908Z" }, ] +[[package]] +name = "pyright" +version = "1.1.404" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nodeenv" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e2/6e/026be64c43af681d5632722acd100b06d3d39f383ec382ff50a71a6d5bce/pyright-1.1.404.tar.gz", hash = "sha256:455e881a558ca6be9ecca0b30ce08aa78343ecc031d37a198ffa9a7a1abeb63e", size = 4065679, upload-time = "2025-08-20T18:46:14.029Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/84/30/89aa7f7d7a875bbb9a577d4b1dc5a3e404e3d2ae2657354808e905e358e0/pyright-1.1.404-py3-none-any.whl", hash = "sha256:c7b7ff1fdb7219c643079e4c3e7d4125f0dafcc19d253b47e898d130ea426419", size = 5902951, upload-time = "2025-08-20T18:46:12.096Z" }, +] + [[package]] name = "pysocks" version = "1.7.1" @@ -3494,6 +3706,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424, upload-time = "2024-11-01T16:43:55.817Z" }, ] +[[package]] +name = "roman-numerals-py" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/76/48fd56d17c5bdbdf65609abbc67288728a98ed4c02919428d4f52d23b24b/roman_numerals_py-3.1.0.tar.gz", hash = "sha256:be4bf804f083a4ce001b5eb7e3c0862479d10f94c936f6c4e5f250aa5ff5bd2d", size = 9017, upload-time = "2025-02-22T07:34:54.333Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/97/d2cbbaa10c9b826af0e10fdf836e1bf344d9f0abb873ebc34d1f49642d3f/roman_numerals_py-3.1.0-py3-none-any.whl", hash = "sha256:9da2ad2fb670bcf24e81070ceb3be72f6c11c440d73bd579fbeca1e9f330954c", size = 7742, upload-time = "2025-02-22T07:34:52.422Z" }, +] + [[package]] name = "rpds-py" version = "0.26.0" @@ -3726,6 +3947,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl", hash = "sha256:c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4", size = 61946, upload-time = "2025-07-03T10:06:29.599Z" }, ] +[[package]] +name = "snowballstemmer" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/75/a7/9810d872919697c9d01295633f5d574fb416d47e535f258272ca1f01f447/snowballstemmer-3.0.1.tar.gz", hash = "sha256:6d5eeeec8e9f84d4d56b847692bacf79bc2c8e90c7f80ca4444ff8b6f2e52895", size = 105575, upload-time = "2025-05-09T16:34:51.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/78/3565d011c61f5a43488987ee32b6f3f656e7f107ac2782dd57bdd7d91d9a/snowballstemmer-3.0.1-py3-none-any.whl", hash = "sha256:6cd7b3897da8d6c9ffb968a6781fa6532dce9c3618a4b127d920dab764a19064", size = 103274, upload-time = "2025-05-09T16:34:50.371Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -3735,6 +3965,88 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, ] +[[package]] +name = "sphinx" +version = "8.2.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "alabaster" }, + { name = "babel" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "docutils" }, + { name = "imagesize" }, + { name = "jinja2" }, + { name = "packaging" }, + { name = "pygments" }, + { name = "requests" }, + { name = "roman-numerals-py" }, + { name = "snowballstemmer" }, + { name = "sphinxcontrib-applehelp" }, + { name = "sphinxcontrib-devhelp" }, + { name = "sphinxcontrib-htmlhelp" }, + { name = "sphinxcontrib-jsmath" }, + { name = "sphinxcontrib-qthelp" }, + { name = 
"sphinxcontrib-serializinghtml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/ad/4360e50ed56cb483667b8e6dadf2d3fda62359593faabbe749a27c4eaca6/sphinx-8.2.3.tar.gz", hash = "sha256:398ad29dee7f63a75888314e9424d40f52ce5a6a87ae88e7071e80af296ec348", size = 8321876, upload-time = "2025-03-02T22:31:59.658Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/31/53/136e9eca6e0b9dc0e1962e2c908fbea2e5ac000c2a2fbd9a35797958c48b/sphinx-8.2.3-py3-none-any.whl", hash = "sha256:4405915165f13521d875a8c29c8970800a0141c14cc5416a38feca4ea5d9b9c3", size = 3589741, upload-time = "2025-03-02T22:31:56.836Z" }, +] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053, upload-time = "2024-07-29T01:09:00.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300, upload-time = "2024-07-29T01:08:58.99Z" }, +] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967, upload-time = "2024-07-29T01:09:23.417Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530, upload-time = "2024-07-29T01:09:21.945Z" }, +] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617, upload-time = "2024-07-29T01:09:37.889Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705, upload-time = "2024-07-29T01:09:36.407Z" }, +] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787, upload-time = "2019-01-21T16:10:16.347Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071, 
upload-time = "2019-01-21T16:10:14.333Z" }, +] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165, upload-time = "2024-07-29T01:09:56.435Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743, upload-time = "2024-07-29T01:09:54.885Z" }, +] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080, upload-time = "2024-07-29T01:10:09.332Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, +] + [[package]] name = "stack-data" version = "0.6.3" From 0fe06b387eebefb3e5072d64d80b193457be6ded Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 30 Aug 2025 13:38:21 -0700 Subject: [PATCH 214/224] refactor: remove unused module --- src/orcapod/utils/stream_utils.py | 132 ------------------------------ 1 file changed, 132 deletions(-) delete mode 100644 src/orcapod/utils/stream_utils.py diff --git a/src/orcapod/utils/stream_utils.py b/src/orcapod/utils/stream_utils.py deleted file mode 100644 index 4246088..0000000 --- a/src/orcapod/utils/stream_utils.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Utility functions for handling tags -""" - -from collections.abc import Collection, Mapping -from typing import TypeVar, Hashable, Any - -from orcapod.types import Packet, Tag, TypeSpec - - -K = TypeVar("K", bound=Hashable) -V = TypeVar("V") - - -def merge_dicts(left: dict[K, V], right: dict[K, V]) -> dict[K, V]: - merged = left.copy() - for key, right_value in right.items(): - if key in merged: - if merged[key] != right_value: - raise ValueError( - f"Conflicting values for key '{key}': {merged[key]} vs {right_value}" - ) - else: - merged[key] = right_value - return merged - - -def common_elements(*values) -> Collection[str]: - """ - Returns the common keys between all lists of values. The identified common elements are - order preserved with respect to the first list of values - """ - if len(values) == 0: - return [] - common_keys = set(values[0]) - for tag in values[1:]: - common_keys.intersection_update(tag) - # Preserve the order of the first list of values - common_keys = [k for k in values[0] if k in common_keys] - return common_keys - - -def join_tags(tag1: Mapping[K, V], tag2: Mapping[K, V]) -> dict[K, V] | None: - """ - Joins two tags together. If the tags have the same key, the value must be the same or None will be returned. 
- """ - # create a dict copy of tag1 - joined_tag = dict(tag1) - for k, v in tag2.items(): - if k in joined_tag and joined_tag[k] != v: - # Detected a mismatch in the tags, return None - return None - else: - joined_tag[k] = v - return joined_tag - - -def semijoin_tags( - tag1: Mapping[K, V], tag2: Mapping[K, V], target_keys: Collection[K] | None = None -) -> dict[K, V] | None: - """ - Semijoin two tags. If the tags have the same key, the value must be the same or None will be returned. If all shared - key's value match, tag1 would be returned - """ - if target_keys is None: - target_keys = set(tag1.keys()).intersection(set(tag2.keys())) - if not target_keys: - return dict(tag1) - - for key in target_keys: - if tag1[key] != tag2[key]: - return None - return dict(tag1) - - -def check_packet_compatibility(packet1: Packet, packet2: Packet) -> bool: - """ - Checks if two packets are compatible. If the packets have the same key, the value must be the same or False will be returned. - If the packets have different keys, they are compatible. - """ - for k in packet1.keys(): - if k in packet2 and packet1[k] != packet2[k]: - return False - return True - - -def batch_tags(all_tags: Collection[Tag]) -> Tag: - """ - Batches the tags together. Grouping values under the same key into a list. - """ - all_keys: set[str] = set() - for tag in all_tags: - all_keys.update(tag.keys()) - batch_tag = {key: [] for key in all_keys} # Initialize batch_tag with all keys - for tag in all_tags: - for k in all_keys: - batch_tag[k].append( - tag.get(k, None) - ) # Append the value or None if the key is not present - return batch_tag - - -def batch_packet( - all_packets: Collection[Packet], drop_missing_keys: bool = True -) -> Packet: - """ - Batches the packets together. Grouping values under the same key into a list. - If all packets do not have the same key, raise an error unless drop_missing_keys is True - """ - all_keys: set[str] = set() - for p in all_packets: - all_keys.update(p.keys()) - batch_packet = {key: [] for key in all_keys} - for p in all_packets: - for k in all_keys: - if k not in p: - if drop_missing_keys: - continue - else: - raise KeyError(f"Packet {p} does not have key {k}") - batch_packet[k].append(p[k]) - return batch_packet - - -def fill_missing(dict, keys, default=None): - """ - Fill the missing keys in the dictionary with the specified default value. - """ - for key in keys: - if key not in dict: - dict[key] = default - return dict From 340cfb571bbe1d2c55837bb0f853a2109b0bd220 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 30 Aug 2025 13:51:13 -0700 Subject: [PATCH 215/224] refactor: rename data to core subpackage --- src/orcapod/__init__.py | 10 +++---- src/orcapod/{data => core}/__init__.py | 0 .../{data => core}/arrow_data_utils.py | 2 +- src/orcapod/{data => core}/base.py | 0 .../{data => core}/datagrams/__init__.py | 0 .../datagrams/arrow_datagram.py | 4 +-- .../datagrams/arrow_tag_packet.py | 4 +-- src/orcapod/{data => core}/datagrams/base.py | 2 +- .../{data => core}/datagrams/dict_datagram.py | 4 +-- .../datagrams/dict_tag_packet.py | 4 +-- src/orcapod/{data => core}/kernels.py | 6 ++-- .../{data => core}/operators/__init__.py | 0 src/orcapod/{data => core}/operators/base.py | 2 +- src/orcapod/{data => core}/operators/batch.py | 7 +++-- src/orcapod/{data => core}/operators/join.py | 4 +-- .../{data => core}/operators/mappers.py | 6 ++-- .../{data => core}/operators/semijoin.py | 4 +-- src/orcapod/{data => core}/pods.py | 10 +++---- .../{data => core}/sources/__init__.py | 0 .../sources/arrow_table_source.py | 10 +++---- src/orcapod/{data => core}/sources/base.py | 4 +-- .../{data => core}/sources/csv_source.py | 5 ++-- .../sources/delta_table_source.py | 6 ++-- .../{data => core}/sources/dict_source.py | 6 ++-- .../{data => core}/sources/list_source.py | 10 +++---- .../sources/manual_table_source.py | 28 ++++++++++++------- .../{data => core}/sources/source_registry.py | 0 src/orcapod/{data => core}/streams.py | 16 +++++------ .../{data => core}/system_constants.py | 0 src/orcapod/{data => core}/trackers.py | 4 +-- src/orcapod/types.py | 2 +- 31 files changed, 84 insertions(+), 76 deletions(-) rename src/orcapod/{data => core}/__init__.py (100%) rename src/orcapod/{data => core}/arrow_data_utils.py (97%) rename src/orcapod/{data => core}/base.py (100%) rename src/orcapod/{data => core}/datagrams/__init__.py (100%) rename src/orcapod/{data => core}/datagrams/arrow_datagram.py (99%) rename src/orcapod/{data => core}/datagrams/arrow_tag_packet.py (99%) rename src/orcapod/{data => core}/datagrams/base.py (99%) rename src/orcapod/{data => core}/datagrams/dict_datagram.py (99%) rename src/orcapod/{data => core}/datagrams/dict_tag_packet.py (99%) rename src/orcapod/{data => core}/kernels.py (98%) rename src/orcapod/{data => core}/operators/__init__.py (100%) rename src/orcapod/{data => core}/operators/base.py (99%) rename src/orcapod/{data => core}/operators/batch.py (93%) rename src/orcapod/{data => core}/operators/join.py (97%) rename src/orcapod/{data => core}/operators/mappers.py (98%) rename src/orcapod/{data => core}/operators/semijoin.py (98%) rename src/orcapod/{data => core}/pods.py (99%) rename src/orcapod/{data => core}/sources/__init__.py (100%) rename src/orcapod/{data => core}/sources/arrow_table_source.py (92%) rename src/orcapod/{data => core}/sources/base.py (99%) rename src/orcapod/{data => core}/sources/csv_source.py (93%) rename src/orcapod/{data => core}/sources/delta_table_source.py (97%) rename src/orcapod/{data => core}/sources/dict_source.py (95%) rename src/orcapod/{data => core}/sources/list_source.py (96%) rename src/orcapod/{data => core}/sources/manual_table_source.py (93%) rename src/orcapod/{data => core}/sources/source_registry.py (100%) rename src/orcapod/{data => core}/streams.py (99%) rename src/orcapod/{data => core}/system_constants.py (100%) rename src/orcapod/{data => core}/trackers.py (98%) diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 3064dd4..3311d8b 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,8 
+1,8 @@ -from .data import DEFAULT_TRACKER_MANAGER -from .data.pods import function_pod, FunctionPod, CachedPod -from .data import streams -from .data import operators -from .data import sources +from .core import DEFAULT_TRACKER_MANAGER +from .core.pods import function_pod, FunctionPod, CachedPod +from .core import streams +from .core import operators +from .core import sources from . import databases from .pipeline import Pipeline diff --git a/src/orcapod/data/__init__.py b/src/orcapod/core/__init__.py similarity index 100% rename from src/orcapod/data/__init__.py rename to src/orcapod/core/__init__.py diff --git a/src/orcapod/data/arrow_data_utils.py b/src/orcapod/core/arrow_data_utils.py similarity index 97% rename from src/orcapod/data/arrow_data_utils.py rename to src/orcapod/core/arrow_data_utils.py index d9e8bf8..2443cdb 100644 --- a/src/orcapod/data/arrow_data_utils.py +++ b/src/orcapod/core/arrow_data_utils.py @@ -1,7 +1,7 @@ # Collection of functions to work with Arrow table data that underlies streams and/or datagrams from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from collections.abc import Collection if TYPE_CHECKING: diff --git a/src/orcapod/data/base.py b/src/orcapod/core/base.py similarity index 100% rename from src/orcapod/data/base.py rename to src/orcapod/core/base.py diff --git a/src/orcapod/data/datagrams/__init__.py b/src/orcapod/core/datagrams/__init__.py similarity index 100% rename from src/orcapod/data/datagrams/__init__.py rename to src/orcapod/core/datagrams/__init__.py diff --git a/src/orcapod/data/datagrams/arrow_datagram.py b/src/orcapod/core/datagrams/arrow_datagram.py similarity index 99% rename from src/orcapod/data/datagrams/arrow_datagram.py rename to src/orcapod/core/datagrams/arrow_datagram.py index 2ff463e..db34ad6 100644 --- a/src/orcapod/data/datagrams/arrow_datagram.py +++ b/src/orcapod/core/datagrams/arrow_datagram.py @@ -5,8 +5,8 @@ import pyarrow as pa from orcapod import contexts -from orcapod.data.datagrams.base import BaseDatagram -from orcapod.data.system_constants import constants +from orcapod.core.datagrams.base import BaseDatagram +from orcapod.core.system_constants import constants from orcapod.types import DataValue, PythonSchema from orcapod.protocols.hashing_protocols import ContentHash from orcapod.utils import arrow_utils diff --git a/src/orcapod/data/datagrams/arrow_tag_packet.py b/src/orcapod/core/datagrams/arrow_tag_packet.py similarity index 99% rename from src/orcapod/data/datagrams/arrow_tag_packet.py rename to src/orcapod/core/datagrams/arrow_tag_packet.py index e0be151..b5d93cd 100644 --- a/src/orcapod/data/datagrams/arrow_tag_packet.py +++ b/src/orcapod/core/datagrams/arrow_tag_packet.py @@ -5,14 +5,14 @@ import pyarrow as pa -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from orcapod import contexts from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils -from orcapod.data.datagrams.arrow_datagram import ArrowDatagram +from orcapod.core.datagrams.arrow_datagram import ArrowDatagram logger = logging.getLogger(__name__) diff --git a/src/orcapod/data/datagrams/base.py b/src/orcapod/core/datagrams/base.py similarity index 99% rename from src/orcapod/data/datagrams/base.py rename to src/orcapod/core/datagrams/base.py index 
9199be7..48cbc53 100644 --- a/src/orcapod/data/datagrams/base.py +++ b/src/orcapod/core/datagrams/base.py @@ -21,7 +21,7 @@ from collections.abc import Collection, Iterator, Mapping from typing import Self, TypeAlias from orcapod import contexts -from orcapod.data.base import ContentIdentifiableBase +from orcapod.core.base import ContentIdentifiableBase from orcapod.protocols.hashing_protocols import ContentHash import pyarrow as pa diff --git a/src/orcapod/data/datagrams/dict_datagram.py b/src/orcapod/core/datagrams/dict_datagram.py similarity index 99% rename from src/orcapod/data/datagrams/dict_datagram.py rename to src/orcapod/core/datagrams/dict_datagram.py index a56bf1a..ec9825e 100644 --- a/src/orcapod/data/datagrams/dict_datagram.py +++ b/src/orcapod/core/datagrams/dict_datagram.py @@ -3,9 +3,9 @@ from typing import Self, cast, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from orcapod import contexts -from orcapod.data.datagrams.base import BaseDatagram +from orcapod.core.datagrams.base import BaseDatagram from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import arrow_utils diff --git a/src/orcapod/data/datagrams/dict_tag_packet.py b/src/orcapod/core/datagrams/dict_tag_packet.py similarity index 99% rename from src/orcapod/data/datagrams/dict_tag_packet.py rename to src/orcapod/core/datagrams/dict_tag_packet.py index be6160b..afc8aa2 100644 --- a/src/orcapod/data/datagrams/dict_tag_packet.py +++ b/src/orcapod/core/datagrams/dict_tag_packet.py @@ -4,9 +4,9 @@ import pyarrow as pa -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from orcapod import contexts -from orcapod.data.datagrams.dict_datagram import DictDatagram +from orcapod.core.datagrams.dict_datagram import DictDatagram from orcapod.utils import arrow_utils from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema, PythonSchemaLike diff --git a/src/orcapod/data/kernels.py b/src/orcapod/core/kernels.py similarity index 98% rename from src/orcapod/data/kernels.py rename to src/orcapod/core/kernels.py index 2dde9c7..40bf6b1 100644 --- a/src/orcapod/data/kernels.py +++ b/src/orcapod/core/kernels.py @@ -4,9 +4,9 @@ from typing import Any from orcapod.protocols import data_protocols as dp import logging -from orcapod.data.streams import KernelStream -from orcapod.data.base import LabeledContentIdentifiableBase -from orcapod.data.trackers import DEFAULT_TRACKER_MANAGER +from orcapod.core.streams import KernelStream +from orcapod.core.base import LabeledContentIdentifiableBase +from orcapod.core.trackers import DEFAULT_TRACKER_MANAGER from orcapod.types import PythonSchema logger = logging.getLogger(__name__) diff --git a/src/orcapod/data/operators/__init__.py b/src/orcapod/core/operators/__init__.py similarity index 100% rename from src/orcapod/data/operators/__init__.py rename to src/orcapod/core/operators/__init__.py diff --git a/src/orcapod/data/operators/base.py b/src/orcapod/core/operators/base.py similarity index 99% rename from src/orcapod/data/operators/base.py rename to src/orcapod/core/operators/base.py index 1d5f07c..01c0123 100644 --- a/src/orcapod/data/operators/base.py +++ b/src/orcapod/core/operators/base.py @@ -1,5 +1,5 @@ from ast import Not -from 
orcapod.data.kernels import TrackedKernelBase +from orcapod.core.kernels import TrackedKernelBase from orcapod.protocols import data_protocols as dp from orcapod.types import PythonSchema from abc import abstractmethod diff --git a/src/orcapod/data/operators/batch.py b/src/orcapod/core/operators/batch.py similarity index 93% rename from src/orcapod/data/operators/batch.py rename to src/orcapod/core/operators/batch.py index 1281ea4..284b0ad 100644 --- a/src/orcapod/data/operators/batch.py +++ b/src/orcapod/core/operators/batch.py @@ -1,9 +1,9 @@ -from orcapod.data.operators.base import UnaryOperator +from orcapod.core.operators.base import UnaryOperator from collections.abc import Collection from orcapod.protocols import data_protocols as dp from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule -from orcapod.data.streams import TableStream +from orcapod.core.streams import TableStream if TYPE_CHECKING: import pyarrow as pa @@ -94,7 +94,8 @@ def op_output_types( batched_tag_types = {k: list[v] for k, v in tag_types.items()} batched_packet_types = {k: list[v] for k, v in packet_types.items()} - return batched_tag_types, batched_packet_types + # TODO: check if this is really necessary + return PythonSchema(batched_tag_types), PythonSchema(batched_packet_types) def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: return ( diff --git a/src/orcapod/data/operators/join.py b/src/orcapod/core/operators/join.py similarity index 97% rename from src/orcapod/data/operators/join.py rename to src/orcapod/core/operators/join.py index f6ecbb7..2b454a1 100644 --- a/src/orcapod/data/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -1,12 +1,12 @@ from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import TableStream +from orcapod.core.streams import TableStream from orcapod.types import PythonSchema from orcapod.utils import types_utils from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from collections.abc import Collection from orcapod.errors import InputValidationError -from orcapod.data.operators.base import NonZeroInputOperator +from orcapod.core.operators.base import NonZeroInputOperator if TYPE_CHECKING: import pyarrow as pa diff --git a/src/orcapod/data/operators/mappers.py b/src/orcapod/core/operators/mappers.py similarity index 98% rename from src/orcapod/data/operators/mappers.py rename to src/orcapod/core/operators/mappers.py index c3042c3..7ac127a 100644 --- a/src/orcapod/data/operators/mappers.py +++ b/src/orcapod/core/operators/mappers.py @@ -1,12 +1,12 @@ from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import TableStream +from orcapod.core.streams import TableStream from orcapod.types import PythonSchema from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from collections.abc import Mapping from orcapod.errors import InputValidationError -from orcapod.data.system_constants import constants -from orcapod.data.operators.base import UnaryOperator +from orcapod.core.system_constants import constants +from orcapod.core.operators.base import UnaryOperator if TYPE_CHECKING: import pyarrow as pa diff --git a/src/orcapod/data/operators/semijoin.py b/src/orcapod/core/operators/semijoin.py similarity index 98% rename from src/orcapod/data/operators/semijoin.py rename to src/orcapod/core/operators/semijoin.py index de537ee..8d70756 100644 --- a/src/orcapod/data/operators/semijoin.py +++ 
b/src/orcapod/core/operators/semijoin.py @@ -1,11 +1,11 @@ from orcapod.protocols import data_protocols as dp -from orcapod.data.streams import TableStream +from orcapod.core.streams import TableStream from orcapod.utils import types_utils from orcapod.types import PythonSchema from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from orcapod.errors import InputValidationError -from orcapod.data.operators.base import BinaryOperator +from orcapod.core.operators.base import BinaryOperator if TYPE_CHECKING: import pyarrow as pa diff --git a/src/orcapod/data/pods.py b/src/orcapod/core/pods.py similarity index 99% rename from src/orcapod/data/pods.py rename to src/orcapod/core/pods.py index d29bc64..5fd94d4 100644 --- a/src/orcapod/data/pods.py +++ b/src/orcapod/core/pods.py @@ -7,14 +7,14 @@ from typing import TYPE_CHECKING, Any, Literal from orcapod import contexts -from orcapod.data.datagrams import ( +from orcapod.core.datagrams import ( ArrowPacket, DictPacket, ) -from orcapod.data.kernels import KernelStream, TrackedKernelBase -from orcapod.data.operators import Join -from orcapod.data.streams import EfficientPodResultStream, LazyPodResultStream -from orcapod.data.system_constants import constants +from orcapod.core.kernels import KernelStream, TrackedKernelBase +from orcapod.core.operators import Join +from orcapod.core.streams import EfficientPodResultStream, LazyPodResultStream +from orcapod.core.system_constants import constants from orcapod.hashing.hash_utils import get_function_components, get_function_signature from orcapod.protocols import data_protocols as dp from orcapod.protocols import hashing_protocols as hp diff --git a/src/orcapod/data/sources/__init__.py b/src/orcapod/core/sources/__init__.py similarity index 100% rename from src/orcapod/data/sources/__init__.py rename to src/orcapod/core/sources/__init__.py diff --git a/src/orcapod/data/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py similarity index 92% rename from src/orcapod/data/sources/arrow_table_source.py rename to src/orcapod/core/sources/arrow_table_source.py index 801adf3..e2d548f 100644 --- a/src/orcapod/data/sources/arrow_table_source.py +++ b/src/orcapod/core/sources/arrow_table_source.py @@ -3,15 +3,15 @@ from typing import TYPE_CHECKING, Any -from orcapod.data.streams import TableStream +from orcapod.core.streams import TableStream from orcapod.protocols import data_protocols as dp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule -from orcapod.data.system_constants import constants -from orcapod.data import arrow_data_utils -from orcapod.data.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry +from orcapod.core.system_constants import constants +from orcapod.core import arrow_data_utils +from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry -from orcapod.data.sources.base import SourceBase +from orcapod.core.sources.base import SourceBase if TYPE_CHECKING: import pyarrow as pa diff --git a/src/orcapod/data/sources/base.py b/src/orcapod/core/sources/base.py similarity index 99% rename from src/orcapod/data/sources/base.py rename to src/orcapod/core/sources/base.py index 5b6ddc2..c5f9b42 100644 --- a/src/orcapod/data/sources/base.py +++ b/src/orcapod/core/sources/base.py @@ -3,8 +3,8 @@ from typing import TYPE_CHECKING, Any -from orcapod.data.kernels import TrackedKernelBase -from orcapod.data.streams import ( +from orcapod.core.kernels import 
TrackedKernelBase +from orcapod.core.streams import ( KernelStream, StatefulStreamBase, ) diff --git a/src/orcapod/data/sources/csv_source.py b/src/orcapod/core/sources/csv_source.py similarity index 93% rename from src/orcapod/data/sources/csv_source.py rename to src/orcapod/core/sources/csv_source.py index d3b4709..97a05de 100644 --- a/src/orcapod/data/sources/csv_source.py +++ b/src/orcapod/core/sources/csv_source.py @@ -1,7 +1,7 @@ from typing import TYPE_CHECKING, Any -from orcapod.data.streams import ( +from orcapod.core.streams import ( TableStream, ) from orcapod.protocols import data_protocols as dp @@ -17,7 +17,7 @@ pd = LazyModule("pandas") pa = LazyModule("pyarrow") -from orcapod.data.sources.base import SourceBase +from orcapod.core.sources.base import SourceBase class CSVSource(SourceBase): @@ -35,7 +35,6 @@ def __init__( self.tag_columns = tag_columns or [] if source_id is None: source_id = self.file_path - self.source_id = source_id def source_identity_structure(self) -> Any: return (self.__class__.__name__, self.source_id, tuple(self.tag_columns)) diff --git a/src/orcapod/data/sources/delta_table_source.py b/src/orcapod/core/sources/delta_table_source.py similarity index 97% rename from src/orcapod/data/sources/delta_table_source.py rename to src/orcapod/core/sources/delta_table_source.py index eb4cd52..fca9dcd 100644 --- a/src/orcapod/data/sources/delta_table_source.py +++ b/src/orcapod/core/sources/delta_table_source.py @@ -2,15 +2,15 @@ from typing import TYPE_CHECKING, Any -from orcapod.data.streams import TableStream +from orcapod.core.streams import TableStream from orcapod.protocols import data_protocols as dp from orcapod.types import PathLike, PythonSchema from orcapod.utils.lazy_module import LazyModule from pathlib import Path -from orcapod.data.sources.base import SourceBase -from orcapod.data.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry +from orcapod.core.sources.base import SourceBase +from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry from deltalake import DeltaTable from deltalake.exceptions import TableNotFoundError diff --git a/src/orcapod/data/sources/dict_source.py b/src/orcapod/core/sources/dict_source.py similarity index 95% rename from src/orcapod/data/sources/dict_source.py rename to src/orcapod/core/sources/dict_source.py index 6cc3ae4..4c0d324 100644 --- a/src/orcapod/data/sources/dict_source.py +++ b/src/orcapod/core/sources/dict_source.py @@ -5,15 +5,15 @@ from orcapod.protocols import data_protocols as dp from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils.lazy_module import LazyModule -from orcapod.data.system_constants import constants -from orcapod.data.sources.arrow_table_source import ArrowTableSource +from orcapod.core.system_constants import constants +from orcapod.core.sources.arrow_table_source import ArrowTableSource if TYPE_CHECKING: import pyarrow as pa else: pa = LazyModule("pyarrow") -from orcapod.data.sources.base import SourceBase +from orcapod.core.sources.base import SourceBase def add_source_field( diff --git a/src/orcapod/data/sources/list_source.py b/src/orcapod/core/sources/list_source.py similarity index 96% rename from src/orcapod/data/sources/list_source.py rename to src/orcapod/core/sources/list_source.py index 20503e9..53a3b32 100644 --- a/src/orcapod/data/sources/list_source.py +++ b/src/orcapod/core/sources/list_source.py @@ -5,9 +5,9 @@ from deltalake import DeltaTable, write_deltalake from pyarrow.lib import Table 
-from orcapod.data.datagrams import DictTag -from orcapod.data.kernels import TrackedKernelBase -from orcapod.data.streams import ( +from orcapod.core.datagrams import DictTag +from orcapod.core.kernels import TrackedKernelBase +from orcapod.core.streams import ( TableStream, KernelStream, StatefulStreamBase, @@ -17,7 +17,7 @@ from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from orcapod.semantic_types import infer_python_schema_from_pylist_data if TYPE_CHECKING: @@ -29,7 +29,7 @@ pd = LazyModule("pandas") pa = LazyModule("pyarrow") -from orcapod.data.sources.base import SourceBase +from orcapod.core.sources.base import SourceBase class ListSource(SourceBase): diff --git a/src/orcapod/data/sources/manual_table_source.py b/src/orcapod/core/sources/manual_table_source.py similarity index 93% rename from src/orcapod/data/sources/manual_table_source.py rename to src/orcapod/core/sources/manual_table_source.py index 0e9d49b..46e544f 100644 --- a/src/orcapod/data/sources/manual_table_source.py +++ b/src/orcapod/core/sources/manual_table_source.py @@ -6,18 +6,19 @@ from deltalake.exceptions import TableNotFoundError from pyarrow.lib import Table -from orcapod.data.kernels import TrackedKernelBase -from orcapod.data.streams import ( +from orcapod.core.kernels import TrackedKernelBase +from orcapod.core.streams import ( TableStream, KernelStream, StatefulStreamBase, ) +from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry from orcapod.errors import DuplicateTagError from orcapod.protocols import data_protocols as dp -from orcapod.types import DataValue, PythonSchema +from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from orcapod.semantic_types import infer_python_schema_from_pylist_data if TYPE_CHECKING: @@ -29,7 +30,7 @@ pd = LazyModule("pandas") pa = LazyModule("pyarrow") -from orcapod.data.sources.base import SourceBase +from orcapod.core.sources.base import SourceBase class ManualDeltaTableSource(SourceBase): @@ -45,8 +46,10 @@ class ManualDeltaTableSource(SourceBase): def __init__( self, table_path: str | Path, - python_schema: dict[str, type] | None = None, + python_schema: PythonSchemaLike | None = None, tag_columns: Collection[str] | None = None, + source_name: str | None = None, + source_registry: SourceRegistry | None = None, **kwargs, ) -> None: """ @@ -54,6 +57,11 @@ def __init__( """ super().__init__(**kwargs) + if source_name is None: + source_name = Path(table_path).name + + self._source_name = source_name + self.table_path = Path(table_path) self._delta_table: DeltaTable | None = None self.load_delta_table() @@ -69,7 +77,7 @@ def __init__( "At least one tag column must be provided when creating a new Delta table." 
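# A hypothetical construction example, not part of the patch, showing the
# ManualDeltaTableSource parameters touched in this hunk (python_schema now
# accepts a PythonSchemaLike, and source_name defaults to the table path's
# name).  The path, schema, and tag column below are made up.
from orcapod.core.sources.manual_table_source import ManualDeltaTableSource

source = ManualDeltaTableSource(
    "data/subjects_delta",                 # table_path; a schema and at least one
    python_schema={"subject_id": str,      # tag column are needed when the table
                   "score": float},        # does not exist yet, per the check above
    tag_columns=["subject_id"],
    source_name="subjects",                # omitted -> Path(table_path).name
)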
) arrow_schema = ( - self._data_context.type_converter.python_schema_to_arrow_schema( + self.data_context.type_converter.python_schema_to_arrow_schema( python_schema ) ) @@ -84,7 +92,7 @@ def __init__( else: arrow_schema = pa.schema(self._delta_table.schema().to_arrow()) python_schema = ( - self._data_context.type_converter.arrow_schema_to_python_schema( + self.data_context.type_converter.arrow_schema_to_python_schema( arrow_schema ) ) @@ -103,8 +111,8 @@ def __init__( self.tag_columns = list(tag_columns) if tag_columns else [] @property - def kernel_id(self) -> tuple[str, ...]: - return (self.__class__.__name__, str(self.table_path)) + def reference(self) -> tuple[str, ...]: + return ("manual_delta", self._source_name) @property def delta_table_version(self) -> int | None: diff --git a/src/orcapod/data/sources/source_registry.py b/src/orcapod/core/sources/source_registry.py similarity index 100% rename from src/orcapod/data/sources/source_registry.py rename to src/orcapod/core/sources/source_registry.py diff --git a/src/orcapod/data/streams.py b/src/orcapod/core/streams.py similarity index 99% rename from src/orcapod/data/streams.py rename to src/orcapod/core/streams.py index b2a2909..b934f60 100644 --- a/src/orcapod/data/streams.py +++ b/src/orcapod/core/streams.py @@ -7,13 +7,13 @@ from typing import TYPE_CHECKING, Any, cast from orcapod import contexts -from orcapod.data.base import LabeledContentIdentifiableBase -from orcapod.data.datagrams import ( +from orcapod.core.base import LabeledContentIdentifiableBase +from orcapod.core.datagrams import ( ArrowPacket, ArrowTag, DictTag, ) -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from orcapod.protocols import data_protocols as dp from orcapod.types import PythonSchema from orcapod.utils import arrow_utils @@ -70,7 +70,7 @@ def join(self, other_stream: dp.Stream, label: str | None = None) -> dp.Stream: Joins this stream with another stream, returning a new stream that contains the combined data from both streams. """ - from orcapod.data.operators import Join + from orcapod.core.operators import Join return Join()(self, other_stream, label=label) # type: ignore @@ -83,7 +83,7 @@ def semi_join( Performs a semi-join with another stream, returning a new stream that contains only the packets from this stream that have matching tags in the other stream. """ - from orcapod.data.operators import SemiJoin + from orcapod.core.operators import SemiJoin return SemiJoin()(self, other_stream, label=label) # type: ignore @@ -97,7 +97,7 @@ def map_tags( Maps the tags in this stream according to the provided name_map. If drop_unmapped is True, any tags that are not in the name_map will be dropped. """ - from orcapod.data.operators import MapTags + from orcapod.core.operators import MapTags return MapTags(name_map, drop_unmapped)(self, label=label) # type: ignore @@ -111,7 +111,7 @@ def map_packets( Maps the packets in this stream according to the provided packet_map. If drop_unmapped is True, any packets that are not in the packet_map will be dropped. """ - from orcapod.data.operators import MapPackets + from orcapod.core.operators import MapPackets return MapPackets(name_map, drop_unmapped)(self, label=label) # type: ignore @@ -125,7 +125,7 @@ def batch( Batch stream into fixed-size chunks, each of size batch_size. If drop_last is True, any remaining elements that don't fit into a full batch will be dropped. 
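# A minimal sketch, not part of the patch: the streams.py hunks above show that
# the stream convenience methods are thin wrappers around operators that now
# live under orcapod.core.operators.  `left_stream` and `right_stream` are
# hypothetical Stream objects standing in for real sources; only the
# wrapper/operator equivalences are taken from the diff itself.
from orcapod.core.operators import Batch, Join, MapTags, SemiJoin

joined = left_stream.join(right_stream)        # same as Join()(left_stream, right_stream)
matched = left_stream.semi_join(right_stream)  # same as SemiJoin()(left_stream, right_stream)
renamed = joined.map_tags({"old_tag": "new_tag"}, drop_unmapped=False)
# same as MapTags({"old_tag": "new_tag"}, False)(joined)
chunks = renamed.batch(batch_size=10, drop_last=False)
# same as Batch(batch_size=10, drop_last=False)(renamed)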
""" - from orcapod.data.operators import Batch + from orcapod.core.operators import Batch return Batch(batch_size=batch_size, drop_last=drop_last)(self, label=label) # type: ignore diff --git a/src/orcapod/data/system_constants.py b/src/orcapod/core/system_constants.py similarity index 100% rename from src/orcapod/data/system_constants.py rename to src/orcapod/core/system_constants.py diff --git a/src/orcapod/data/trackers.py b/src/orcapod/core/trackers.py similarity index 98% rename from src/orcapod/data/trackers.py rename to src/orcapod/core/trackers.py index 29fe622..3b27b00 100644 --- a/src/orcapod/data/trackers.py +++ b/src/orcapod/core/trackers.py @@ -1,4 +1,4 @@ -from orcapod.data.base import LabeledContentIdentifiableBase +from orcapod.core.base import LabeledContentIdentifiableBase from orcapod.protocols import data_protocols as dp from collections import defaultdict from collections.abc import Generator @@ -156,7 +156,7 @@ def parents(self) -> tuple["Invocation", ...]: parent_invoctions.append(Invocation(stream.source, stream.upstreams)) else: # import JIT to avoid circular imports - from orcapod.data.sources.base import StreamSource + from orcapod.core.sources.base import StreamSource source = StreamSource(stream) parent_invoctions.append(Invocation(source)) diff --git a/src/orcapod/types.py b/src/orcapod/types.py index 745568e..0f84d9c 100644 --- a/src/orcapod/types.py +++ b/src/orcapod/types.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -DataType: TypeAlias = type | UnionType +DataType: TypeAlias = type | UnionType | list[type] | tuple[type, ...] PythonSchema: TypeAlias = dict[str, DataType] # dict of parameter names to their types From 83bd86a539b5824bd3bbababb164e05bdde64f6f Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sat, 30 Aug 2025 13:53:05 -0700 Subject: [PATCH 216/224] refactor: update data protocols to core protocols --- src/orcapod/core/kernels.py | 2 +- src/orcapod/core/operators/base.py | 2 +- src/orcapod/core/operators/batch.py | 2 +- src/orcapod/core/operators/join.py | 2 +- src/orcapod/core/operators/mappers.py | 2 +- src/orcapod/core/operators/semijoin.py | 2 +- src/orcapod/core/pods.py | 2 +- src/orcapod/core/sources/arrow_table_source.py | 2 +- src/orcapod/core/sources/base.py | 2 +- src/orcapod/core/sources/csv_source.py | 2 +- src/orcapod/core/sources/delta_table_source.py | 2 +- src/orcapod/core/sources/dict_source.py | 2 +- src/orcapod/core/sources/list_source.py | 2 +- src/orcapod/core/sources/manual_table_source.py | 2 +- src/orcapod/core/sources/source_registry.py | 2 +- src/orcapod/core/streams.py | 2 +- src/orcapod/core/trackers.py | 2 +- src/orcapod/databases/delta_lake_databases.py | 2 +- src/orcapod/pipeline/graph.py | 4 ++-- src/orcapod/pipeline/nodes.py | 10 +++++----- .../{data_protocols => core_protocols}/__init__.py | 0 .../{data_protocols => core_protocols}/base.py | 0 .../{data_protocols => core_protocols}/datagrams.py | 0 .../{data_protocols => core_protocols}/kernel.py | 4 ++-- .../{data_protocols => core_protocols}/pods.py | 6 +++--- .../{data_protocols => core_protocols}/source.py | 4 ++-- .../{data_protocols => core_protocols}/streams.py | 6 +++--- .../{data_protocols => core_protocols}/trackers.py | 8 ++++---- src/orcapod/protocols/pipeline_protocols.py | 4 ++-- tests/test_data/test_datagrams/test_arrow_datagram.py | 6 +++--- .../test_data/test_datagrams/test_arrow_tag_packet.py | 4 ++-- .../test_data/test_datagrams/test_base_integration.py | 6 +++--- tests/test_data/test_datagrams/test_dict_datagram.py | 4 ++-- 
tests/test_data/test_datagrams/test_dict_tag_packet.py | 4 ++-- 34 files changed, 53 insertions(+), 53 deletions(-) rename src/orcapod/protocols/{data_protocols => core_protocols}/__init__.py (100%) rename src/orcapod/protocols/{data_protocols => core_protocols}/base.py (100%) rename src/orcapod/protocols/{data_protocols => core_protocols}/datagrams.py (100%) rename src/orcapod/protocols/{data_protocols => core_protocols}/kernel.py (98%) rename src/orcapod/protocols/{data_protocols => core_protocols}/pods.py (97%) rename src/orcapod/protocols/{data_protocols => core_protocols}/source.py (89%) rename src/orcapod/protocols/{data_protocols => core_protocols}/streams.py (98%) rename src/orcapod/protocols/{data_protocols => core_protocols}/trackers.py (96%) diff --git a/src/orcapod/core/kernels.py b/src/orcapod/core/kernels.py index 40bf6b1..4045806 100644 --- a/src/orcapod/core/kernels.py +++ b/src/orcapod/core/kernels.py @@ -2,7 +2,7 @@ from collections.abc import Collection from datetime import datetime, timezone from typing import Any -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp import logging from orcapod.core.streams import KernelStream from orcapod.core.base import LabeledContentIdentifiableBase diff --git a/src/orcapod/core/operators/base.py b/src/orcapod/core/operators/base.py index 01c0123..21eece9 100644 --- a/src/orcapod/core/operators/base.py +++ b/src/orcapod/core/operators/base.py @@ -1,6 +1,6 @@ from ast import Not from orcapod.core.kernels import TrackedKernelBase -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import PythonSchema from abc import abstractmethod from typing import Any diff --git a/src/orcapod/core/operators/batch.py b/src/orcapod/core/operators/batch.py index 284b0ad..653c0f0 100644 --- a/src/orcapod/core/operators/batch.py +++ b/src/orcapod/core/operators/batch.py @@ -1,6 +1,6 @@ from orcapod.core.operators.base import UnaryOperator from collections.abc import Collection -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from orcapod.core.streams import TableStream diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py index 2b454a1..447cbc6 100644 --- a/src/orcapod/core/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -1,4 +1,4 @@ -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.core.streams import TableStream from orcapod.types import PythonSchema from orcapod.utils import types_utils diff --git a/src/orcapod/core/operators/mappers.py b/src/orcapod/core/operators/mappers.py index 7ac127a..24f959b 100644 --- a/src/orcapod/core/operators/mappers.py +++ b/src/orcapod/core/operators/mappers.py @@ -1,4 +1,4 @@ -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.core.streams import TableStream from orcapod.types import PythonSchema from typing import Any, TYPE_CHECKING diff --git a/src/orcapod/core/operators/semijoin.py b/src/orcapod/core/operators/semijoin.py index 8d70756..75abef1 100644 --- a/src/orcapod/core/operators/semijoin.py +++ b/src/orcapod/core/operators/semijoin.py @@ -1,4 +1,4 @@ -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.core.streams 
import TableStream from orcapod.utils import types_utils from orcapod.types import PythonSchema diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/pods.py index 5fd94d4..8ac0545 100644 --- a/src/orcapod/core/pods.py +++ b/src/orcapod/core/pods.py @@ -16,7 +16,7 @@ from orcapod.core.streams import EfficientPodResultStream, LazyPodResultStream from orcapod.core.system_constants import constants from orcapod.hashing.hash_utils import get_function_components, get_function_signature -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.database_protocols import ArrowDatabase from orcapod.types import DataValue, PythonSchema, PythonSchemaLike diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py index e2d548f..1d263a1 100644 --- a/src/orcapod/core/sources/arrow_table_source.py +++ b/src/orcapod/core/sources/arrow_table_source.py @@ -4,7 +4,7 @@ from orcapod.core.streams import TableStream -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from orcapod.core.system_constants import constants diff --git a/src/orcapod/core/sources/base.py b/src/orcapod/core/sources/base.py index c5f9b42..89c0b31 100644 --- a/src/orcapod/core/sources/base.py +++ b/src/orcapod/core/sources/base.py @@ -8,7 +8,7 @@ KernelStream, StatefulStreamBase, ) -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule diff --git a/src/orcapod/core/sources/csv_source.py b/src/orcapod/core/sources/csv_source.py index 97a05de..9f71dec 100644 --- a/src/orcapod/core/sources/csv_source.py +++ b/src/orcapod/core/sources/csv_source.py @@ -4,7 +4,7 @@ from orcapod.core.streams import ( TableStream, ) -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule diff --git a/src/orcapod/core/sources/delta_table_source.py b/src/orcapod/core/sources/delta_table_source.py index fca9dcd..3f2a03f 100644 --- a/src/orcapod/core/sources/delta_table_source.py +++ b/src/orcapod/core/sources/delta_table_source.py @@ -3,7 +3,7 @@ from orcapod.core.streams import TableStream -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import PathLike, PythonSchema from orcapod.utils.lazy_module import LazyModule from pathlib import Path diff --git a/src/orcapod/core/sources/dict_source.py b/src/orcapod/core/sources/dict_source.py index 4c0d324..f96eeba 100644 --- a/src/orcapod/core/sources/dict_source.py +++ b/src/orcapod/core/sources/dict_source.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Any -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils.lazy_module import LazyModule from orcapod.core.system_constants import constants diff --git a/src/orcapod/core/sources/list_source.py b/src/orcapod/core/sources/list_source.py index 53a3b32..699d3ab 100644 --- a/src/orcapod/core/sources/list_source.py +++ b/src/orcapod/core/sources/list_source.py @@ -13,7 +13,7 @@ 
StatefulStreamBase, ) from orcapod.errors import DuplicateTagError -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule diff --git a/src/orcapod/core/sources/manual_table_source.py b/src/orcapod/core/sources/manual_table_source.py index 46e544f..a428695 100644 --- a/src/orcapod/core/sources/manual_table_source.py +++ b/src/orcapod/core/sources/manual_table_source.py @@ -14,7 +14,7 @@ ) from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry from orcapod.errors import DuplicateTagError -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule diff --git a/src/orcapod/core/sources/source_registry.py b/src/orcapod/core/sources/source_registry.py index ea1fe94..66f9bf7 100644 --- a/src/orcapod/core/sources/source_registry.py +++ b/src/orcapod/core/sources/source_registry.py @@ -1,6 +1,6 @@ import logging from collections.abc import Iterator -from orcapod.protocols.data_protocols import Source +from orcapod.protocols.core_protocols import Source logger = logging.getLogger(__name__) diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index b934f60..b190691 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -14,7 +14,7 @@ DictTag, ) from orcapod.core.system_constants import constants -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.types import PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule diff --git a/src/orcapod/core/trackers.py b/src/orcapod/core/trackers.py index 3b27b00..b3e7689 100644 --- a/src/orcapod/core/trackers.py +++ b/src/orcapod/core/trackers.py @@ -1,5 +1,5 @@ from orcapod.core.base import LabeledContentIdentifiableBase -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from collections import defaultdict from collections.abc import Generator from abc import ABC, abstractmethod diff --git a/src/orcapod/databases/delta_lake_databases.py b/src/orcapod/databases/delta_lake_databases.py index 96181ee..e8aba77 100644 --- a/src/orcapod/databases/delta_lake_databases.py +++ b/src/orcapod/databases/delta_lake_databases.py @@ -7,7 +7,7 @@ from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError -from orcapod.data import constants +from orcapod.core import constants from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 783c2ed..14f27a9 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -1,8 +1,8 @@ -from orcapod.data.trackers import GraphTracker, Invocation +from orcapod.core.trackers import GraphTracker, Invocation from orcapod.pipeline.nodes import KernelNode, PodNode from orcapod.protocols.pipeline_protocols import Node from orcapod import contexts -from orcapod.protocols import data_protocols as dp +from orcapod.protocols import core_protocols as dp from orcapod.protocols import database_protocols as dbp from typing import Any from collections.abc import Collection diff --git 
a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index a6e7f0f..97c91bd 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,13 +1,13 @@ from abc import abstractmethod from einops import pack -from orcapod.data.kernels import KernelStream, WrappedKernel -from orcapod.data.sources import SourceBase -from orcapod.data.pods import CachedPod -from orcapod.protocols import data_protocols as dp, database_protocols as dbp +from orcapod.core.kernels import KernelStream, WrappedKernel +from orcapod.core.sources import SourceBase +from orcapod.core.pods import CachedPod +from orcapod.protocols import core_protocols as dp, database_protocols as dbp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING, Any -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants from orcapod.utils import arrow_utils from collections.abc import Collection diff --git a/src/orcapod/protocols/data_protocols/__init__.py b/src/orcapod/protocols/core_protocols/__init__.py similarity index 100% rename from src/orcapod/protocols/data_protocols/__init__.py rename to src/orcapod/protocols/core_protocols/__init__.py diff --git a/src/orcapod/protocols/data_protocols/base.py b/src/orcapod/protocols/core_protocols/base.py similarity index 100% rename from src/orcapod/protocols/data_protocols/base.py rename to src/orcapod/protocols/core_protocols/base.py diff --git a/src/orcapod/protocols/data_protocols/datagrams.py b/src/orcapod/protocols/core_protocols/datagrams.py similarity index 100% rename from src/orcapod/protocols/data_protocols/datagrams.py rename to src/orcapod/protocols/core_protocols/datagrams.py diff --git a/src/orcapod/protocols/data_protocols/kernel.py b/src/orcapod/protocols/core_protocols/kernel.py similarity index 98% rename from src/orcapod/protocols/data_protocols/kernel.py rename to src/orcapod/protocols/core_protocols/kernel.py index a4f5ea2..842d7af 100644 --- a/src/orcapod/protocols/data_protocols/kernel.py +++ b/src/orcapod/protocols/core_protocols/kernel.py @@ -3,8 +3,8 @@ from typing import Any, Protocol, runtime_checkable from orcapod.protocols.hashing_protocols import ContentIdentifiable from orcapod.types import PythonSchema -from orcapod.protocols.data_protocols.base import Labelable -from orcapod.protocols.data_protocols.streams import Stream, LiveStream +from orcapod.protocols.core_protocols.base import Labelable +from orcapod.protocols.core_protocols.streams import Stream, LiveStream @runtime_checkable diff --git a/src/orcapod/protocols/data_protocols/pods.py b/src/orcapod/protocols/core_protocols/pods.py similarity index 97% rename from src/orcapod/protocols/data_protocols/pods.py rename to src/orcapod/protocols/core_protocols/pods.py index 5c04f5c..4546e36 100644 --- a/src/orcapod/protocols/data_protocols/pods.py +++ b/src/orcapod/protocols/core_protocols/pods.py @@ -1,8 +1,8 @@ from typing import TYPE_CHECKING, Protocol, runtime_checkable -from orcapod.protocols.data_protocols.base import ExecutionEngine -from orcapod.protocols.data_protocols.datagrams import Packet, Tag -from orcapod.protocols.data_protocols.kernel import Kernel +from orcapod.protocols.core_protocols.base import ExecutionEngine +from orcapod.protocols.core_protocols.datagrams import Packet, Tag +from orcapod.protocols.core_protocols.kernel import Kernel from orcapod.types import PythonSchema if TYPE_CHECKING: diff --git a/src/orcapod/protocols/data_protocols/source.py 
b/src/orcapod/protocols/core_protocols/source.py similarity index 89% rename from src/orcapod/protocols/data_protocols/source.py rename to src/orcapod/protocols/core_protocols/source.py index 6af8f13..e94f336 100644 --- a/src/orcapod/protocols/data_protocols/source.py +++ b/src/orcapod/protocols/core_protocols/source.py @@ -1,7 +1,7 @@ from typing import Protocol, runtime_checkable -from orcapod.protocols.data_protocols.kernel import Kernel -from orcapod.protocols.data_protocols.streams import Stream +from orcapod.protocols.core_protocols.kernel import Kernel +from orcapod.protocols.core_protocols.streams import Stream @runtime_checkable diff --git a/src/orcapod/protocols/data_protocols/streams.py b/src/orcapod/protocols/core_protocols/streams.py similarity index 98% rename from src/orcapod/protocols/data_protocols/streams.py rename to src/orcapod/protocols/core_protocols/streams.py index 1a9dc25..aea9dfd 100644 --- a/src/orcapod/protocols/data_protocols/streams.py +++ b/src/orcapod/protocols/core_protocols/streams.py @@ -2,8 +2,8 @@ from datetime import datetime from typing import TYPE_CHECKING, Protocol, runtime_checkable -from orcapod.protocols.data_protocols.base import ExecutionEngine, Labelable -from orcapod.protocols.data_protocols.datagrams import Packet, Tag +from orcapod.protocols.core_protocols.base import ExecutionEngine, Labelable +from orcapod.protocols.core_protocols.datagrams import Packet, Tag from orcapod.protocols.hashing_protocols import ContentIdentifiable from orcapod.types import PythonSchema @@ -11,7 +11,7 @@ import polars as pl import pyarrow as pa import pandas as pd - from orcapod.protocols.data_protocols.kernel import Kernel + from orcapod.protocols.core_protocols.kernel import Kernel @runtime_checkable diff --git a/src/orcapod/protocols/data_protocols/trackers.py b/src/orcapod/protocols/core_protocols/trackers.py similarity index 96% rename from src/orcapod/protocols/data_protocols/trackers.py rename to src/orcapod/protocols/core_protocols/trackers.py index 0e983db..7bc9a1e 100644 --- a/src/orcapod/protocols/data_protocols/trackers.py +++ b/src/orcapod/protocols/core_protocols/trackers.py @@ -1,9 +1,9 @@ from typing import Protocol, runtime_checkable from contextlib import AbstractContextManager -from orcapod.protocols.data_protocols.kernel import Kernel -from orcapod.protocols.data_protocols.pods import Pod -from orcapod.protocols.data_protocols.source import Source -from orcapod.protocols.data_protocols.streams import Stream +from orcapod.protocols.core_protocols.kernel import Kernel +from orcapod.protocols.core_protocols.pods import Pod +from orcapod.protocols.core_protocols.source import Source +from orcapod.protocols.core_protocols.streams import Stream @runtime_checkable diff --git a/src/orcapod/protocols/pipeline_protocols.py b/src/orcapod/protocols/pipeline_protocols.py index 725bbe7..4e0ce73 100644 --- a/src/orcapod/protocols/pipeline_protocols.py +++ b/src/orcapod/protocols/pipeline_protocols.py @@ -1,7 +1,7 @@ # Protocols for pipeline and nodes from typing import Protocol, runtime_checkable, TYPE_CHECKING -from orcapod.protocols.data_protocols.source import Source -from orcapod.protocols.data_protocols.pods import CachedPod +from orcapod.protocols.core_protocols.source import Source +from orcapod.protocols.core_protocols.pods import CachedPod if TYPE_CHECKING: diff --git a/tests/test_data/test_datagrams/test_arrow_datagram.py b/tests/test_data/test_datagrams/test_arrow_datagram.py index 304bace..d23a4fd 100644 --- 
a/tests/test_data/test_datagrams/test_arrow_datagram.py +++ b/tests/test_data/test_datagrams/test_arrow_datagram.py @@ -18,9 +18,9 @@ import pyarrow as pa from datetime import datetime, date -from orcapod.data.datagrams import ArrowDatagram -from orcapod.data.system_constants import constants -from orcapod.protocols.data_protocols import Datagram +from orcapod.core.datagrams import ArrowDatagram +from orcapod.core.system_constants import constants +from orcapod.protocols.core_protocols import Datagram from orcapod.protocols.hashing_protocols import ContentHash diff --git a/tests/test_data/test_datagrams/test_arrow_tag_packet.py b/tests/test_data/test_datagrams/test_arrow_tag_packet.py index 04844cc..3154bdc 100644 --- a/tests/test_data/test_datagrams/test_arrow_tag_packet.py +++ b/tests/test_data/test_datagrams/test_arrow_tag_packet.py @@ -13,8 +13,8 @@ import pyarrow as pa from datetime import datetime, date -from orcapod.data.datagrams import ArrowTag, ArrowPacket -from orcapod.data.system_constants import constants +from orcapod.core.datagrams import ArrowTag, ArrowPacket +from orcapod.core.system_constants import constants class TestArrowTagInitialization: diff --git a/tests/test_data/test_datagrams/test_base_integration.py b/tests/test_data/test_datagrams/test_base_integration.py index d0bd19f..896a60f 100644 --- a/tests/test_data/test_datagrams/test_base_integration.py +++ b/tests/test_data/test_datagrams/test_base_integration.py @@ -11,7 +11,7 @@ import pytest import pyarrow as pa -from orcapod.data.datagrams import ( +from orcapod.core.datagrams import ( DictDatagram, ArrowDatagram, DictTag, @@ -19,12 +19,12 @@ ArrowTag, ArrowPacket, ) -from orcapod.data.datagrams.base import ( +from orcapod.core.datagrams.base import ( BaseDatagram, ImmutableDict, contains_prefix_from, ) -from orcapod.data.system_constants import constants +from orcapod.core.system_constants import constants class TestImmutableDict: diff --git a/tests/test_data/test_datagrams/test_dict_datagram.py b/tests/test_data/test_datagrams/test_dict_datagram.py index 3a1d40d..5538d59 100644 --- a/tests/test_data/test_datagrams/test_dict_datagram.py +++ b/tests/test_data/test_datagrams/test_dict_datagram.py @@ -15,8 +15,8 @@ import pytest import pyarrow as pa -from orcapod.data.datagrams import DictDatagram -from orcapod.data.system_constants import constants +from orcapod.core.datagrams import DictDatagram +from orcapod.core.system_constants import constants class TestDictDatagramInitialization: diff --git a/tests/test_data/test_datagrams/test_dict_tag_packet.py b/tests/test_data/test_datagrams/test_dict_tag_packet.py index 57933c2..a255f79 100644 --- a/tests/test_data/test_datagrams/test_dict_tag_packet.py +++ b/tests/test_data/test_datagrams/test_dict_tag_packet.py @@ -10,8 +10,8 @@ import pytest -from orcapod.data.datagrams import DictTag, DictPacket -from orcapod.data.system_constants import constants +from orcapod.core.datagrams import DictTag, DictPacket +from orcapod.core.system_constants import constants class TestDictTagInitialization: From 6dff27b2bd5f764f5f2aac29fc4889e1e0847156 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sat, 30 Aug 2025 23:11:22 -0700 Subject: [PATCH 217/224] fix: tag handling bugs and refactor to use core protocol --- src/orcapod/core/arrow_data_utils.py | 49 +- src/orcapod/core/kernels.py | 34 +- src/orcapod/core/operators/base.py | 63 +- src/orcapod/core/operators/batch.py | 16 +- src/orcapod/core/operators/join.py | 26 +- src/orcapod/core/operators/mappers.py | 18 +- src/orcapod/core/operators/semijoin.py | 14 +- src/orcapod/core/pods.py | 110 ++-- .../core/sources/arrow_table_source.py | 39 +- src/orcapod/core/sources/base.py | 48 +- src/orcapod/core/sources/csv_source.py | 4 +- .../core/sources/delta_table_source.py | 6 +- src/orcapod/core/sources/dict_source.py | 4 +- src/orcapod/core/sources/list_source.py | 6 +- .../core/sources/manual_table_source.py | 23 +- src/orcapod/core/streams.py | 183 +++--- src/orcapod/core/system_constants.py | 14 +- src/orcapod/core/trackers.py | 52 +- src/orcapod/hashing/hash_utils.py | 560 +++++++++--------- src/orcapod/pipeline/graph.py | 69 ++- src/orcapod/pipeline/nodes.py | 45 +- 21 files changed, 746 insertions(+), 637 deletions(-) diff --git a/src/orcapod/core/arrow_data_utils.py b/src/orcapod/core/arrow_data_utils.py index 2443cdb..7194208 100644 --- a/src/orcapod/core/arrow_data_utils.py +++ b/src/orcapod/core/arrow_data_utils.py @@ -25,7 +25,7 @@ def drop_columns_with_prefix( def drop_system_columns( - table, + table: "pa.Table", system_column_prefix: tuple[str, ...] = ( constants.META_PREFIX, constants.DATAGRAM_PREFIX, @@ -34,6 +34,53 @@ def drop_system_columns( return drop_columns_with_prefix(table, system_column_prefix) +def get_system_columns(table: "pa.Table") -> "pa.Table": + """Get system columns from an Arrow table.""" + return table.select( + [ + col + for col in table.column_names + if col.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + + +def add_system_tag_column( + table: "pa.Table", + system_tag_column_name: str, + system_tag_values: str | Collection[str], +) -> "pa.Table": + """Add a system tags column to an Arrow table.""" + if not table.column_names: + raise ValueError("Table is empty") + if isinstance(system_tag_values, str): + system_tag_values = [system_tag_values] * table.num_rows + else: + system_tag_values = list(system_tag_values) + if len(system_tag_values) != table.num_rows: + raise ValueError( + "Length of system_tag_values must match number of rows in the table." 
+ ) + if not system_tag_column_name.startswith(constants.SYSTEM_TAG_PREFIX): + system_tag_column_name = ( + f"{constants.SYSTEM_TAG_PREFIX}{system_tag_column_name}" + ) + tags_column = pa.array(system_tag_values, type=pa.large_string()) + return table.append_column(system_tag_column_name, tags_column) + + +def append_to_system_tags(table: "pa.Table", value: str) -> "pa.Table": + """Append a value to the system tags column in an Arrow table.""" + if not table.column_names: + raise ValueError("Table is empty") + + column_name_map = { + c: f"{c}:{value}" if c.startswith(constants.SYSTEM_TAG_PREFIX) else c + for c in table.column_names + } + return table.rename_columns(column_name_map) + + def add_source_info( table: "pa.Table", source_info: str | Collection[str] | None, diff --git a/src/orcapod/core/kernels.py b/src/orcapod/core/kernels.py index 4045806..52e1f8c 100644 --- a/src/orcapod/core/kernels.py +++ b/src/orcapod/core/kernels.py @@ -2,7 +2,7 @@ from collections.abc import Collection from datetime import datetime, timezone from typing import Any -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp import logging from orcapod.core.streams import KernelStream from orcapod.core.base import LabeledContentIdentifiableBase @@ -28,7 +28,7 @@ def __init__( self, label: str | None = None, skip_tracking: bool = False, - tracker_manager: dp.TrackerManager | None = None, + tracker_manager: cp.TrackerManager | None = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -79,7 +79,7 @@ def _set_modified_time( @abstractmethod def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: """ Return the output types of the kernel given the input streams. @@ -87,7 +87,7 @@ def kernel_output_types( ... def output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) @@ -97,7 +97,7 @@ def output_types( @abstractmethod def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: """ Identity structure for this kernel. Input stream(s), if present, have already been preprocessed @@ -105,7 +105,7 @@ def kernel_identity_structure( """ ... - def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> Any: + def identity_structure(self, streams: Collection[cp.Stream] | None = None) -> Any: """ Default implementation of identity_structure for the kernel only concerns the kernel class and the streams if present. Subclasses of @@ -128,14 +128,14 @@ def identity_structure(self, streams: Collection[dp.Stream] | None = None) -> An return self.kernel_identity_structure(streams) @abstractmethod - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Trigger the main computation of the kernel on a collection of streams. This method is called when the kernel is invoked with a collection of streams. 
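# A minimal usage sketch, not part of the patch, for the system-tag helpers
# added to arrow_data_utils.py above.  The table contents are made up; the
# behaviour noted in the comments (prefixing the column name with
# constants.SYSTEM_TAG_PREFIX, renaming system-tag columns to "<name>:<value>")
# is read directly from the function bodies in this hunk.
import pyarrow as pa

from orcapod.core import arrow_data_utils
from orcapod.core.system_constants import constants

table = pa.table({"x": [1, 2, 3]})

# A single string is broadcast to every row and stored under a column named
# f"{constants.SYSTEM_TAG_PREFIX}origin".
tagged = arrow_data_utils.add_system_tag_column(table, "origin", "manual")

# Every system-tag column "<name>" is renamed to "<name>:v2"; regular columns
# are left untouched.
tagged = arrow_data_utils.append_to_system_tags(tagged, "v2")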
Subclasses should override this method to provide the kernel with its unique behavior """ - def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_kernel_processing(self, *streams: cp.Stream) -> tuple[cp.Stream, ...]: """ Pre-processing step that can be overridden by subclasses to perform any necessary pre-processing on the input streams before the main computation. This is useful if you need to modify the input streams @@ -146,14 +146,14 @@ def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: return streams @abstractmethod - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: """ Validate the input streams before the main computation but after the pre-kernel processing """ ... def prepare_output_stream( - self, *streams: dp.Stream, label: str | None = None + self, *streams: cp.Stream, label: str | None = None ) -> KernelStream: """ Prepare the output stream for the kernel invocation. @@ -162,7 +162,7 @@ def prepare_output_stream( """ return KernelStream(source=self, upstreams=streams, label=label) - def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: + def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: """ Track the invocation of the kernel with the provided streams. This is a convenience method that calls record_kernel_invocation. @@ -171,7 +171,7 @@ def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> Non self._tracker_manager.record_kernel_invocation(self, streams, label=label) def __call__( - self, *streams: dp.Stream, label: str | None = None, **kwargs + self, *streams: cp.Stream, label: str | None = None, **kwargs ) -> KernelStream: processed_streams = self.pre_kernel_processing(*streams) self.validate_inputs(*processed_streams) @@ -199,7 +199,7 @@ class WrappedKernel(TrackedKernelBase): `Kernel` protocol. Refer to `orcapod.protocols.data_protocols.Kernel` for more details. 
""" - def __init__(self, kernel: dp.Kernel, **kwargs) -> None: + def __init__(self, kernel: cp.Kernel, **kwargs) -> None: # TODO: handle fixed input stream already set on the kernel super().__init__(**kwargs) self.kernel = kernel @@ -217,21 +217,21 @@ def reference(self) -> tuple[str, ...]: return self.kernel.reference def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: return self.kernel.output_types( *streams, include_system_tags=include_system_tags ) def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: return self.kernel.identity_structure(streams) - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: return self.kernel.validate_inputs(*streams) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: return self.kernel.forward(*streams) def __repr__(self): diff --git a/src/orcapod/core/operators/base.py b/src/orcapod/core/operators/base.py index 21eece9..b87748c 100644 --- a/src/orcapod/core/operators/base.py +++ b/src/orcapod/core/operators/base.py @@ -1,6 +1,5 @@ -from ast import Not from orcapod.core.kernels import TrackedKernelBase -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import PythonSchema from abc import abstractmethod from typing import Any @@ -24,7 +23,7 @@ class UnaryOperator(Operator): def check_unary_input( self, - streams: Collection[dp.Stream], + streams: Collection[cp.Stream], ) -> None: """ Check that the inputs to the unary operator are valid. @@ -32,12 +31,12 @@ def check_unary_input( if len(streams) != 1: raise ValueError("UnaryOperator requires exactly one input stream.") - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: self.check_unary_input(streams) stream = streams[0] return self.op_validate_inputs(stream) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Forward method for unary operators. It expects exactly one stream as input. @@ -62,13 +61,13 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: # return output_substreams[0] def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: stream = streams[0] return self.op_output_types(stream, include_system_tags=include_system_tags) def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: """ Return a structure that represents the identity of this operator. @@ -76,11 +75,11 @@ def kernel_identity_structure( """ if streams is not None: stream = list(streams)[0] - self.op_identity_structure(stream) + return self.op_identity_structure(stream) return self.op_identity_structure() @abstractmethod - def op_validate_inputs(self, stream: dp.Stream) -> None: + def op_validate_inputs(self, stream: cp.Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. 
@@ -88,7 +87,7 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: ... @abstractmethod - def op_forward(self, stream: dp.Stream) -> dp.Stream: + def op_forward(self, stream: cp.Stream) -> cp.Stream: """ This method should be implemented by subclasses to define the specific behavior of the binary operator. It takes two streams as input and returns a new stream as output. @@ -97,7 +96,7 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: @abstractmethod def op_output_types( - self, stream: dp.Stream, include_system_tags: bool = False + self, stream: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. @@ -106,7 +105,7 @@ def op_output_types( ... @abstractmethod - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: """ This method should be implemented by subclasses to return a structure that represents the identity of the operator. It takes two streams as input and returns a tuple containing the operator name and a set of streams. @@ -121,7 +120,7 @@ class BinaryOperator(Operator): def check_binary_inputs( self, - streams: Collection[dp.Stream], + streams: Collection[cp.Stream], ) -> None: """ Check that the inputs to the binary operator are valid. @@ -130,12 +129,12 @@ def check_binary_inputs( if len(streams) != 2: raise ValueError("BinaryOperator requires exactly two input streams.") - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: self.check_binary_inputs(streams) left_stream, right_stream = streams return self.op_validate_inputs(left_stream, right_stream) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Forward method for binary operators. It expects exactly two streams as input. @@ -144,7 +143,7 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: return self.op_forward(left_stream, right_stream) def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: left_stream, right_stream = streams return self.op_output_types( @@ -152,7 +151,7 @@ def kernel_output_types( ) def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: """ Return a structure that represents the identity of this operator. @@ -165,7 +164,7 @@ def kernel_identity_structure( @abstractmethod def op_validate_inputs( - self, left_stream: dp.Stream, right_stream: dp.Stream + self, left_stream: cp.Stream, right_stream: cp.Stream ) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. @@ -174,7 +173,7 @@ def op_validate_inputs( ... @abstractmethod - def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: + def op_forward(self, left_stream: cp.Stream, right_stream: cp.Stream) -> cp.Stream: """ This method should be implemented by subclasses to define the specific behavior of the binary operator. It takes two streams as input and returns a new stream as output. 
@@ -184,8 +183,8 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre @abstractmethod def op_output_types( self, - left_stream: dp.Stream, - right_stream: dp.Stream, + left_stream: cp.Stream, + right_stream: cp.Stream, include_system_tags: bool = False, ) -> tuple[PythonSchema, PythonSchema]: """ @@ -197,8 +196,8 @@ def op_output_types( @abstractmethod def op_identity_structure( self, - left_stream: dp.Stream | None = None, - right_stream: dp.Stream | None = None, + left_stream: cp.Stream | None = None, + right_stream: cp.Stream | None = None, ) -> Any: """ This method should be implemented by subclasses to return a structure that represents the identity of the operator. @@ -216,7 +215,7 @@ class NonZeroInputOperator(Operator): def verify_non_zero_input( self, - streams: Collection[dp.Stream], + streams: Collection[cp.Stream], ) -> None: """ Check that the inputs to the variable inputs operator are valid. @@ -227,11 +226,11 @@ def verify_non_zero_input( f"Operator {self.__class__.__name__} requires at least one input stream." ) - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: self.verify_non_zero_input(streams) return self.op_validate_inputs(*streams) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Forward method for variable inputs operators. It expects at least one stream as input. @@ -239,12 +238,12 @@ def forward(self, *streams: dp.Stream) -> dp.Stream: return self.op_forward(*streams) def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: return self.op_output_types(*streams, include_system_tags=include_system_tags) def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: """ Return a structure that represents the identity of this operator. @@ -253,7 +252,7 @@ def kernel_identity_structure( return self.op_identity_structure(streams) @abstractmethod - def op_validate_inputs(self, *streams: dp.Stream) -> None: + def op_validate_inputs(self, *streams: cp.Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -261,7 +260,7 @@ def op_validate_inputs(self, *streams: dp.Stream) -> None: ... @abstractmethod - def op_forward(self, *streams: dp.Stream) -> dp.Stream: + def op_forward(self, *streams: cp.Stream) -> cp.Stream: """ This method should be implemented by subclasses to define the specific behavior of the non-zero input operator. It takes variable number of streams as input and returns a new stream as output. @@ -270,7 +269,7 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream: @abstractmethod def op_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. 
@@ -280,7 +279,7 @@ def op_output_types( @abstractmethod def op_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: """ This method should be implemented by subclasses to return a structure that represents the identity of the operator. diff --git a/src/orcapod/core/operators/batch.py b/src/orcapod/core/operators/batch.py index 653c0f0..b4323b8 100644 --- a/src/orcapod/core/operators/batch.py +++ b/src/orcapod/core/operators/batch.py @@ -1,6 +1,6 @@ from orcapod.core.operators.base import UnaryOperator from collections.abc import Collection -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from typing import Any, TYPE_CHECKING from orcapod.utils.lazy_module import LazyModule from orcapod.core.streams import TableStream @@ -31,7 +31,7 @@ def __init__(self, batch_size: int = 0, drop_last_batch: bool = False, **kwargs) def check_unary_input( self, - streams: Collection[dp.Stream], + streams: Collection[cp.Stream], ) -> None: """ Check that the inputs to the unary operator are valid. @@ -39,19 +39,19 @@ def check_unary_input( if len(streams) != 1: raise ValueError("UnaryOperator requires exactly one input stream.") - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: self.check_unary_input(streams) stream = streams[0] return self.op_validate_inputs(stream) - def op_validate_inputs(self, stream: dp.Stream) -> None: + def op_validate_inputs(self, stream: cp.Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. """ return None - def op_forward(self, stream: dp.Stream) -> dp.Stream: + def op_forward(self, stream: cp.Stream) -> cp.Stream: """ This method should be implemented by subclasses to define the specific behavior of the binary operator. It takes two streams as input and returns a new stream as output. @@ -84,20 +84,20 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: return TableStream(batched_table, tag_columns=tag_columns) def op_output_types( - self, stream: dp.Stream, include_system_tags: bool = False + self, stream: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: """ This method should be implemented by subclasses to return the typespecs of the input and output streams. It takes two streams as input and returns a tuple of typespecs. 
""" - tag_types, packet_types = stream.types() + tag_types, packet_types = stream.types(include_system_tags=include_system_tags) batched_tag_types = {k: list[v] for k, v in tag_types.items()} batched_packet_types = {k: list[v] for k, v in packet_types.items()} # TODO: check if this is really necessary return PythonSchema(batched_tag_types), PythonSchema(batched_packet_types) - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: return ( (self.__class__.__name__, self.batch_size, self.drop_last_batch) + (stream,) if stream is not None diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py index 447cbc6..95b0371 100644 --- a/src/orcapod/core/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -1,4 +1,4 @@ -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.core.streams import TableStream from orcapod.types import PythonSchema from orcapod.utils import types_utils @@ -7,6 +7,7 @@ from collections.abc import Collection from orcapod.errors import InputValidationError from orcapod.core.operators.base import NonZeroInputOperator +from orcapod.core import arrow_data_utils if TYPE_CHECKING: import pyarrow as pa @@ -25,20 +26,26 @@ def kernel_id(self) -> tuple[str, ...]: """ return (f"{self.__class__.__name__}",) - def op_validate_inputs(self, *streams: dp.Stream) -> None: + def op_validate_inputs(self, *streams: cp.Stream) -> None: try: self.op_output_types(*streams) except Exception as e: # raise InputValidationError(f"Input streams are not compatible: {e}") from e raise e + def order_input_streams(self, *streams: cp.Stream) -> list[cp.Stream]: + # order the streams based on their hashes to offer deterministic operation + return sorted(streams, key=lambda s: s.content_hash().to_hex()) + def op_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: if len(streams) == 1: # If only one stream is provided, return its typespecs return streams[0].types(include_system_tags=include_system_tags) + # output type computation does NOT require consistent ordering of streams + # TODO: consider performing the check always with system tags on stream = streams[0] tag_typespec, packet_typespec = stream.types( @@ -62,7 +69,7 @@ def op_output_types( return tag_typespec, packet_typespec - def op_forward(self, *streams: dp.Stream) -> dp.Stream: + def op_forward(self, *streams: cp.Stream) -> cp.Stream: """ Joins two streams together based on their tags. The resulting stream will contain all the tags from both streams. 
@@ -78,12 +85,21 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream:
         table = stream.as_table(include_source=True, include_system_tags=True)
         # trick to get cartesian product
         table = table.add_column(0, COMMON_JOIN_KEY, pa.array([0] * len(table)))
+        N_CHAR = 12
+        table = arrow_data_utils.append_to_system_tags(
+            table, stream.content_hash().to_hex(char_count=N_CHAR)
+        )
 
         for next_stream in streams[1:]:
             next_tag_keys, _ = next_stream.keys()
             next_table = next_stream.as_table(
                 include_source=True, include_system_tags=True
             )
+            next_table = arrow_data_utils.append_to_system_tags(
+                next_table, next_stream.content_hash().to_hex(char_count=N_CHAR)
+            )
+            # trick to ensure that there will always be at least one shared key
+            # this ensures that no overlap in keys leads to a full cartesian product
             next_table = next_table.add_column(
                 0, COMMON_JOIN_KEY, pa.array([0] * len(next_table))
             )
@@ -112,7 +128,7 @@ def op_forward(self, *streams: dp.Stream) -> dp.Stream:
         )
 
     def op_identity_structure(
-        self, streams: Collection[dp.Stream] | None = None
+        self, streams: Collection[cp.Stream] | None = None
     ) -> Any:
         return (
             (self.__class__.__name__,) + (set(streams),) if streams is not None else ()
diff --git a/src/orcapod/core/operators/mappers.py b/src/orcapod/core/operators/mappers.py
index 24f959b..3806080 100644
--- a/src/orcapod/core/operators/mappers.py
+++ b/src/orcapod/core/operators/mappers.py
@@ -1,4 +1,4 @@
-from orcapod.protocols import core_protocols as dp
+from orcapod.protocols import core_protocols as cp
 from orcapod.core.streams import TableStream
 from orcapod.types import PythonSchema
 from typing import Any, TYPE_CHECKING
@@ -28,7 +28,7 @@ def __init__(
         self.drop_unmapped = drop_unmapped
         super().__init__(**kwargs)
 
-    def op_forward(self, stream: dp.Stream) -> dp.Stream:
+    def op_forward(self, stream: cp.Stream) -> cp.Stream:
         tag_columns, packet_columns = stream.keys()
 
         unmapped_columns = set(packet_columns) - set(self.name_map.keys())
@@ -68,7 +68,7 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream:
             renamed_table, tag_columns=tag_columns, source=self, upstreams=(stream,)
         )
 
-    def op_validate_inputs(self, stream: dp.Stream) -> None:
+    def op_validate_inputs(self, stream: cp.Stream) -> None:
         """
         This method should be implemented by subclasses to validate the inputs to the operator.
         It takes two streams as input and raises an error if the inputs are not valid.
@@ -96,7 +96,7 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: raise InputValidationError(message) def op_output_types( - self, stream: dp.Stream, include_system_tags: bool = False + self, stream: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: tag_typespec, packet_typespec = stream.types( include_system_tags=include_system_tags @@ -109,7 +109,7 @@ def op_output_types( return tag_typespec, new_packet_typespec - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: return ( self.__class__.__name__, self.name_map, @@ -131,7 +131,7 @@ def __init__( self.drop_unmapped = drop_unmapped super().__init__(**kwargs) - def op_forward(self, stream: dp.Stream) -> dp.Stream: + def op_forward(self, stream: cp.Stream) -> cp.Stream: tag_columns, packet_columns = stream.keys() missing_tags = set(tag_columns) - set(self.name_map.keys()) @@ -158,7 +158,7 @@ def op_forward(self, stream: dp.Stream) -> dp.Stream: renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) ) - def op_validate_inputs(self, stream: dp.Stream) -> None: + def op_validate_inputs(self, stream: cp.Stream) -> None: """ This method should be implemented by subclasses to validate the inputs to the operator. It takes two streams as input and raises an error if the inputs are not valid. @@ -184,7 +184,7 @@ def op_validate_inputs(self, stream: dp.Stream) -> None: raise InputValidationError(message) def op_output_types( - self, stream: dp.Stream, include_system_tags: bool = False + self, stream: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: tag_typespec, packet_typespec = stream.types( include_system_tags=include_system_tags @@ -195,7 +195,7 @@ def op_output_types( return new_tag_typespec, packet_typespec - def op_identity_structure(self, stream: dp.Stream | None = None) -> Any: + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: return ( self.__class__.__name__, self.name_map, diff --git a/src/orcapod/core/operators/semijoin.py b/src/orcapod/core/operators/semijoin.py index 75abef1..6cdff4c 100644 --- a/src/orcapod/core/operators/semijoin.py +++ b/src/orcapod/core/operators/semijoin.py @@ -1,4 +1,4 @@ -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.core.streams import TableStream from orcapod.utils import types_utils from orcapod.types import PythonSchema @@ -37,8 +37,8 @@ def kernel_id(self) -> tuple[str, ...]: def op_identity_structure( self, - left_stream: dp.Stream | None = None, - right_stream: dp.Stream | None = None, + left_stream: cp.Stream | None = None, + right_stream: cp.Stream | None = None, ) -> Any: """ Return a structure that represents the identity of this operator. @@ -50,7 +50,7 @@ def op_identity_structure( id_struct += (left_stream, right_stream) return id_struct - def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stream: + def op_forward(self, left_stream: cp.Stream, right_stream: cp.Stream) -> cp.Stream: """ Performs a semi-join between left and right streams. Returns entries from left stream that have matching entries in right stream. 
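The semi-join semantics described above (keep left entries that have a match on the right, without pulling in right-hand columns) can be illustrated directly with pyarrow's "left semi" join type; this is a generic example, not the SemiJoin class's own code:

import pyarrow as pa

left = pa.table({"subject": ["s1", "s2", "s3"], "score": [0.1, 0.5, 0.9]})
right = pa.table({"subject": ["s2", "s3"], "extra": [1, 2]})

# only left rows whose key also appears on the right survive; no right columns are carried over
kept = left.join(right, keys="subject", join_type="left semi")
print(kept.column_names)  # ['subject', 'score']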
@@ -98,8 +98,8 @@ def op_forward(self, left_stream: dp.Stream, right_stream: dp.Stream) -> dp.Stre def op_output_types( self, - left_stream: dp.Stream, - right_stream: dp.Stream, + left_stream: cp.Stream, + right_stream: cp.Stream, include_system_tags: bool = False, ) -> tuple[PythonSchema, PythonSchema]: """ @@ -110,7 +110,7 @@ def op_output_types( return left_stream.types(include_system_tags=include_system_tags) def op_validate_inputs( - self, left_stream: dp.Stream, right_stream: dp.Stream + self, left_stream: cp.Stream, right_stream: cp.Stream ) -> None: """ Validates that the input streams are compatible for semi-join. diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/pods.py index 8ac0545..5593b16 100644 --- a/src/orcapod/core/pods.py +++ b/src/orcapod/core/pods.py @@ -16,7 +16,7 @@ from orcapod.core.streams import EfficientPodResultStream, LazyPodResultStream from orcapod.core.system_constants import constants from orcapod.hashing.hash_utils import get_function_components, get_function_signature -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.protocols import hashing_protocols as hp from orcapod.protocols.database_protocols import ArrowDatabase from orcapod.types import DataValue, PythonSchema, PythonSchemaLike @@ -84,7 +84,7 @@ def version(self) -> str: return self._version @abstractmethod - def get_record_id(self, packet: dp.Packet, execution_engine_hash: str) -> str: + def get_record_id(self, packet: cp.Packet, execution_engine_hash: str) -> str: """ Return the record ID for the input packet. This is used to identify the pod in the system. """ @@ -127,7 +127,7 @@ def major_version(self) -> int: return self._major_version def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: """ Return the input and output typespecs for the pod. @@ -149,7 +149,7 @@ def set_active(self, active: bool) -> None: self._active = active @staticmethod - def _join_streams(*streams: dp.Stream) -> dp.Stream: + def _join_streams(*streams: cp.Stream) -> cp.Stream: if not streams: raise ValueError("No streams provided for joining") # Join the streams using a suitable join strategy @@ -161,7 +161,7 @@ def _join_streams(*streams: dp.Stream) -> dp.Stream: joined_stream = Join()(joined_stream, next_stream) return joined_stream - def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: + def pre_kernel_processing(self, *streams: cp.Stream) -> tuple[cp.Stream, ...]: """ Prepare the incoming streams for execution in the pod. At least one stream must be present. If more than one stream is present, the join of the provided streams will be returned. 
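Because pre_kernel_processing folds its inputs through Join, invoking a pod on several streams is intended to be equivalent to joining them first. A sketch with hypothetical names (my_pod, stream_x, stream_y), using the fluent join helper added later in this patch:

out_a = my_pod(stream_x, stream_y)       # inputs joined internally by pre_kernel_processing
out_b = my_pod(stream_x.join(stream_y))  # explicit join first; same intended result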
@@ -174,7 +174,7 @@ def pre_kernel_processing(self, *streams: dp.Stream) -> tuple[dp.Stream, ...]: output_stream = self._join_streams(*streams) return (output_stream,) - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: if len(streams) != 1: raise ValueError( f"{self.__class__.__name__} expects exactly one input stream, got {len(streams)}" @@ -190,33 +190,33 @@ def validate_inputs(self, *streams: dp.Stream) -> None: ) def prepare_output_stream( - self, *streams: dp.Stream, label: str | None = None + self, *streams: cp.Stream, label: str | None = None ) -> KernelStream: return KernelStream(source=self, upstreams=streams, label=label) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" return LazyPodResultStream(pod=self, prepared_stream=streams[0]) @abstractmethod def call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, - ) -> tuple[dp.Tag, dp.Packet | None]: ... + execution_engine: cp.ExecutionEngine | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: ... @abstractmethod async def async_call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, - ) -> tuple[dp.Tag, dp.Packet | None]: ... + execution_engine: cp.ExecutionEngine | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: ... - def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: + def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: if not self._skip_tracking and self._tracker_manager is not None: self._tracker_manager.record_pod_invocation(self, streams, label=label) @@ -276,7 +276,7 @@ def decorator(func) -> FunctionPod: class FunctionPod(ActivatablePodBase): def __init__( self, - function: dp.PodFunction, + function: cp.PodFunction, output_keys: str | Collection[str] | None = None, function_name=None, version: str = "v0.0", @@ -356,7 +356,7 @@ def reference(self) -> tuple[str, ...]: def get_record_id( self, - packet: dp.Packet, + packet: cp.Packet, execution_engine_hash: str, ) -> str: return combine_hashes( @@ -394,11 +394,11 @@ def __str__(self) -> str: def call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, - ) -> tuple[dp.Tag, DictPacket | None]: + execution_engine: cp.ExecutionEngine | None = None, + ) -> tuple[cp.Tag, DictPacket | None]: if not self.is_active(): logger.info( f"Pod is not active: skipping computation on input packet {packet}" @@ -444,11 +444,11 @@ def combine(*components: tuple[str, ...]) -> str: async def async_call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, - ) -> tuple[dp.Tag, dp.Packet | None]: + execution_engine: cp.ExecutionEngine | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: """ Asynchronous call to the function pod. This is a placeholder for future implementation. Currently, it behaves like the synchronous call. 
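A hypothetical usage sketch of FunctionPod, following the constructor parameters shown above (function, output_keys, version); the exact keyword set may differ elsewhere in the codebase:

def count_lines(path: str) -> int:
    with open(path) as f:
        return sum(1 for _ in f)

line_counter = FunctionPod(count_lines, output_keys=["line_count"], version="v0.1")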
@@ -518,7 +518,7 @@ def process_function_output(self, values: Any) -> dict[str, DataValue]: return {k: v for k, v in zip(self.output_keys, output_values)} def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: id_struct = (self.__class__.__name__,) + self.reference # if streams are provided, perform pre-processing step, validate, and add the @@ -538,7 +538,7 @@ class WrappedPod(ActivatablePodBase): def __init__( self, - pod: dp.Pod, + pod: cp.Pod, label: str | None = None, data_context: str | contexts.DataContext | None = None, **kwargs, @@ -561,7 +561,7 @@ def reference(self) -> tuple[str, ...]: """ return self.pod.reference - def get_record_id(self, packet: dp.Packet, execution_engine_hash: str) -> str: + def get_record_id(self, packet: cp.Packet, execution_engine_hash: str) -> str: return self.pod.get_record_id(packet, execution_engine_hash) @property @@ -588,33 +588,33 @@ def output_packet_types(self) -> PythonSchema: """ return self.pod.output_packet_types() - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: self.pod.validate_inputs(*streams) def call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, - ) -> tuple[dp.Tag, dp.Packet | None]: + execution_engine: cp.ExecutionEngine | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: return self.pod.call( tag, packet, record_id=record_id, execution_engine=execution_engine ) async def async_call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, - ) -> tuple[dp.Tag, dp.Packet | None]: + execution_engine: cp.ExecutionEngine | None = None, + ) -> tuple[cp.Tag, cp.Packet | None]: return await self.pod.async_call( tag, packet, record_id=record_id, execution_engine=execution_engine ) def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: return self.pod.identity_structure(streams) @@ -636,7 +636,7 @@ class CachedPod(WrappedPod): def __init__( self, - pod: dp.Pod, + pod: cp.Pod, result_database: ArrowDatabase, record_path_prefix: tuple[str, ...] 
= (), match_tier: str | None = None, @@ -663,13 +663,13 @@ def record_path(self) -> tuple[str, ...]: def call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, - ) -> tuple[dp.Tag, dp.Packet | None]: + ) -> tuple[cp.Tag, cp.Packet | None]: # TODO: consider logic for overwriting existing records execution_engine_hash = execution_engine.name if execution_engine else "default" if record_id is None: @@ -690,13 +690,13 @@ def call( async def async_call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, - ) -> tuple[dp.Tag, dp.Packet | None]: + ) -> tuple[cp.Tag, cp.Packet | None]: # TODO: consider logic for overwriting existing records execution_engine_hash = execution_engine.name if execution_engine else "default" @@ -721,18 +721,18 @@ async def async_call( return tag, output_packet - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" return EfficientPodResultStream(pod=self, input_stream=streams[0]) def record_packet( self, - input_packet: dp.Packet, - output_packet: dp.Packet, + input_packet: cp.Packet, + output_packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, skip_duplicates: bool = False, - ) -> dp.Packet: + ) -> cp.Packet: """ Record the output packet against the input packet in the result store. """ @@ -784,7 +784,7 @@ def record_packet( # # TODO: make store return retrieved table return output_packet - def get_cached_output_for_packet(self, input_packet: dp.Packet) -> dp.Packet | None: + def get_cached_output_for_packet(self, input_packet: cp.Packet) -> cp.Packet | None: """ Retrieve the output packet from the result store based on the input packet. 
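A simplified schematic of the cache-aware call flow in CachedPod, written only in terms of the methods visible above (get_cached_output_for_packet, the wrapped pod's call, record_packet); it is not the class's literal code:

def cached_call(self, tag, packet, record_id=None,
                skip_cache_lookup=False, skip_cache_insert=False):
    output = None if skip_cache_lookup else self.get_cached_output_for_packet(packet)
    if output is None:
        tag, output = self.pod.call(tag, packet, record_id=record_id)
        if output is not None and not skip_cache_insert:
            output = self.record_packet(packet, output, record_id=record_id)
    return tag, output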
If more than one output packet is found, conflict resolution strategy diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py index 1d263a1..c4f020f 100644 --- a/src/orcapod/core/sources/arrow_table_source.py +++ b/src/orcapod/core/sources/arrow_table_source.py @@ -4,7 +4,7 @@ from orcapod.core.streams import TableStream -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from orcapod.core.system_constants import constants @@ -41,6 +41,26 @@ def __init__( if not preserve_system_columns: arrow_table = arrow_data_utils.drop_system_columns(arrow_table) + N_CHAR = 12 + + non_system_columns = arrow_data_utils.drop_system_columns(arrow_table) + tag_schema = non_system_columns.select(tag_columns).schema + # FIXME: ensure tag_columns are found among non system columns + packet_schema = non_system_columns.drop(list(tag_columns)).schema + + tag_python_schema = ( + self.data_context.type_converter.arrow_schema_to_python_schema(tag_schema) + ) + packet_python_schema = ( + self.data_context.type_converter.arrow_schema_to_python_schema( + packet_schema + ) + ) + + schema_hash = self.data_context.object_hasher.hash_object( + (tag_python_schema, packet_python_schema) + ).to_hex(char_count=N_CHAR) + self.tag_columns = [ col for col in tag_columns if col in arrow_table.column_names ] @@ -48,23 +68,24 @@ def __init__( self.table_hash = self.data_context.arrow_hasher.hash_table(arrow_table) if source_name is None: - source_name = self.content_hash().to_hex() + # TODO: extract this from system config + source_name = self.content_hash().to_hex(char_count=12) self._source_name = source_name row_index = list(range(arrow_table.num_rows)) - source_info = [f"{self.source_id}::row_{i}" for i in row_index] + source_info = [ + f"{self.source_id}{constants.BLOCK_SEPARATOR}row_{i}" for i in row_index + ] # add source info arrow_table = arrow_data_utils.add_source_info( arrow_table, source_info, exclude_columns=tag_columns ) - arrow_table = arrow_table.add_column( - 0, - f"{constants.SYSTEM_TAG_PREFIX}{self.source_id}::row_index", - pa.array(row_index, pa.int64()), + arrow_table = arrow_data_utils.add_system_tag_column( + arrow_table, f"source{constants.FIELD_SEPARATOR}{schema_hash}", source_info ) self._table = arrow_table @@ -83,7 +104,7 @@ def __init__( @property def reference(self) -> tuple[str, ...]: - return ("arrow_table", self._source_name) + return ("arrow_table", f"source_{self._source_name}") @property def table(self) -> "pa.Table": @@ -97,7 +118,7 @@ def get_all_records( ) -> "pa.Table | None": return self().as_table(include_source=include_system_columns) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Load data from file and return a static stream. 
diff --git a/src/orcapod/core/sources/base.py b/src/orcapod/core/sources/base.py index 89c0b31..2741099 100644 --- a/src/orcapod/core/sources/base.py +++ b/src/orcapod/core/sources/base.py @@ -8,7 +8,7 @@ KernelStream, StatefulStreamBase, ) -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule @@ -36,16 +36,26 @@ def __init__(self, **kwargs): super().__init__(**kwargs) # Cache the KernelStream for reuse across all stream method calls self._cached_kernel_stream: KernelStream | None = None + self._schema_hash: str | None = None + + def schema_hash(self) -> str: + # TODO: Migrate this to central config + N_CHAR = 12 + if self._schema_hash is None: + self._schema_hash = self.data_context.object_hasher.hash_object( + (self.tag_types(), self.packet_types()) + ).to_hex(N_CHAR) + return self._schema_hash def kernel_identity_structure( - self, streams: Collection[dp.Stream] | None = None + self, streams: Collection[cp.Stream] | None = None ) -> Any: if streams is not None: # when checked for invocation id, act as a source # and just return the output packet types # _, packet_types = self.stream.types() # return packet_types - return None + return self.schema_hash() # otherwise, return the identity structure of the stream return self.source_identity_structure() @@ -61,7 +71,7 @@ def reference(self) -> tuple[str, ...]: ... def kernel_output_types( - self, *streams: dp.Stream, include_system_tags: bool = False + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: return self.source_output_types(include_system_tags=include_system_tags) @@ -95,7 +105,7 @@ def source_output_types(self, include_system_tags: bool = False) -> Any: ... # self, streams: Collection[dp.Stream] | None = None # ) -> dp.Any: ... - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: """Sources take no input streams.""" if len(streams) > 0: raise ValueError( @@ -103,7 +113,7 @@ def validate_inputs(self, *streams: dp.Stream) -> None: ) def prepare_output_stream( - self, *streams: dp.Stream, label: str | None = None + self, *streams: cp.Stream, label: str | None = None ) -> KernelStream: if self._cached_kernel_stream is None: self._cached_kernel_stream = super().prepare_output_stream( @@ -111,19 +121,19 @@ def prepare_output_stream( ) return self._cached_kernel_stream - def track_invocation(self, *streams: dp.Stream, label: str | None = None) -> None: + def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: if not self._skip_tracking and self._tracker_manager is not None: self._tracker_manager.record_source_invocation(self, label=label) # ==================== Stream Protocol (Delegation) ==================== @property - def source(self) -> dp.Kernel | None: + def source(self) -> cp.Kernel | None: """Sources are their own source.""" return self @property - def upstreams(self) -> tuple[dp.Stream, ...]: + def upstreams(self) -> tuple[cp.Stream, ...]: """Sources have no upstream dependencies.""" return () @@ -149,7 +159,7 @@ def is_current(self) -> bool: """Delegate to the cached KernelStream.""" return self().is_current - def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: + def __iter__(self) -> Iterator[tuple[cp.Tag, cp.Packet]]: """ Iterate over the cached KernelStream. 
@@ -159,8 +169,8 @@ def __iter__(self) -> Iterator[tuple[dp.Tag, dp.Packet]]: def iter_packets( self, - execution_engine: dp.ExecutionEngine | None = None, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().iter_packets(execution_engine=execution_engine) @@ -171,7 +181,7 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pa.Table": """Delegate to the cached KernelStream.""" return self().as_table( @@ -184,12 +194,12 @@ def as_table( ) def flow( - self, execution_engine: dp.ExecutionEngine | None = None - ) -> Collection[tuple[dp.Tag, dp.Packet]]: + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Collection[tuple[cp.Tag, cp.Packet]]: """Delegate to the cached KernelStream.""" return self().flow(execution_engine=execution_engine) - def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: """ Run the source node, executing the contained source. @@ -198,7 +208,7 @@ def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: self().run(execution_engine=execution_engine) async def run_async( - self, execution_engine: dp.ExecutionEngine | None = None + self, execution_engine: cp.ExecutionEngine | None = None ) -> None: """ Run the source node asynchronously, executing the contained source. @@ -232,7 +242,7 @@ def reset_cache(self) -> None: class StreamSource(SourceBase): - def __init__(self, stream: dp.Stream, label: str | None = None, **kwargs) -> None: + def __init__(self, stream: cp.Stream, label: str | None = None, **kwargs) -> None: """ A placeholder source based on stream This is used to represent a kernel that has no computation. @@ -254,7 +264,7 @@ def source_output_types( def reference(self) -> tuple[str, ...]: return ("stream", self.stream.content_hash().to_string()) - def forward(self, *args: Any, **kwargs: Any) -> dp.Stream: + def forward(self, *args: Any, **kwargs: Any) -> cp.Stream: """ Forward the stream through the stub kernel. This is a no-op and simply returns the stream. diff --git a/src/orcapod/core/sources/csv_source.py b/src/orcapod/core/sources/csv_source.py index 9f71dec..cafc6c7 100644 --- a/src/orcapod/core/sources/csv_source.py +++ b/src/orcapod/core/sources/csv_source.py @@ -4,7 +4,7 @@ from orcapod.core.streams import ( TableStream, ) -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule @@ -39,7 +39,7 @@ def __init__( def source_identity_structure(self) -> Any: return (self.__class__.__name__, self.source_id, tuple(self.tag_columns)) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Load data from file and return a static stream. 
diff --git a/src/orcapod/core/sources/delta_table_source.py b/src/orcapod/core/sources/delta_table_source.py index 3f2a03f..4249793 100644 --- a/src/orcapod/core/sources/delta_table_source.py +++ b/src/orcapod/core/sources/delta_table_source.py @@ -3,7 +3,7 @@ from orcapod.core.streams import TableStream -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import PathLike, PythonSchema from orcapod.utils.lazy_module import LazyModule from pathlib import Path @@ -85,7 +85,7 @@ def source_identity_structure(self) -> Any: "tag_columns": self._tag_columns, } - def validate_inputs(self, *streams: dp.Stream) -> None: + def validate_inputs(self, *streams: cp.Stream) -> None: """Delta table sources don't take input streams.""" if len(streams) > 0: raise ValueError( @@ -99,7 +99,7 @@ def source_output_types( # Create a sample stream to get types return self.forward().types(include_system_tags=include_system_tags) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Generate stream from Delta table data. diff --git a/src/orcapod/core/sources/dict_source.py b/src/orcapod/core/sources/dict_source.py index f96eeba..d291b3f 100644 --- a/src/orcapod/core/sources/dict_source.py +++ b/src/orcapod/core/sources/dict_source.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Any -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import DataValue, PythonSchema, PythonSchemaLike from orcapod.utils.lazy_module import LazyModule from orcapod.core.system_constants import constants @@ -95,7 +95,7 @@ def get_all_records( include_system_columns=include_system_columns ) - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """ Load data from file and return a static stream. 
diff --git a/src/orcapod/core/sources/list_source.py b/src/orcapod/core/sources/list_source.py index 699d3ab..fdc7ffa 100644 --- a/src/orcapod/core/sources/list_source.py +++ b/src/orcapod/core/sources/list_source.py @@ -13,7 +13,7 @@ StatefulStreamBase, ) from orcapod.errors import DuplicateTagError -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import DataValue, PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -78,14 +78,14 @@ class ListSource(SourceBase): """ @staticmethod - def default_tag_function(element: Any, idx: int) -> dp.Tag: + def default_tag_function(element: Any, idx: int) -> cp.Tag: return DictTag({"element_index": idx}) def __init__( self, name: str, data: list[Any], - tag_function: Callable[[Any, int], dp.Tag] | None = None, + tag_function: Callable[[Any, int], cp.Tag] | None = None, label: str | None = None, tag_function_hash_mode: Literal["content", "signature", "name"] = "name", expected_tag_keys: Collection[str] | None = None, diff --git a/src/orcapod/core/sources/manual_table_source.py b/src/orcapod/core/sources/manual_table_source.py index a428695..ba365ec 100644 --- a/src/orcapod/core/sources/manual_table_source.py +++ b/src/orcapod/core/sources/manual_table_source.py @@ -1,25 +1,16 @@ -from collections.abc import Collection, Iterator +from collections.abc import Collection from pathlib import Path from typing import TYPE_CHECKING, Any, cast from deltalake import DeltaTable, write_deltalake from deltalake.exceptions import TableNotFoundError -from pyarrow.lib import Table - -from orcapod.core.kernels import TrackedKernelBase -from orcapod.core.streams import ( - TableStream, - KernelStream, - StatefulStreamBase, -) -from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry + +from orcapod.core.sources.source_registry import SourceRegistry +from orcapod.core.streams import TableStream from orcapod.errors import DuplicateTagError -from orcapod.protocols import core_protocols as dp -from orcapod.types import DataValue, PythonSchema, PythonSchemaLike -from orcapod.utils import arrow_utils +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema, PythonSchemaLike from orcapod.utils.lazy_module import LazyModule -from orcapod.core.system_constants import constants -from orcapod.semantic_types import infer_python_schema_from_pylist_data if TYPE_CHECKING: import pandas as pd @@ -124,7 +115,7 @@ def delta_table_version(self) -> int | None: return self._delta_table.version() return None - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: """Load current delta table data as a stream.""" if len(streams) > 0: raise ValueError("ManualDeltaTableSource takes no input streams") diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index b190691..6d8b5f1 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -14,7 +14,7 @@ DictTag, ) from orcapod.core.system_constants import constants -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.types import PythonSchema from orcapod.utils import arrow_utils from orcapod.utils.lazy_module import LazyModule @@ -40,32 +40,8 @@ logger = logging.getLogger(__name__) -def synchronous_run(async_func, *args, **kwargs): - """ - Use existing event loop if available. 
- - Pros: Reuses existing loop, more efficient - Cons: More complex, need to handle loop detection - """ - try: - # Check if we're already in an event loop - _ = asyncio.get_running_loop() - - def run_in_thread(): - return asyncio.run(async_func(*args, **kwargs)) - - import concurrent.futures - - with concurrent.futures.ThreadPoolExecutor() as executor: - future = executor.submit(run_in_thread) - return future.result() - except RuntimeError: - # No event loop running, safe to use asyncio.run() - return asyncio.run(async_func(*args, **kwargs)) - - class OperatorStreamBaseMixin: - def join(self, other_stream: dp.Stream, label: str | None = None) -> dp.Stream: + def join(self, other_stream: cp.Stream, label: str | None = None) -> cp.Stream: """ Joins this stream with another stream, returning a new stream that contains the combined data from both streams. @@ -76,9 +52,9 @@ def join(self, other_stream: dp.Stream, label: str | None = None) -> dp.Stream: def semi_join( self, - other_stream: dp.Stream, + other_stream: cp.Stream, label: str | None = None, - ) -> dp.Stream: + ) -> cp.Stream: """ Performs a semi-join with another stream, returning a new stream that contains only the packets from this stream that have matching tags in the other stream. @@ -92,7 +68,7 @@ def map_tags( name_map: Mapping[str, str], drop_unmapped: bool = True, label: str | None = None, - ) -> dp.Stream: + ) -> cp.Stream: """ Maps the tags in this stream according to the provided name_map. If drop_unmapped is True, any tags that are not in the name_map will be dropped. @@ -106,7 +82,7 @@ def map_packets( name_map: Mapping[str, str], drop_unmapped: bool = True, label: str | None = None, - ) -> dp.Stream: + ) -> cp.Stream: """ Maps the packets in this stream according to the provided packet_map. If drop_unmapped is True, any packets that are not in the packet_map will be dropped. @@ -120,7 +96,7 @@ def batch( batch_size: int = 0, drop_last: bool = False, label: str | None = None, - ) -> dp.Stream: + ) -> cp.Stream: """ Batch stream into fixed-size chunks, each of size batch_size. If drop_last is True, any remaining elements that don't fit into a full batch will be dropped. @@ -137,7 +113,7 @@ class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase def __init__( self, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -156,7 +132,7 @@ def substream_identities(self) -> tuple[str, ...]: return (self.content_hash().to_hex(),) @property - def execution_engine(self) -> dp.ExecutionEngine | None: + def execution_engine(self) -> cp.ExecutionEngine | None: """ Returns the execution engine that is used to execute this stream. This is typically used to track the execution context of the stream. @@ -164,14 +140,14 @@ def execution_engine(self) -> dp.ExecutionEngine | None: return self._execution_engine @execution_engine.setter - def execution_engine(self, engine: dp.ExecutionEngine | None) -> None: + def execution_engine(self, engine: cp.ExecutionEngine | None) -> None: """ Sets the execution engine for the stream. This is typically used to track the execution context of the stream. """ self._execution_engine = engine - def get_substream(self, substream_id: str) -> dp.Stream: + def get_substream(self, substream_id: str) -> cp.Stream: """ Returns the substream with the given substream_id. This is used to retrieve a specific substream from the stream. 
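The fluent helpers on OperatorStreamBaseMixin compose naturally; a sketch chaining them with hypothetical stream names, using the signatures declared above:

result = (
    recordings                                        # some cp.Stream
    .join(metadata)                                   # Join on shared tag columns
    .map_packets({"raw_path": "path"}, drop_unmapped=False)
    .map_tags({"element_index": "trial"})
    .batch(batch_size=4, drop_last=True)
)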
@@ -183,7 +159,7 @@ def get_substream(self, substream_id: str) -> dp.Stream: @property @abstractmethod - def source(self) -> dp.Kernel | None: + def source(self) -> cp.Kernel | None: """ The source of the stream, which is the kernel that generated the stream. This is typically used to track the origin of the stream in the computational graph. @@ -192,7 +168,7 @@ def source(self) -> dp.Kernel | None: @property @abstractmethod - def upstreams(self) -> tuple[dp.Stream, ...]: + def upstreams(self) -> tuple[cp.Stream, ...]: """ The upstream streams that are used to generate this stream. This is typically used to track the origin of the stream in the computational graph. @@ -279,25 +255,25 @@ def _set_modified_time( def __iter__( self, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: return self.iter_packets() @abstractmethod def iter_packets( self, - execution_engine: dp.ExecutionEngine | None = None, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: ... @abstractmethod def run( self, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> None: ... @abstractmethod async def run_async( self, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> None: ... @abstractmethod @@ -308,7 +284,7 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pa.Table": ... def as_polars_df( @@ -318,7 +294,7 @@ def as_polars_df( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pl.DataFrame | None": """ Convert the entire stream to a Polars DataFrame. @@ -341,7 +317,7 @@ def as_df( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pl.DataFrame | None": """ Convert the entire stream to a Polars DataFrame. @@ -362,7 +338,7 @@ def as_lazy_frame( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pl.LazyFrame | None": """ Convert the entire stream to a Polars LazyFrame. @@ -387,7 +363,7 @@ def as_pandas_df( include_content_hash: bool | str = False, sort_by_tags: bool = True, index_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pd.DataFrame | None": df = self.as_polars_df( include_data_context=include_data_context, @@ -406,8 +382,8 @@ def as_pandas_df( return pdf def flow( - self, execution_engine: dp.ExecutionEngine | None = None - ) -> Collection[tuple[dp.Tag, dp.Packet]]: + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Collection[tuple[cp.Tag, cp.Packet]]: """ Flow everything through the stream, returning the entire collection of (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. 
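For reference, the conversion and materialisation helpers declared above can be used like this (illustrative only; keyword arguments follow the signatures in this class):

table = result.as_table(include_content_hash=True)   # pyarrow.Table
pl_df = result.as_polars_df(sort_by_tags=True)       # polars.DataFrame
pd_df = result.as_pandas_df(index_by_tags=True)      # pandas.DataFrame indexed by tag columns
for tag, packet in result.flow():                     # triggers upstream computation
    ...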
@@ -435,8 +411,8 @@ class StreamBase(StatefulStreamBase): def __init__( self, - source: dp.Kernel | None = None, - upstreams: tuple[dp.Stream, ...] = (), + source: cp.Kernel | None = None, + upstreams: tuple[cp.Stream, ...] = (), data_context: str | contexts.DataContext | None = None, **kwargs, ) -> None: @@ -451,7 +427,7 @@ def __init__( super().__init__(data_context=data_context, **kwargs) @property - def source(self) -> dp.Kernel | None: + def source(self) -> cp.Kernel | None: """ The source of the stream, which is the kernel that generated the stream. This is typically used to track the origin of the stream in the computational graph. @@ -459,7 +435,7 @@ def source(self) -> dp.Kernel | None: return self._source @property - def upstreams(self) -> tuple[dp.Stream, ...]: + def upstreams(self) -> tuple[cp.Stream, ...]: """ The upstream streams that are used to generate this stream. This is typically used to track the origin of the stream in the computational graph. @@ -560,8 +536,8 @@ def __init__( tag_columns: Collection[str] = (), system_tag_columns: Collection[str] = (), source_info: dict[str, str | None] | None = None, - source: dp.Kernel | None = None, - upstreams: tuple[dp.Stream, ...] = (), + source: cp.Kernel | None = None, + upstreams: tuple[cp.Stream, ...] = (), **kwargs, ) -> None: super().__init__(source=source, upstreams=upstreams, **kwargs) @@ -655,7 +631,7 @@ def __init__( # ) # ) - self._cached_elements: list[tuple[dp.Tag, ArrowPacket]] | None = None + self._cached_elements: list[tuple[cp.Tag, ArrowPacket]] | None = None self._set_modified_time() # set modified time to now def data_content_identity_structure(self) -> Any: @@ -711,7 +687,7 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pa.Table": """ Returns the underlying table representation of the stream. @@ -762,8 +738,8 @@ def clear_cache(self) -> None: self._cached_elements = None def iter_packets( - self, execution_engine: dp.ExecutionEngine | None = None - ) -> Iterator[tuple[dp.Tag, ArrowPacket]]: + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Iterator[tuple[cp.Tag, ArrowPacket]]: """ Iterates over the packets in the stream. Each packet is represented as a tuple of (Tag, Packet). @@ -806,7 +782,7 @@ def iter_packets( ) yield from self._cached_elements - def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: """ Runs the stream, which in this case is a no-op since the stream is immutable. This is typically used to trigger any upstream computation of the stream. @@ -815,7 +791,7 @@ def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: pass async def run_async( - self, execution_engine: dp.ExecutionEngine | None = None + self, execution_engine: cp.ExecutionEngine | None = None ) -> None: """ Runs the stream asynchronously, which in this case is a no-op since the stream is immutable. @@ -842,10 +818,10 @@ class KernelStream(StreamBase): def __init__( self, - output_stream: dp.Stream | None = None, - source: dp.Kernel | None = None, + output_stream: cp.Stream | None = None, + source: cp.Kernel | None = None, upstreams: tuple[ - dp.Stream, ... + cp.Stream, ... 
] = (), # if provided, this will override the upstreams of the output_stream **kwargs, ) -> None: @@ -939,7 +915,7 @@ def last_modified(self) -> datetime | None: return None return self._cached_stream.last_modified - def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: + def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." @@ -947,7 +923,7 @@ def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: self._cached_stream.run(execution_engine=execution_engine) async def run_async( - self, execution_engine: dp.ExecutionEngine | None = None + self, execution_engine: cp.ExecutionEngine | None = None ) -> None: self.refresh() assert self._cached_stream is not None, ( @@ -962,7 +938,7 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pa.Table": self.refresh() assert self._cached_stream is not None, ( @@ -979,8 +955,8 @@ def as_table( def iter_packets( self, - execution_engine: dp.ExecutionEngine | None = None, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: self.refresh() assert self._cached_stream is not None, ( "Stream has not been updated or is empty." @@ -997,7 +973,7 @@ class LazyPodResultStream(StreamBase): This is what Pod.process() returns - it's static/fixed but efficient. """ - def __init__(self, pod: dp.Pod, prepared_stream: dp.Stream, **kwargs): + def __init__(self, pod: cp.Pod, prepared_stream: cp.Stream, **kwargs): super().__init__(source=pod, upstreams=(prepared_stream,), **kwargs) self.pod = pod self.prepared_stream = prepared_stream @@ -1006,13 +982,13 @@ def __init__(self, pod: dp.Pod, prepared_stream: dp.Stream, **kwargs): self._prepared_stream_iterator = prepared_stream.iter_packets() # Packet-level caching (from your PodStream) - self._cached_output_packets: dict[int, tuple[dp.Tag, dp.Packet | None]] = {} + self._cached_output_packets: dict[int, tuple[cp.Tag, cp.Packet | None]] = {} self._cached_output_table: pa.Table | None = None self._cached_content_hash_column: pa.Array | None = None def iter_packets( - self, execution_engine: dp.ExecutionEngine | None = None - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: if self._prepared_stream_iterator is not None: for i, (tag, packet) in enumerate(self._prepared_stream_iterator): if i in self._cached_output_packets: @@ -1042,7 +1018,7 @@ def iter_packets( yield tag, packet async def run_async( - self, execution_engine: dp.ExecutionEngine | None = None + self, execution_engine: cp.ExecutionEngine | None = None ) -> None: if self._prepared_stream_iterator is not None: pending_call_lut = {} @@ -1065,21 +1041,9 @@ async def run_async( def run( self, - execution_engine: dp.ExecutionEngine | None = None, - try_async_backend: bool = True, + execution_engine: cp.ExecutionEngine | None = None, ) -> None: - if try_async_backend: - # Use async run if requested - try: - return synchronous_run( - self.run_async, execution_engine=execution_engine - ) - except RuntimeError as e: - logger.warning( - "Failed to run async stream synchronously, falling back to sync run: %s", - e, - ) - # Fallback to synchronous run + # Fallback 
to synchronous run self.flow(execution_engine=execution_engine) def keys( @@ -1111,7 +1075,7 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pa.Table": if self._cached_output_table is None: all_tags = [] @@ -1175,7 +1139,7 @@ def as_table( content_hashes = [] # TODO: verify that order will be preserved for tag, packet in self.iter_packets(): - content_hashes.append(packet.content_hash()) + content_hashes.append(packet.content_hash().to_string()) self._cached_content_hash_column = pa.array( content_hashes, type=pa.large_string() ) @@ -1205,7 +1169,7 @@ class EfficientPodResultStream(StreamBase): """ # TODO: define interface for storage or pod storage - def __init__(self, pod: dp.CachedPod, input_stream: dp.Stream, **kwargs): + def __init__(self, pod: cp.CachedPod, input_stream: cp.Stream, **kwargs): super().__init__(source=pod, upstreams=(input_stream,), **kwargs) self.pod = pod self.input_stream = input_stream @@ -1215,12 +1179,12 @@ def __init__(self, pod: dp.CachedPod, input_stream: dp.Stream, **kwargs): self._prepared_stream_iterator = input_stream.iter_packets() # Packet-level caching (from your PodStream) - self._cached_output_packets: list[tuple[dp.Tag, dp.Packet | None]] | None = None + self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None self._cached_output_table: pa.Table | None = None self._cached_content_hash_column: pa.Array | None = None async def run_async( - self, execution_engine: dp.ExecutionEngine | None = None + self, execution_engine: cp.ExecutionEngine | None = None ) -> None: """ Runs the stream, processing the input stream and preparing the output stream. @@ -1298,26 +1262,13 @@ async def run_async( def run( self, - execution_engine: dp.ExecutionEngine | None = None, - try_async_backend: bool = False, + execution_engine: cp.ExecutionEngine | None = None, ) -> None: - if try_async_backend: - # Use async run if requested - try: - return synchronous_run( - self.run_async, execution_engine=execution_engine - ) - except RuntimeError as e: - logger.warning( - "Failed to run async stream synchronously, falling back to sync run: %s", - e, - ) - # Fallback to synchronous run self.flow(execution_engine=execution_engine) def iter_packets( - self, execution_engine: dp.ExecutionEngine | None = None - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: """ Processes the input stream and prepares the output stream. This is typically called before iterating over the packets. 
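# Note on the hunks above: with the `try_async_backend` fallback removed, run() on both
# result streams is now strictly synchronous and simply delegates to flow(); callers that
# want the asynchronous path invoke run_async() themselves. A minimal usage sketch under
# that assumption; `stream` and `engine` are illustrative placeholders, not names defined
# in this patch:
import asyncio

def materialize(stream, engine=None, use_async=False):
    if use_async:
        # async execution is now an explicit opt-in rather than a silent fallback
        asyncio.run(stream.run_async(execution_engine=engine))
    else:
        stream.run(execution_engine=engine)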
@@ -1454,7 +1405,7 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pa.Table": if self._cached_output_table is None: all_tags = [] @@ -1547,9 +1498,9 @@ def as_table( class WrappedStream(StreamBase): def __init__( self, - stream: dp.Stream, - source: dp.Kernel, - input_streams: tuple[dp.Stream, ...], + stream: cp.Stream, + source: cp.Kernel, + input_streams: tuple[cp.Stream, ...], label: str | None = None, **kwargs, ) -> None: @@ -1581,7 +1532,7 @@ def as_table( include_system_tags: bool = False, include_content_hash: bool | str = False, sort_by_tags: bool = True, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, ) -> "pa.Table": """ Returns the underlying table representation of the stream. @@ -1598,8 +1549,8 @@ def as_table( def iter_packets( self, - execution_engine: dp.ExecutionEngine | None = None, - ) -> Iterator[tuple[dp.Tag, dp.Packet]]: + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: """ Iterates over the packets in the stream. Each packet is represented as a tuple of (Tag, Packet). diff --git a/src/orcapod/core/system_constants.py b/src/orcapod/core/system_constants.py index edd3503..7dc6e49 100644 --- a/src/orcapod/core/system_constants.py +++ b/src/orcapod/core/system_constants.py @@ -6,16 +6,26 @@ DATA_CONTEXT_KEY = "context_key" INPUT_PACKET_HASH = "input_packet_hash" PACKET_RECORD_ID = "packet_id" -SYSTEM_TAG_PREFIX = "system_tag_" +SYSTEM_TAG_PREFIX = "tag" POD_VERSION = "pod_version" EXECUTION_ENGINE = "execution_engine" POD_TIMESTAMP = "pod_ts" +FIELD_SEPARATOR = ":" +BLOCK_SEPARATOR = "::" class SystemConstant: def __init__(self, global_prefix: str = ""): self._global_prefix = global_prefix + @property + def BLOCK_SEPARATOR(self) -> str: + return BLOCK_SEPARATOR + + @property + def FIELD_SEPARATOR(self) -> str: + return FIELD_SEPARATOR + @property def META_PREFIX(self) -> str: return f"{self._global_prefix}{SYSTEM_COLUMN_PREFIX}" @@ -46,7 +56,7 @@ def PACKET_RECORD_ID(self) -> str: @property def SYSTEM_TAG_PREFIX(self) -> str: - return f"{self._global_prefix}{DATAGRAM_PREFIX}{SYSTEM_TAG_PREFIX}" + return f"{self._global_prefix}{DATAGRAM_PREFIX}{SYSTEM_TAG_PREFIX}{self.BLOCK_SEPARATOR}" @property def POD_VERSION(self) -> str: diff --git a/src/orcapod/core/trackers.py b/src/orcapod/core/trackers.py index b3e7689..4ffe39a 100644 --- a/src/orcapod/core/trackers.py +++ b/src/orcapod/core/trackers.py @@ -1,5 +1,5 @@ from orcapod.core.base import LabeledContentIdentifiableBase -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from collections import defaultdict from collections.abc import Generator from abc import ABC, abstractmethod @@ -13,7 +13,7 @@ class BasicTrackerManager: def __init__(self) -> None: - self._active_trackers: list[dp.Tracker] = [] + self._active_trackers: list[cp.Tracker] = [] self._active = True def set_active(self, active: bool = True) -> None: @@ -23,7 +23,7 @@ def set_active(self, active: bool = True) -> None: """ self._active = active - def register_tracker(self, tracker: dp.Tracker) -> None: + def register_tracker(self, tracker: cp.Tracker) -> None: """ Register a new tracker in the system. This is used to add a new tracker to the list of active trackers. 
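# The system_constants.py hunk above shortens SYSTEM_TAG_PREFIX to "tag" and makes the
# SYSTEM_TAG_PREFIX property end with the new BLOCK_SEPARATOR. A standalone sketch of the
# resulting prefix composition; the "_dg_" datagram prefix is an assumed example value,
# while "tag" and "::" come from this change:
SYSTEM_TAG_PREFIX = "tag"
BLOCK_SEPARATOR = "::"
DATAGRAM_PREFIX = "_dg_"  # assumption for illustration only

def system_tag_prefix(global_prefix: str = "") -> str:
    # mirrors SystemConstant.SYSTEM_TAG_PREFIX after this change
    return f"{global_prefix}{DATAGRAM_PREFIX}{SYSTEM_TAG_PREFIX}{BLOCK_SEPARATOR}"

assert system_tag_prefix() == "_dg_tag::"
assert system_tag_prefix("x_") == "x__dg_tag::"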
@@ -31,7 +31,7 @@ def register_tracker(self, tracker: dp.Tracker) -> None: if tracker not in self._active_trackers: self._active_trackers.append(tracker) - def deregister_tracker(self, tracker: dp.Tracker) -> None: + def deregister_tracker(self, tracker: cp.Tracker) -> None: """ Remove a tracker from the system. This is used to deactivate a tracker and remove it from the list of active trackers. @@ -39,7 +39,7 @@ def deregister_tracker(self, tracker: dp.Tracker) -> None: if tracker in self._active_trackers: self._active_trackers.remove(tracker) - def get_active_trackers(self) -> list[dp.Tracker]: + def get_active_trackers(self) -> list[cp.Tracker]: """ Get the list of active trackers. This is used to retrieve the currently active trackers in the system. @@ -52,8 +52,8 @@ def get_active_trackers(self) -> list[dp.Tracker]: def record_kernel_invocation( self, - kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...], + kernel: cp.Kernel, + upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: """ @@ -64,7 +64,7 @@ def record_kernel_invocation( tracker.record_kernel_invocation(kernel, upstreams, label=label) def record_source_invocation( - self, source: dp.Source, label: str | None = None + self, source: cp.Source, label: str | None = None ) -> None: """ Record the output stream of a source invocation in the tracker. @@ -74,7 +74,7 @@ def record_source_invocation( tracker.record_source_invocation(source, label=label) def record_pod_invocation( - self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None + self, pod: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None ) -> None: """ Record the output stream of a pod invocation in the tracker. @@ -94,7 +94,7 @@ def no_tracking(self) -> Generator[None, Any, None]: class AutoRegisteringContextBasedTracker(ABC): - def __init__(self, tracker_manager: dp.TrackerManager | None = None) -> None: + def __init__(self, tracker_manager: cp.TrackerManager | None = None) -> None: self._tracker_manager = tracker_manager or DEFAULT_TRACKER_MANAGER self._active = False @@ -111,19 +111,19 @@ def is_active(self) -> bool: @abstractmethod def record_kernel_invocation( self, - kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...], + kernel: cp.Kernel, + upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: ... @abstractmethod def record_source_invocation( - self, source: dp.Source, label: str | None = None + self, source: cp.Source, label: str | None = None ) -> None: ... @abstractmethod def record_pod_invocation( - self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None + self, pod: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None ) -> None: ... def __enter__(self): @@ -137,8 +137,8 @@ def __exit__(self, exc_type, exc_val, ext_tb): class Invocation(LabeledContentIdentifiableBase): def __init__( self, - kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...] = (), + kernel: cp.Kernel, + upstreams: tuple[cp.Stream, ...] 
= (), label: str | None = None, ) -> None: """ @@ -195,7 +195,7 @@ class GraphTracker(AutoRegisteringContextBasedTracker): def __init__( self, - tracker_manager: dp.TrackerManager | None = None, + tracker_manager: cp.TrackerManager | None = None, **kwargs, ) -> None: super().__init__(tracker_manager=tracker_manager) @@ -203,13 +203,13 @@ def __init__( # Dictionary to map kernels to the streams they have invoked # This is used to track the computational graph and the invocations of kernels self.kernel_invocations: set[Invocation] = set() - self.invocation_to_pod_lut: dict[Invocation, dp.Pod] = {} - self.invocation_to_source_lut: dict[Invocation, dp.Source] = {} + self.invocation_to_pod_lut: dict[Invocation, cp.Pod] = {} + self.invocation_to_source_lut: dict[Invocation, cp.Source] = {} def _record_kernel_and_get_invocation( self, - kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...], + kernel: cp.Kernel, + upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> Invocation: invocation = Invocation(kernel, upstreams, label=label) @@ -218,8 +218,8 @@ def _record_kernel_and_get_invocation( def record_kernel_invocation( self, - kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...], + kernel: cp.Kernel, + upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: """ @@ -229,7 +229,7 @@ def record_kernel_invocation( self._record_kernel_and_get_invocation(kernel, upstreams, label) def record_source_invocation( - self, source: dp.Source, label: str | None = None + self, source: cp.Source, label: str | None = None ) -> None: """ Record the output stream of a source invocation in the tracker. @@ -238,7 +238,7 @@ def record_source_invocation( self.invocation_to_source_lut[invocation] = source def record_pod_invocation( - self, pod: dp.Pod, upstreams: tuple[dp.Stream, ...], label: str | None = None + self, pod: cp.Pod, upstreams: tuple[cp.Stream, ...], label: str | None = None ) -> None: """ Record the output stream of a pod invocation in the tracker. @@ -246,7 +246,7 @@ def record_pod_invocation( invocation = self._record_kernel_and_get_invocation(pod, upstreams, label) self.invocation_to_pod_lut[invocation] = pod - def reset(self) -> dict[dp.Kernel, list[dp.Stream]]: + def reset(self) -> dict[cp.Kernel, list[cp.Stream]]: """ Reset the tracker and return the recorded invocations. 
""" diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 4fb0c13..292aa30 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -1,14 +1,7 @@ -from typing import Any -from orcapod.protocols.hashing_protocols import ( - ContentIdentifiable, - ObjectHasher, - FunctionInfoExtractor, -) import logging import json -from uuid import UUID from pathlib import Path -from collections.abc import Mapping, Collection, Callable +from collections.abc import Collection, Callable import hashlib import xxhash import zlib @@ -17,6 +10,29 @@ logger = logging.getLogger(__name__) +# TODO: extract default char count as config +def combine_hashes( + *hashes: str, + order: bool = False, + prefix_hasher_id: bool = False, + hex_char_count: int | None = 20, +) -> str: + """Combine hashes into a single hash string.""" + + # Sort for deterministic order regardless of input order + if order: + prepared_hashes = sorted(hashes) + else: + prepared_hashes = list(hashes) + combined = "".join(prepared_hashes) + combined_hash = hashlib.sha256(combined.encode()).hexdigest() + if hex_char_count is not None: + combined_hash = combined_hash[:hex_char_count] + if prefix_hasher_id: + return "sha256@" + combined_hash + return combined_hash + + def serialize_through_json(processed_obj) -> bytes: """ Create a deterministic string representation of a processed object structure. @@ -33,270 +49,270 @@ def serialize_through_json(processed_obj) -> bytes: ) -def process_structure( - obj: Any, - visited: set[int] | None = None, - object_hasher: ObjectHasher | None = None, - function_info_extractor: FunctionInfoExtractor | None = None, - compressed: bool = False, - force_hash: bool = True, -) -> Any: - """ - Recursively process a structure to prepare it for hashing. 
- - Args: - obj: The object or structure to process - visited: Set of object ids already visited (to handle circular references) - function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation - - Returns: - A processed version of the structure suitable for stable hashing - """ - # Initialize the visited set if this is the top-level call - if visited is None: - visited = set() - else: - visited = visited.copy() # Copy to avoid modifying the original set - - # Check for circular references - use object's memory address - # NOTE: While id() is not stable across sessions, we only use it within a session - # to detect circular references, not as part of the final hash - obj_id = id(obj) - if obj_id in visited: - logger.debug( - f"Detected circular reference for object of type {type(obj).__name__}" - ) - return "CircularRef" # Don't include the actual id in hash output - - # For objects that could contain circular references, add to visited - if isinstance(obj, (dict, list, tuple, set)) or not isinstance( - obj, (str, int, float, bool, type(None)) - ): - visited.add(obj_id) - - # Handle None - if obj is None: - return None - - # TODO: currently using runtime_checkable on ContentIdentifiable protocol - # Re-evaluate this strategy to see if a faster / more robust check could be used - if isinstance(obj, ContentIdentifiable): - logger.debug( - f"Processing ContentHashableBase instance of type {type(obj).__name__}" - ) - if compressed: - # if compressed, the content identifiable object is immediately replaced with - # its hashed string identity - if object_hasher is None: - raise ValueError( - "ObjectHasher must be provided to hash ContentIdentifiable objects with compressed=True" - ) - return object_hasher.hash_to_hex(obj.identity_structure(), compressed=True) - else: - # if not compressed, replace the object with expanded identity structure and re-process - return process_structure( - obj.identity_structure(), - visited, - object_hasher=object_hasher, - function_info_extractor=function_info_extractor, - ) - - # Handle basic types - if isinstance(obj, (str, int, float, bool)): - return obj - - # Handle bytes and bytearray - if isinstance(obj, (bytes, bytearray)): - logger.debug( - f"Converting bytes/bytearray of length {len(obj)} to hex representation" - ) - return obj.hex() - - # Handle Path objects - if isinstance(obj, Path): - logger.debug(f"Converting Path object to string: {obj}") - return str(obj) - - # Handle UUID objects - if isinstance(obj, UUID): - logger.debug(f"Converting UUID to string: {obj}") - return str(obj) - - # Handle named tuples (which are subclasses of tuple) - if hasattr(obj, "_fields") and isinstance(obj, tuple): - logger.debug(f"Processing named tuple of type {type(obj).__name__}") - # For namedtuples, convert to dict and then process - d = {field: getattr(obj, field) for field in obj._fields} # type: ignore - return process_structure( - d, - visited, - object_hasher=object_hasher, - function_info_extractor=function_info_extractor, - compressed=compressed, - ) - - # Handle mappings (dict-like objects) - if isinstance(obj, Mapping): - # Process both keys and values - processed_items = [ - ( - process_structure( - k, - visited, - object_hasher=object_hasher, - function_info_extractor=function_info_extractor, - compressed=compressed, - ), - process_structure( - v, - visited, - object_hasher=object_hasher, - function_info_extractor=function_info_extractor, - compressed=compressed, - ), - ) - for k, v in obj.items() - ] - - # Sort 
by the processed keys for deterministic order - processed_items.sort(key=lambda x: str(x[0])) - - # Create a new dictionary with string keys based on processed keys - # TODO: consider checking for possibly problematic values in processed_k - # and issue a warning - return { - str(processed_k): processed_v - for processed_k, processed_v in processed_items - } - - # Handle sets and frozensets - if isinstance(obj, (set, frozenset)): - logger.debug( - f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" - ) - # Process each item first, then sort the processed results - processed_items = [ - process_structure( - item, - visited, - object_hasher=object_hasher, - function_info_extractor=function_info_extractor, - compressed=compressed, - ) - for item in obj - ] - return sorted(processed_items, key=str) - - # Handle collections (list-like objects) - if isinstance(obj, Collection): - logger.debug( - f"Processing collection of type {type(obj).__name__} with {len(obj)} items" - ) - return [ - process_structure( - item, - visited, - object_hasher=object_hasher, - function_info_extractor=function_info_extractor, - compressed=compressed, - ) - for item in obj - ] - - # For functions, use the function_content_hash - if callable(obj) and hasattr(obj, "__code__"): - logger.debug(f"Processing function: {getattr(obj, '__name__')}") - if function_info_extractor is not None: - # Use the extractor to get a stable representation - function_info = function_info_extractor.extract_function_info(obj) - logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") - - # simply return the function info as a stable representation - return function_info - else: - raise ValueError( - f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" - ) - - # handle data types - if isinstance(obj, type): - logger.debug(f"Processing class/type: {obj.__name__}") - return f"type:{obj.__name__}" - - # For other objects, attempt to create deterministic representation only if force_hash=True - class_name = obj.__class__.__name__ - module_name = obj.__class__.__module__ - if force_hash: - try: - import re - - logger.debug( - f"Processing generic object of type {module_name}.{class_name}" - ) - - # Try to get a stable dict representation if possible - if hasattr(obj, "__dict__"): - # Sort attributes to ensure stable order - attrs = sorted( - (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") - ) - # Limit to first 10 attributes to avoid extremely long representations - if len(attrs) > 10: - logger.debug( - f"Object has {len(attrs)} attributes, limiting to first 10" - ) - attrs = attrs[:10] - attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] - obj_repr = f"{{{', '.join(attr_strs)}}}" - else: - # Get basic repr but remove memory addresses - logger.debug( - "Object has no __dict__, using repr() with memory address removal" - ) - obj_repr = repr(obj) - if len(obj_repr) > 1000: - logger.debug( - f"Object repr is {len(obj_repr)} chars, truncating to 1000" - ) - obj_repr = obj_repr[:1000] + "..." 
- # Remove memory addresses which look like '0x7f9a1c2b3d4e' - obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) - - return f"{module_name}.{class_name}:{obj_repr}" - except Exception as e: - # Last resort - use class name only - logger.warning(f"Failed to process object representation: {e}") - try: - return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" - except AttributeError: - logger.error("Could not determine object class, using UnknownObject") - return "UnknownObject" - else: - raise ValueError( - f"Processing of {obj} of type {module_name}.{class_name} is not supported" - ) - - -def hash_object( - obj: Any, - function_info_extractor: FunctionInfoExtractor | None = None, - compressed: bool = False, -) -> bytes: - # Process the object to handle nested structures and HashableMixin instances - processed = process_structure( - obj, function_info_extractor=function_info_extractor, compressed=compressed - ) - - # Serialize the processed structure - json_str = json.dumps(processed, sort_keys=True, separators=(",", ":")).encode( - "utf-8" - ) - logger.debug( - f"Successfully serialized {type(obj).__name__} using custom serializer" - ) - - # Create the hash - return hashlib.sha256(json_str).digest() +# def process_structure( +# obj: Any, +# visited: set[int] | None = None, +# object_hasher: ObjectHasher | None = None, +# function_info_extractor: FunctionInfoExtractor | None = None, +# compressed: bool = False, +# force_hash: bool = True, +# ) -> Any: +# """ +# Recursively process a structure to prepare it for hashing. + +# Args: +# obj: The object or structure to process +# visited: Set of object ids already visited (to handle circular references) +# function_info_extractor: FunctionInfoExtractor to be used for extracting necessary function representation + +# Returns: +# A processed version of the structure suitable for stable hashing +# """ +# # Initialize the visited set if this is the top-level call +# if visited is None: +# visited = set() +# else: +# visited = visited.copy() # Copy to avoid modifying the original set + +# # Check for circular references - use object's memory address +# # NOTE: While id() is not stable across sessions, we only use it within a session +# # to detect circular references, not as part of the final hash +# obj_id = id(obj) +# if obj_id in visited: +# logger.debug( +# f"Detected circular reference for object of type {type(obj).__name__}" +# ) +# return "CircularRef" # Don't include the actual id in hash output + +# # For objects that could contain circular references, add to visited +# if isinstance(obj, (dict, list, tuple, set)) or not isinstance( +# obj, (str, int, float, bool, type(None)) +# ): +# visited.add(obj_id) + +# # Handle None +# if obj is None: +# return None + +# # TODO: currently using runtime_checkable on ContentIdentifiable protocol +# # Re-evaluate this strategy to see if a faster / more robust check could be used +# if isinstance(obj, ContentIdentifiable): +# logger.debug( +# f"Processing ContentHashableBase instance of type {type(obj).__name__}" +# ) +# if compressed: +# # if compressed, the content identifiable object is immediately replaced with +# # its hashed string identity +# if object_hasher is None: +# raise ValueError( +# "ObjectHasher must be provided to hash ContentIdentifiable objects with compressed=True" +# ) +# return object_hasher.hash_object(obj.identity_structure(), compressed=True) +# else: +# # if not compressed, replace the object with expanded identity structure and re-process +# return 
process_structure( +# obj.identity_structure(), +# visited, +# object_hasher=object_hasher, +# function_info_extractor=function_info_extractor, +# ) + +# # Handle basic types +# if isinstance(obj, (str, int, float, bool)): +# return obj + +# # Handle bytes and bytearray +# if isinstance(obj, (bytes, bytearray)): +# logger.debug( +# f"Converting bytes/bytearray of length {len(obj)} to hex representation" +# ) +# return obj.hex() + +# # Handle Path objects +# if isinstance(obj, Path): +# logger.debug(f"Converting Path object to string: {obj}") +# return str(obj) + +# # Handle UUID objects +# if isinstance(obj, UUID): +# logger.debug(f"Converting UUID to string: {obj}") +# return str(obj) + +# # Handle named tuples (which are subclasses of tuple) +# if hasattr(obj, "_fields") and isinstance(obj, tuple): +# logger.debug(f"Processing named tuple of type {type(obj).__name__}") +# # For namedtuples, convert to dict and then process +# d = {field: getattr(obj, field) for field in obj._fields} # type: ignore +# return process_structure( +# d, +# visited, +# object_hasher=object_hasher, +# function_info_extractor=function_info_extractor, +# compressed=compressed, +# ) + +# # Handle mappings (dict-like objects) +# if isinstance(obj, Mapping): +# # Process both keys and values +# processed_items = [ +# ( +# process_structure( +# k, +# visited, +# object_hasher=object_hasher, +# function_info_extractor=function_info_extractor, +# compressed=compressed, +# ), +# process_structure( +# v, +# visited, +# object_hasher=object_hasher, +# function_info_extractor=function_info_extractor, +# compressed=compressed, +# ), +# ) +# for k, v in obj.items() +# ] + +# # Sort by the processed keys for deterministic order +# processed_items.sort(key=lambda x: str(x[0])) + +# # Create a new dictionary with string keys based on processed keys +# # TODO: consider checking for possibly problematic values in processed_k +# # and issue a warning +# return { +# str(processed_k): processed_v +# for processed_k, processed_v in processed_items +# } + +# # Handle sets and frozensets +# if isinstance(obj, (set, frozenset)): +# logger.debug( +# f"Processing set/frozenset of type {type(obj).__name__} with {len(obj)} items" +# ) +# # Process each item first, then sort the processed results +# processed_items = [ +# process_structure( +# item, +# visited, +# object_hasher=object_hasher, +# function_info_extractor=function_info_extractor, +# compressed=compressed, +# ) +# for item in obj +# ] +# return sorted(processed_items, key=str) + +# # Handle collections (list-like objects) +# if isinstance(obj, Collection): +# logger.debug( +# f"Processing collection of type {type(obj).__name__} with {len(obj)} items" +# ) +# return [ +# process_structure( +# item, +# visited, +# object_hasher=object_hasher, +# function_info_extractor=function_info_extractor, +# compressed=compressed, +# ) +# for item in obj +# ] + +# # For functions, use the function_content_hash +# if callable(obj) and hasattr(obj, "__code__"): +# logger.debug(f"Processing function: {getattr(obj, '__name__')}") +# if function_info_extractor is not None: +# # Use the extractor to get a stable representation +# function_info = function_info_extractor.extract_function_info(obj) +# logger.debug(f"Extracted function info: {function_info} for {obj.__name__}") + +# # simply return the function info as a stable representation +# return function_info +# else: +# raise ValueError( +# f"Function {obj} encountered during processing but FunctionInfoExtractor is missing" +# ) + +# # handle 
data types +# if isinstance(obj, type): +# logger.debug(f"Processing class/type: {obj.__name__}") +# return f"type:{obj.__name__}" + +# # For other objects, attempt to create deterministic representation only if force_hash=True +# class_name = obj.__class__.__name__ +# module_name = obj.__class__.__module__ +# if force_hash: +# try: +# import re + +# logger.debug( +# f"Processing generic object of type {module_name}.{class_name}" +# ) + +# # Try to get a stable dict representation if possible +# if hasattr(obj, "__dict__"): +# # Sort attributes to ensure stable order +# attrs = sorted( +# (k, v) for k, v in obj.__dict__.items() if not k.startswith("_") +# ) +# # Limit to first 10 attributes to avoid extremely long representations +# if len(attrs) > 10: +# logger.debug( +# f"Object has {len(attrs)} attributes, limiting to first 10" +# ) +# attrs = attrs[:10] +# attr_strs = [f"{k}={type(v).__name__}" for k, v in attrs] +# obj_repr = f"{{{', '.join(attr_strs)}}}" +# else: +# # Get basic repr but remove memory addresses +# logger.debug( +# "Object has no __dict__, using repr() with memory address removal" +# ) +# obj_repr = repr(obj) +# if len(obj_repr) > 1000: +# logger.debug( +# f"Object repr is {len(obj_repr)} chars, truncating to 1000" +# ) +# obj_repr = obj_repr[:1000] + "..." +# # Remove memory addresses which look like '0x7f9a1c2b3d4e' +# obj_repr = re.sub(r" at 0x[0-9a-f]+", " at 0xMEMADDR", obj_repr) + +# return f"{module_name}.{class_name}:{obj_repr}" +# except Exception as e: +# # Last resort - use class name only +# logger.warning(f"Failed to process object representation: {e}") +# try: +# return f"object:{obj.__class__.__module__}.{obj.__class__.__name__}" +# except AttributeError: +# logger.error("Could not determine object class, using UnknownObject") +# return "UnknownObject" +# else: +# raise ValueError( +# f"Processing of {obj} of type {module_name}.{class_name} is not supported" +# ) + + +# def hash_object( +# obj: Any, +# function_info_extractor: FunctionInfoExtractor | None = None, +# compressed: bool = False, +# ) -> bytes: +# # Process the object to handle nested structures and HashableMixin instances +# processed = process_structure( +# obj, function_info_extractor=function_info_extractor, compressed=compressed +# ) + +# # Serialize the processed structure +# json_str = json.dumps(processed, sort_keys=True, separators=(",", ":")).encode( +# "utf-8" +# ) +# logger.debug( +# f"Successfully serialized {type(obj).__name__} using custom serializer" +# ) + +# # Create the hash +# return hashlib.sha256(json_str).digest() def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 14f27a9..666b66e 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -2,11 +2,36 @@ from orcapod.pipeline.nodes import KernelNode, PodNode from orcapod.protocols.pipeline_protocols import Node from orcapod import contexts -from orcapod.protocols import core_protocols as dp +from orcapod.protocols import core_protocols as cp from orcapod.protocols import database_protocols as dbp from typing import Any from collections.abc import Collection import logging +import asyncio + + +def synchronous_run(async_func, *args, **kwargs): + """ + Use existing event loop if available. 
+ + Pros: Reuses existing loop, more efficient + Cons: More complex, need to handle loop detection + """ + try: + # Check if we're already in an event loop + _ = asyncio.get_running_loop() + + def run_in_thread(): + return asyncio.run(async_func(*args, **kwargs)) + + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(run_in_thread) + return future.result() + except RuntimeError: + # No event loop running, safe to use asyncio.run() + return asyncio.run(async_func(*args, **kwargs)) logger = logging.getLogger(__name__) @@ -23,7 +48,7 @@ def __init__( name: str | tuple[str, ...], pipeline_database: dbp.ArrowDatabase, results_database: dbp.ArrowDatabase | None = None, - tracker_manager: dp.TrackerManager | None = None, + tracker_manager: cp.TrackerManager | None = None, data_context: str | contexts.DataContext | None = None, auto_compile: bool = True, ): @@ -61,8 +86,8 @@ def flush(self) -> None: def record_kernel_invocation( self, - kernel: dp.Kernel, - upstreams: tuple[dp.Stream, ...], + kernel: cp.Kernel, + upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: super().record_kernel_invocation(kernel, upstreams, label) @@ -70,8 +95,8 @@ def record_kernel_invocation( def record_pod_invocation( self, - pod: dp.Pod, - upstreams: tuple[dp.Stream, ...], + pod: cp.Pod, + upstreams: tuple[cp.Stream, ...], label: str | None = None, ) -> None: super().record_pod_invocation(pod, upstreams, label) @@ -102,17 +127,41 @@ def compile(self) -> None: else: self.nodes[label] = nodes[0] - def run(self, execution_engine: dp.ExecutionEngine | None = None) -> None: - # FIXME: perform more efficient traversal through the graph! + def run( + self, + execution_engine: cp.ExecutionEngine | None = None, + run_async: bool | None = None, + ) -> None: + """Execute the pipeline by running all nodes in the graph. + + This method traverses through all nodes in the graph and executes them sequentially + using the specified execution engine. After execution, flushes the pipeline. + + Args: + execution_engine (cp.ExecutionEngine | None): The execution engine to use for running + the nodes. If None, creates a new default ExecutionEngine instance. + run_async (bool | None): Whether to run nodes asynchronously. If None, defaults to + the preferred mode based on the execution engine. + + Returns: + None + + Note: + Current implementation uses a simple traversal through all nodes. Future versions + may implement more efficient graph traversal algorithms. 
+ """ for node in self.nodes.values(): - node.run(execution_engine=execution_engine) + if run_async: + synchronous_run(node.run_async, execution_engine=execution_engine) + else: + node.run(execution_engine=execution_engine) self.flush() def wrap_invocation( self, invocation: Invocation, - new_input_streams: Collection[dp.Stream], + new_input_streams: Collection[cp.Stream], ) -> Node: if invocation in self.invocation_to_pod_lut: pod = self.invocation_to_pod_lut[invocation] diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 97c91bd..c2723e7 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,9 +1,8 @@ from abc import abstractmethod -from einops import pack from orcapod.core.kernels import KernelStream, WrappedKernel from orcapod.core.sources import SourceBase from orcapod.core.pods import CachedPod -from orcapod.protocols import core_protocols as dp, database_protocols as dbp +from orcapod.protocols import core_protocols as cp, database_protocols as dbp from orcapod.types import PythonSchema from orcapod.utils.lazy_module import LazyModule from typing import TYPE_CHECKING, Any @@ -30,7 +29,7 @@ class NodeBase( def __init__( self, - input_streams: Collection[dp.Stream], + input_streams: Collection[cp.Stream], pipeline_database: dbp.ArrowDatabase, pipeline_path_prefix: tuple[str, ...] = (), **kwargs, @@ -54,7 +53,7 @@ def __init__( self.pipeline_database = pipeline_database @property - def contained_kernel(self) -> dp.Kernel: + def contained_kernel(self) -> cp.Kernel: raise NotImplementedError( "This property should be implemented by subclasses to return the contained kernel." ) @@ -72,7 +71,7 @@ def pipeline_path(self) -> tuple[str, ...]: """ ... - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: if len(streams) > 0: raise NotImplementedError( "At this moment, Node does not yet support handling additional input streams." @@ -120,8 +119,8 @@ class KernelNode(NodeBase, WrappedKernel): def __init__( self, - kernel: dp.Kernel, - input_streams: Collection[dp.Stream], + kernel: cp.Kernel, + input_streams: Collection[cp.Stream], pipeline_database: dbp.ArrowDatabase, pipeline_path_prefix: tuple[str, ...] = (), **kwargs, @@ -135,7 +134,7 @@ def __init__( ) @property - def contained_kernel(self) -> dp.Kernel: + def contained_kernel(self) -> cp.Kernel: return self.kernel def __repr__(self): @@ -144,13 +143,13 @@ def __repr__(self): def __str__(self): return f"KernelNode:{self.kernel!s}" - def forward(self, *streams: dp.Stream) -> dp.Stream: + def forward(self, *streams: cp.Stream) -> cp.Stream: output_stream = super().forward(*streams) self.record_pipeline_output(output_stream) return output_stream - def record_pipeline_output(self, output_stream: dp.Stream) -> None: + def record_pipeline_output(self, output_stream: cp.Stream) -> None: key_column_name = self.HASH_COLUMN_NAME output_table = output_stream.as_table( include_data_context=True, @@ -204,8 +203,8 @@ def get_all_records( class PodNode(NodeBase, CachedPod): def __init__( self, - pod: dp.Pod, - input_streams: Collection[dp.Stream], + pod: cp.Pod, + input_streams: Collection[cp.Stream], pipeline_database: dbp.ArrowDatabase, result_database: dbp.ArrowDatabase | None = None, record_path_prefix: tuple[str, ...] 
= (), @@ -223,7 +222,7 @@ def __init__( ) @property - def contained_kernel(self) -> dp.Kernel: + def contained_kernel(self) -> cp.Kernel: return self.pod @property @@ -249,13 +248,13 @@ def __str__(self): def call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, - ) -> tuple[dp.Tag, dp.Packet | None]: + ) -> tuple[cp.Tag, cp.Packet | None]: execution_engine_hash = execution_engine.name if execution_engine else "default" if record_id is None: record_id = self.get_record_id(packet, execution_engine_hash) @@ -286,13 +285,13 @@ def call( async def async_call( self, - tag: dp.Tag, - packet: dp.Packet, + tag: cp.Tag, + packet: cp.Packet, record_id: str | None = None, - execution_engine: dp.ExecutionEngine | None = None, + execution_engine: cp.ExecutionEngine | None = None, skip_cache_lookup: bool = False, skip_cache_insert: bool = False, - ) -> tuple[dp.Tag, dp.Packet | None]: + ) -> tuple[cp.Tag, cp.Packet | None]: execution_engine_hash = execution_engine.name if execution_engine else "default" if record_id is None: record_id = self.get_record_id(packet, execution_engine_hash) @@ -323,8 +322,8 @@ async def async_call( def add_pipeline_record( self, - tag: dp.Tag, - input_packet: dp.Packet, + tag: cp.Tag, + input_packet: cp.Packet, packet_record_id: str, retrieved: bool | None = None, skip_cache_lookup: bool = False, From 0f44d2abeca11389cac34046cc74ac1592da4fc2 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 31 Aug 2025 17:06:43 -0700 Subject: [PATCH 218/224] feat: add data frame source, config and fix bugs pertaining to node handling --- src/orcapod/__init__.py | 5 + src/orcapod/config.py | 36 +++ src/orcapod/core/base.py | 28 +- src/orcapod/core/operators/__init__.py | 3 + src/orcapod/core/operators/batch.py | 9 +- .../core/operators/column_selection.py | 245 ++++++++++++++++++ src/orcapod/core/operators/join.py | 9 +- src/orcapod/core/operators/mappers.py | 4 +- src/orcapod/core/polars_data_utils.py | 122 +++++++++ src/orcapod/core/sources/__init__.py | 2 + .../core/sources/arrow_table_source.py | 11 +- src/orcapod/core/sources/base.py | 167 +++++++++++- src/orcapod/core/sources/data_frame_source.py | 143 ++++++++++ src/orcapod/core/streams.py | 37 ++- src/orcapod/pipeline/nodes.py | 58 +++-- .../protocols/core_protocols/streams.py | 60 ++++- 16 files changed, 878 insertions(+), 61 deletions(-) create mode 100644 src/orcapod/config.py create mode 100644 src/orcapod/core/operators/column_selection.py create mode 100644 src/orcapod/core/polars_data_utils.py create mode 100644 src/orcapod/core/sources/data_frame_source.py diff --git a/src/orcapod/__init__.py b/src/orcapod/__init__.py index 3311d8b..f1ebd4b 100644 --- a/src/orcapod/__init__.py +++ b/src/orcapod/__init__.py @@ -1,8 +1,10 @@ +from .config import DEFAULT_CONFIG, Config from .core import DEFAULT_TRACKER_MANAGER from .core.pods import function_pod, FunctionPod, CachedPod from .core import streams from .core import operators from .core import sources +from .core.sources import DataFrameSource from . 
import databases from .pipeline import Pipeline @@ -10,6 +12,8 @@ no_tracking = DEFAULT_TRACKER_MANAGER.no_tracking __all__ = [ + "DEFAULT_CONFIG", + "Config", "DEFAULT_TRACKER_MANAGER", "no_tracking", "function_pod", @@ -18,6 +22,7 @@ "streams", "databases", "sources", + "DataFrameSource", "operators", "Pipeline", ] diff --git a/src/orcapod/config.py b/src/orcapod/config.py new file mode 100644 index 0000000..f36a514 --- /dev/null +++ b/src/orcapod/config.py @@ -0,0 +1,36 @@ +# config.py +from dataclasses import dataclass, replace +from typing import Self + + +@dataclass(frozen=True) +class Config: + """Immutable configuration object.""" + + system_tag_hash_n_char: int = 12 + schema_hash_n_char: int = 12 + path_hash_n_char: int = 20 + + def with_updates(self, **kwargs) -> Self: + """Create a new Config instance with updated values.""" + return replace(self, **kwargs) + + def merge(self, other: "Config") -> "Config": + """Merge with another config, other takes precedence.""" + if not isinstance(other, Config): + raise TypeError("Can only merge with another Config instance") + + # Get all non-default values from other + defaults = Config() + updates = {} + for field_name in self.__dataclass_fields__: + other_value = getattr(other, field_name) + default_value = getattr(defaults, field_name) + if other_value != default_value: + updates[field_name] = other_value + + return self.with_updates(**updates) + + +# Module-level default config - created at import time +DEFAULT_CONFIG = Config() diff --git a/src/orcapod/core/base.py b/src/orcapod/core/base.py index a71933f..828c371 100644 --- a/src/orcapod/core/base.py +++ b/src/orcapod/core/base.py @@ -1,12 +1,10 @@ -from abc import ABC -from collections.abc import Collection -from pathlib import Path -from typing import Any, Mapping -from uuid import UUID -from orcapod.protocols import hashing_protocols as hp -from orcapod import contexts import logging +from abc import ABC +from typing import Any +from orcapod import DEFAULT_CONFIG, contexts +from orcapod.config import Config +from orcapod.protocols import hashing_protocols as hp logger = logging.getLogger(__name__) @@ -54,13 +52,23 @@ def computed_label(self) -> str | None: return None -class ContextAwareBase(ABC): +class ContextAwareConfigurableBase(ABC): def __init__( - self, data_context: str | contexts.DataContext | None = None, **kwargs + self, + data_context: str | contexts.DataContext | None = None, + orcapod_config: Config | None = None, + **kwargs, ): super().__init__(**kwargs) + if orcapod_config is None: + orcapod_config = DEFAULT_CONFIG + self._orcapod_config = orcapod_config self._data_context = contexts.resolve_context(data_context) + @property + def orcapod_config(self) -> Config: + return self._orcapod_config + @property def data_context(self) -> contexts.DataContext: return self._data_context @@ -71,7 +79,7 @@ def data_context_key(self) -> str: return self._data_context.context_key -class ContentIdentifiableBase(ContextAwareBase): +class ContentIdentifiableBase(ContextAwareConfigurableBase): """ Base class for content-identifiable objects. 
This class provides a way to define objects that can be uniquely identified diff --git a/src/orcapod/core/operators/__init__.py b/src/orcapod/core/operators/__init__.py index 5890694..6cc8ee3 100644 --- a/src/orcapod/core/operators/__init__.py +++ b/src/orcapod/core/operators/__init__.py @@ -2,6 +2,7 @@ from .semijoin import SemiJoin from .mappers import MapTags, MapPackets from .batch import Batch +from .column_selection import DropTagColumns, DropPacketColumns __all__ = [ "Join", @@ -9,4 +10,6 @@ "MapTags", "MapPackets", "Batch", + "DropTagColumns", + "DropPacketColumns", ] diff --git a/src/orcapod/core/operators/batch.py b/src/orcapod/core/operators/batch.py index b4323b8..be48b3c 100644 --- a/src/orcapod/core/operators/batch.py +++ b/src/orcapod/core/operators/batch.py @@ -20,14 +20,14 @@ class Batch(UnaryOperator): Base class for all operators. """ - def __init__(self, batch_size: int = 0, drop_last_batch: bool = False, **kwargs): + def __init__(self, batch_size: int = 0, drop_partial_batch: bool = False, **kwargs): if batch_size < 0: raise ValueError("Batch size must be non-negative.") super().__init__(**kwargs) self.batch_size = batch_size - self.drop_last_batch = drop_last_batch + self.drop_partial_batch = drop_partial_batch def check_unary_input( self, @@ -77,7 +77,7 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: next_batch = {} i = 0 - if i > 0 and not self.drop_last_batch: + if i > 0 and not self.drop_partial_batch: batched_data.append(next_batch) batched_table = pa.Table.from_pylist(batched_data) @@ -99,7 +99,8 @@ def op_output_types( def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: return ( - (self.__class__.__name__, self.batch_size, self.drop_last_batch) + (stream,) + (self.__class__.__name__, self.batch_size, self.drop_partial_batch) + + (stream,) if stream is not None else () ) diff --git a/src/orcapod/core/operators/column_selection.py b/src/orcapod/core/operators/column_selection.py new file mode 100644 index 0000000..46f1612 --- /dev/null +++ b/src/orcapod/core/operators/column_selection.py @@ -0,0 +1,245 @@ +from orcapod.protocols import core_protocols as cp +from orcapod.core.streams import TableStream +from orcapod.types import PythonSchema +from typing import Any, TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule +from collections.abc import Collection, Mapping +from orcapod.errors import InputValidationError +from orcapod.core.system_constants import constants +from orcapod.core.operators.base import UnaryOperator +import logging + + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + +logger = logging.getLogger(__name__) + + +class DropTagColumns(UnaryOperator): + """ + Operator that drops specified columns from a stream. + """ + + def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs): + if isinstance(columns, str): + columns = [columns] + self.columns = columns + self.strict = strict + super().__init__(**kwargs) + + def op_forward(self, stream: cp.Stream) -> cp.Stream: + tag_columns, packet_columns = stream.keys() + columns_to_drop = self.columns + if not self.strict: + columns_to_drop = [c for c in columns_to_drop if c in tag_columns] + + new_tag_columns = [c for c in tag_columns if c not in columns_to_drop] + + if len(columns_to_drop) == 0: + logger.info("No tag columns to drop. 
Returning stream unaltered.") + return stream + + table = stream.as_table( + include_source=True, include_system_tags=True, sort_by_tags=False + ) + + modified_table = table.drop_columns(list(columns_to_drop)) + + return TableStream( + modified_table, + tag_columns=new_tag_columns, + source=self, + upstreams=(stream,), + ) + + def op_validate_inputs(self, stream: cp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # TODO: remove redundant logic + tag_columns, packet_columns = stream.keys() + columns_to_drop = self.columns + missing_columns = set(columns_to_drop) - set(tag_columns) + if missing_columns: + if self.strict: + raise InputValidationError( + f"Missing tag columns: {missing_columns}. Make sure all specified columns to drop are present or use strict=False to ignore missing columns" + ) + + def op_output_types( + self, stream: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_schema, packet_schema = stream.types( + include_system_tags=include_system_tags + ) + tag_columns, _ = stream.keys() + new_tag_columns = [c for c in tag_columns if c not in self.columns] + + new_tag_schema = {k: v for k, v in tag_schema.items() if k in new_tag_columns} + + return new_tag_schema, packet_schema + + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.columns, + self.strict, + ) + ((stream,) if stream is not None else ()) + + +class DropPacketColumns(UnaryOperator): + """ + Operator that drops specified columns from a stream. + """ + + def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs): + if isinstance(columns, str): + columns = [columns] + self.columns = columns + self.strict = strict + super().__init__(**kwargs) + + def op_forward(self, stream: cp.Stream) -> cp.Stream: + tag_columns, packet_columns = stream.keys() + columns_to_drop = self.columns + if not self.strict: + columns_to_drop = [c for c in columns_to_drop if c in packet_columns] + + if len(columns_to_drop) == 0: + logger.info("No packet columns to drop. Returning stream unaltered.") + return stream + + table = stream.as_table( + include_source=True, include_system_tags=True, sort_by_tags=False + ) + + modified_table = table.drop_columns(list(columns_to_drop)) + + return TableStream( + modified_table, + tag_columns=tag_columns, + source=self, + upstreams=(stream,), + ) + + def op_validate_inputs(self, stream: cp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # TODO: remove redundant logic + _, packet_columns = stream.keys() + missing_columns = set(self.columns) - set(packet_columns) + if missing_columns and self.strict: + raise InputValidationError( + f"Missing packet columns: {missing_columns}. 
Make sure all specified columns to drop are present or use strict=False to ignore missing columns" + ) + + def op_output_types( + self, stream: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_schema, packet_schema = stream.types( + include_system_tags=include_system_tags + ) + new_packet_schema = { + k: v for k, v in packet_schema.items() if k not in self.columns + } + + return tag_schema, new_packet_schema + + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.columns, + self.strict, + ) + ((stream,) if stream is not None else ()) + + +class MapTags(UnaryOperator): + """ + Operator that maps tags in a stream using a user-defined function. + The function is applied to each tag in the stream, and the resulting tags + are returned as a new stream. + """ + + def __init__( + self, name_map: Mapping[str, str], drop_unmapped: bool = False, **kwargs + ): + self.name_map = dict(name_map) + self.drop_unmapped = drop_unmapped + super().__init__(**kwargs) + + def op_forward(self, stream: cp.Stream) -> cp.Stream: + tag_columns, packet_columns = stream.keys() + missing_tags = set(tag_columns) - set(self.name_map.keys()) + + if not any(n in tag_columns for n in self.name_map): + # nothing to rename in the tags, return stream as is + return stream + + table = stream.as_table(include_source=True, include_system_tags=True) + + name_map = { + tc: self.name_map.get(tc, tc) for tc in tag_columns + } # rename the tag as necessary + new_tag_columns = [name_map[tc] for tc in tag_columns] + for c in packet_columns: + name_map[c] = c # no renaming on packet columns + + renamed_table = table.rename_columns(name_map) + + if missing_tags and self.drop_unmapped: + # drop any tags that are not in the name map + renamed_table = renamed_table.drop_columns(list(missing_tags)) + + return TableStream( + renamed_table, tag_columns=new_tag_columns, source=self, upstreams=(stream,) + ) + + def op_validate_inputs(self, stream: cp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # verify that renamed value does NOT collide with other columns + tag_columns, packet_columns = stream.keys() + relevant_source = [] + relevant_target = [] + for source, target in self.name_map.items(): + if source in tag_columns: + relevant_source.append(source) + relevant_target.append(target) + remaining_tag_columns = set(tag_columns) - set(relevant_source) + overlapping_tag_columns = remaining_tag_columns.intersection(relevant_target) + overlapping_packet_columns = set(packet_columns).intersection(relevant_target) + + if overlapping_tag_columns or overlapping_packet_columns: + message = f"Renaming {self.name_map} would cause collisions with existing columns: " + if overlapping_tag_columns: + message += f"overlapping tag columns: {overlapping_tag_columns}." + if overlapping_packet_columns: + message += f"overlapping packet columns: {overlapping_packet_columns}." 
+ raise InputValidationError(message) + + def op_output_types( + self, stream: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, packet_typespec = stream.types( + include_system_tags=include_system_tags + ) + + # Create new packet typespec with renamed keys + new_tag_typespec = {self.name_map.get(k, k): v for k, v in tag_typespec.items()} + + return new_tag_typespec, packet_typespec + + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.name_map, + self.drop_unmapped, + ) + ((stream,) if stream is not None else ()) diff --git a/src/orcapod/core/operators/join.py b/src/orcapod/core/operators/join.py index 95b0371..04c65ee 100644 --- a/src/orcapod/core/operators/join.py +++ b/src/orcapod/core/operators/join.py @@ -85,9 +85,9 @@ def op_forward(self, *streams: cp.Stream) -> cp.Stream: table = stream.as_table(include_source=True, include_system_tags=True) # trick to get cartesian product table = table.add_column(0, COMMON_JOIN_KEY, pa.array([0] * len(table))) - N_CHAR = 12 table = arrow_data_utils.append_to_system_tags( - table, stream.content_hash().to_hex(char_count=N_CHAR) + table, + stream.content_hash().to_hex(self.orcapod_config.system_tag_hash_n_char), ) for next_stream in streams[1:]: @@ -96,7 +96,10 @@ def op_forward(self, *streams: cp.Stream) -> cp.Stream: include_source=True, include_system_tags=True ) next_table = arrow_data_utils.append_to_system_tags( - next_table, next_stream.content_hash().to_hex(char_count=N_CHAR) + next_table, + next_stream.content_hash().to_hex( + char_count=self.orcapod_config.system_tag_hash_n_char + ), ) # trick to ensure that there will always be at least one shared key # this ensure that no overlap in keys lead to full caretesian product diff --git a/src/orcapod/core/operators/mappers.py b/src/orcapod/core/operators/mappers.py index 3806080..5e33598 100644 --- a/src/orcapod/core/operators/mappers.py +++ b/src/orcapod/core/operators/mappers.py @@ -104,7 +104,9 @@ def op_output_types( # Create new packet typespec with renamed keys new_packet_typespec = { - self.name_map.get(k, k): v for k, v in packet_typespec.items() + self.name_map.get(k, k): v + for k, v in packet_typespec.items() + if k in self.name_map or not self.drop_unmapped } return tag_typespec, new_packet_typespec diff --git a/src/orcapod/core/polars_data_utils.py b/src/orcapod/core/polars_data_utils.py new file mode 100644 index 0000000..7757a1d --- /dev/null +++ b/src/orcapod/core/polars_data_utils.py @@ -0,0 +1,122 @@ +# Collection of functions to work with Arrow table data that underlies streams and/or datagrams +from orcapod.utils.lazy_module import LazyModule +from typing import TYPE_CHECKING +from orcapod.core.system_constants import constants +from collections.abc import Collection + +if TYPE_CHECKING: + import polars as pl +else: + pl = LazyModule("polars") + + +def drop_columns_with_prefix( + df: "pl.DataFrame", + prefix: str | tuple[str, ...], + exclude_columns: Collection[str] = (), +) -> "pl.DataFrame": + """Drop columns with a specific prefix from a Polars DataFrame.""" + columns_to_drop = [ + col + for col in df.columns + if col.startswith(prefix) and col not in exclude_columns + ] + return df.drop(*columns_to_drop) + + +def drop_system_columns( + df: "pl.DataFrame", + system_column_prefix: tuple[str, ...] 
= ( + constants.META_PREFIX, + constants.DATAGRAM_PREFIX, + ), +) -> "pl.DataFrame": + return drop_columns_with_prefix(df, system_column_prefix) + + +def get_system_columns( + df: "pl.DataFrame", + system_column_prefix: tuple[str, ...] = ( + constants.META_PREFIX, + constants.DATAGRAM_PREFIX, + ), +) -> "pl.DataFrame": + """Get system columns from a Polars DataFrame.""" + return df.select( + [col for col in df.columns if col.startswith(system_column_prefix)] + ) + + +def add_system_tag_column( + df: "pl.DataFrame", + system_tag_column_name: str, + system_tag_values: str | Collection[str], +) -> "pl.DataFrame": + """Add a system tags column to a Polars DataFrame.""" + if df.is_empty(): + raise ValueError("DataFrame is empty") + if isinstance(system_tag_values, str): + system_tag_values = [system_tag_values] * df.height + else: + system_tag_values = list(system_tag_values) + if len(system_tag_values) != df.height: + raise ValueError( + "Length of system_tag_values must match number of rows in the DataFrame." + ) + if not system_tag_column_name.startswith(constants.SYSTEM_TAG_PREFIX): + system_tag_column_name = ( + f"{constants.SYSTEM_TAG_PREFIX}{system_tag_column_name}" + ) + tags_column = pl.Series( + system_tag_column_name, system_tag_values, dtype=pl.String() + ) + return df.with_columns(tags_column) + + +def append_to_system_tags(df: "pl.DataFrame", value: str) -> "pl.DataFrame": + """Append a value to the system tag columns of a Polars DataFrame.""" + if df.is_empty(): + raise ValueError("DataFrame is empty") + + column_name_map = { + c: f"{c}:{value}" + for c in df.columns + if c.startswith(constants.SYSTEM_TAG_PREFIX) + } + return df.rename(column_name_map) + + +def add_source_info( + df: "pl.DataFrame", + source_info: str | Collection[str] | None, + exclude_prefixes: Collection[str] = ( + constants.META_PREFIX, + constants.DATAGRAM_PREFIX, + ), + exclude_columns: Collection[str] = (), +) -> "pl.DataFrame": + """Add source information columns to a Polars DataFrame.""" + # Create a new column with the source information + if source_info is None or isinstance(source_info, str): + source_column = [source_info] * df.height + elif isinstance(source_info, Collection): + if len(source_info) != df.height: + raise ValueError( + "Length of source_info collection must match number of rows in the DataFrame." 
+ ) + source_column = source_info + + # identify columns for which source columns should be created + + for col in df.columns: + if col.startswith(tuple(exclude_prefixes)) or col in exclude_columns: + continue + source_column = pl.Series( + f"{constants.SOURCE_PREFIX}{col}", + [f"{source_val}::{col}" for source_val in source_column], + dtype=pl.String(), + ) + df = df.with_columns(source_column) + + return df diff --git a/src/orcapod/core/sources/__init__.py b/src/orcapod/core/sources/__init__.py index 65ead00..6bc4cf3 100644 --- a/src/orcapod/core/sources/__init__.py +++ b/src/orcapod/core/sources/__init__.py @@ -2,10 +2,12 @@ from .arrow_table_source import ArrowTableSource from .delta_table_source import DeltaTableSource from .dict_source import DictSource +from .data_frame_source import DataFrameSource from .source_registry import SourceRegistry, GLOBAL_SOURCE_REGISTRY __all__ = [ "SourceBase", + "DataFrameSource", "ArrowTableSource", "DeltaTableSource", "DictSource", diff --git a/src/orcapod/core/sources/arrow_table_source.py b/src/orcapod/core/sources/arrow_table_source.py index c4f020f..7d3c789 100644 --- a/src/orcapod/core/sources/arrow_table_source.py +++ b/src/orcapod/core/sources/arrow_table_source.py @@ -1,5 +1,4 @@ from collections.abc import Collection -from re import S from typing import TYPE_CHECKING, Any @@ -41,8 +40,6 @@ def __init__( if not preserve_system_columns: arrow_table = arrow_data_utils.drop_system_columns(arrow_table) - N_CHAR = 12 - non_system_columns = arrow_data_utils.drop_system_columns(arrow_table) tag_schema = non_system_columns.select(tag_columns).schema # FIXME: ensure tag_columns are found among non system columns @@ -59,7 +56,7 @@ def __init__( schema_hash = self.data_context.object_hasher.hash_object( (tag_python_schema, packet_python_schema) - ).to_hex(char_count=N_CHAR) + ).to_hex(char_count=self.orcapod_config.schema_hash_n_char) self.tag_columns = [ col for col in tag_columns if col in arrow_table.column_names @@ -68,8 +65,10 @@ def __init__( self.table_hash = self.data_context.arrow_hasher.hash_table(arrow_table) if source_name is None: - # TODO: extract this from system config - source_name = self.content_hash().to_hex(char_count=12) + # TODO: determine appropriate config name + source_name = self.content_hash().to_hex( + char_count=self.orcapod_config.path_hash_n_char + ) self._source_name = source_name diff --git a/src/orcapod/core/sources/base.py b/src/orcapod/core/sources/base.py index 2741099..8197640 100644 --- a/src/orcapod/core/sources/base.py +++ b/src/orcapod/core/sources/base.py @@ -1,4 +1,5 @@ from abc import abstractmethod +from ast import Not from collections.abc import Collection, Iterator from typing import TYPE_CHECKING, Any @@ -18,6 +19,164 @@ pa = LazyModule("pyarrow") +class InvocationBase(TrackedKernelBase, StatefulStreamBase): + def __init__(self, **kwargs): + super().__init__(**kwargs) + # Cache the KernelStream for reuse across all stream method calls + self._cached_kernel_stream: KernelStream | None = None + + def computed_label(self) -> str | None: + return None + + @abstractmethod + def kernel_identity_structure( + self, streams: Collection[cp.Stream] | None = None + ) -> Any: ... + + # Redefine the reference to ensure subclass would provide a concrete implementation + @property + @abstractmethod + def reference(self) -> tuple[str, ...]: + """Return the unique identifier for the kernel.""" + ... 
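# A minimal, illustrative sketch (not part of this diff) of a concrete
# InvocationBase subclass satisfying the two abstract members declared above.
# The class name and node_name attribute are hypothetical; a real subclass
# must also implement the remaining TrackedKernelBase abstractions
# (forward, kernel_output_types, ...), which are elided here.

from collections.abc import Collection
from typing import Any

from orcapod.core.sources.base import InvocationBase
from orcapod.protocols import core_protocols as cp


class NamedInvocation(InvocationBase):
    def __init__(self, node_name: str, **kwargs):
        super().__init__(**kwargs)
        self._node_name = node_name

    @property
    def reference(self) -> tuple[str, ...]:
        # unique identifier locating this invocation within the pipeline
        return ("invocation", self._node_name)

    def kernel_identity_structure(
        self, streams: Collection[cp.Stream] | None = None
    ) -> Any:
        # identity derives from the class and the node name; input streams
        # are folded in when provided so distinct invocations differ
        if streams is not None:
            return (type(self).__name__, self._node_name, tuple(streams))
        return (type(self).__name__, self._node_name)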
+ + # =========================== Kernel Methods =========================== + + # The following are inherited from TrackedKernelBase as abstract methods. + # @abstractmethod + # def forward(self, *streams: dp.Stream) -> dp.Stream: + # """ + # Pure computation: return a static snapshot of the data. + + # This is the core method that subclasses must implement. + # Each call should return a fresh stream representing the current state of the data. + # This is what KernelStream calls when it needs to refresh its data. + # """ + # ... + + # @abstractmethod + # def kernel_output_types(self, *streams: dp.Stream) -> tuple[TypeSpec, TypeSpec]: + # """Return the tag and packet types this source produces.""" + # ... + + # @abstractmethod + # def kernel_identity_structure( + # self, streams: Collection[dp.Stream] | None = None + # ) -> dp.Any: ... + + def prepare_output_stream( + self, *streams: cp.Stream, label: str | None = None + ) -> KernelStream: + if self._cached_kernel_stream is None: + self._cached_kernel_stream = super().prepare_output_stream( + *streams, label=label + ) + return self._cached_kernel_stream + + def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: + raise NotImplementedError("Behavior for track invocation is not determined") + + # ==================== Stream Protocol (Delegation) ==================== + + @property + def source(self) -> cp.Kernel | None: + """Sources are their own source.""" + return self + + # @property + # def upstreams(self) -> tuple[cp.Stream, ...]: ... + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """Delegate to the cached KernelStream.""" + return self().keys(include_system_tags=include_system_tags) + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + """Delegate to the cached KernelStream.""" + return self().types(include_system_tags=include_system_tags) + + @property + def last_modified(self): + """Delegate to the cached KernelStream.""" + return self().last_modified + + @property + def is_current(self) -> bool: + """Delegate to the cached KernelStream.""" + return self().is_current + + def __iter__(self) -> Iterator[tuple[cp.Tag, cp.Packet]]: + """ + Iterate over the cached KernelStream. + + This allows direct iteration over the source as if it were a stream. 
+ """ + return self().iter_packets() + + def iter_packets( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + """Delegate to the cached KernelStream.""" + return self().iter_packets(execution_engine=execution_engine) + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": + """Delegate to the cached KernelStream.""" + return self().as_table( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + + def flow( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Collection[tuple[cp.Tag, cp.Packet]]: + """Delegate to the cached KernelStream.""" + return self().flow(execution_engine=execution_engine) + + def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: + """ + Run the source node, executing the contained source. + + This is a no-op for sources since they are not executed like pods. + """ + self().run(execution_engine=execution_engine) + + async def run_async( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> None: + """ + Run the source node asynchronously, executing the contained source. + + This is a no-op for sources since they are not executed like pods. + """ + await self().run_async(execution_engine=execution_engine) + + # ==================== LiveStream Protocol (Delegation) ==================== + + def refresh(self, force: bool = False) -> bool: + """Delegate to the cached KernelStream.""" + return self().refresh(force=force) + + def invalidate(self) -> None: + """Delegate to the cached KernelStream.""" + return self().invalidate() + + class SourceBase(TrackedKernelBase, StatefulStreamBase): """ Base class for sources that act as both Kernels and LiveStreams. 
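The delegation block above (mirrored by the equivalent behavior on SourceBase) is what lets an invocation or source be handed directly to anything that expects a stream: every Stream-protocol call is forwarded to the KernelStream cached by prepare_output_stream(). A rough usage sketch follows, using the DataFrameSource added further down in this patch; the frame contents and column names are invented for illustration, and additional constructor arguments may be required depending on the configured defaults.

import polars as pl

from orcapod.core.sources import DataFrameSource

# build a source from an in-memory frame; "subject" becomes the tag column
src = DataFrameSource(
    pl.DataFrame({"subject": ["a", "b"], "score": [1.0, 2.0]}),
    tag_columns="subject",
)

# the source object is usable wherever a stream is expected
tag_keys, packet_keys = src.keys()   # tag and packet column names
snapshot = src.as_table()            # pyarrow.Table snapshot of the data
for tag, packet in src:              # iterate (Tag, Packet) pairs
    print(tag.as_dict(), packet.as_dict())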
@@ -38,13 +197,15 @@ def __init__(self, **kwargs): self._cached_kernel_stream: KernelStream | None = None self._schema_hash: str | None = None + # reset, so that computed label won't be used from StatefulStreamBase + def computed_label(self) -> str | None: + return None + def schema_hash(self) -> str: - # TODO: Migrate this to central config - N_CHAR = 12 if self._schema_hash is None: self._schema_hash = self.data_context.object_hasher.hash_object( (self.tag_types(), self.packet_types()) - ).to_hex(N_CHAR) + ).to_hex(self.orcapod_config.schema_hash_n_char) return self._schema_hash def kernel_identity_structure( diff --git a/src/orcapod/core/sources/data_frame_source.py b/src/orcapod/core/sources/data_frame_source.py new file mode 100644 index 0000000..54cd2ba --- /dev/null +++ b/src/orcapod/core/sources/data_frame_source.py @@ -0,0 +1,143 @@ +from collections.abc import Collection +from typing import TYPE_CHECKING, Any + +import polars + + +from orcapod.core.streams import TableStream +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.system_constants import constants +from orcapod.core import polars_data_utils +from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry + +from orcapod.core.sources.base import SourceBase + +if TYPE_CHECKING: + import pyarrow as pa + import polars as pl + from polars._typing import FrameInitTypes +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + + +class DataFrameSource(SourceBase): + """Construct source from a dataframe and any Polars dataframe compatible data structure""" + + SOURCE_ID = "polars" + + def __init__( + self, + data: "FrameInitTypes", + tag_columns: str | Collection[str] = (), + source_name: str | None = None, + source_registry: SourceRegistry | None = None, + auto_register: bool = True, + preserve_system_columns: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + + # clean the table, dropping any system columns + # Initialize polars dataframe + # TODO: work with LazyFrame + df = pl.DataFrame(data) + + if isinstance(tag_columns, str): + tag_columns = [tag_columns] + + if not preserve_system_columns: + df = polars_data_utils.drop_system_columns(df) + + non_system_columns = polars_data_utils.drop_system_columns(df) + missing_columns = set(tag_columns) - set(non_system_columns.columns) + if missing_columns: + raise ValueError( + f"Following tag columns not found in data: {missing_columns}" + ) + tag_schema = non_system_columns.select(tag_columns).to_arrow().schema + packet_schema = non_system_columns.drop(list(tag_columns)).to_arrow().schema + self.tag_columns = tag_columns + + tag_python_schema = ( + self.data_context.type_converter.arrow_schema_to_python_schema(tag_schema) + ) + packet_python_schema = ( + self.data_context.type_converter.arrow_schema_to_python_schema( + packet_schema + ) + ) + schema_hash = self.data_context.object_hasher.hash_object( + (tag_python_schema, packet_python_schema) + ).to_hex(char_count=self.orcapod_config.schema_hash_n_char) + + self.table_hash = self.data_context.arrow_hasher.hash_table(df.to_arrow()) + + if source_name is None: + # TODO: determine appropriate config name + source_name = self.content_hash().to_hex( + char_count=self.orcapod_config.path_hash_n_char + ) + + self._source_name = source_name + + row_index = list(range(df.height)) + + source_info = [ + f"{self.source_id}{constants.BLOCK_SEPARATOR}row_{i}" for i in row_index + ] + + # add 
source info + df = polars_data_utils.add_source_info( + df, source_info, exclude_columns=tag_columns + ) + + df = polars_data_utils.add_system_tag_column( + df, f"source{constants.FIELD_SEPARATOR}{schema_hash}", source_info + ) + + self._df = df + + self._table_stream = TableStream( + table=self._df.to_arrow(), + tag_columns=self.tag_columns, + source=self, + upstreams=(), + ) + + # Auto-register with global registry + if auto_register: + registry = source_registry or GLOBAL_SOURCE_REGISTRY + registry.register(self.source_id, self) + + @property + def reference(self) -> tuple[str, ...]: + return ("data_frame", f"source_{self._source_name}") + + @property + def df(self) -> "pl.DataFrame": + return self._df + + def source_identity_structure(self) -> Any: + return (self.__class__.__name__, self.tag_columns, self.table_hash) + + def get_all_records( + self, include_system_columns: bool = False + ) -> "pa.Table | None": + return self().as_table(include_source=include_system_columns) + + def forward(self, *streams: cp.Stream) -> cp.Stream: + """ + Load data from file and return a static stream. + + This is called by forward() and creates a fresh snapshot each time. + """ + return self._table_stream + + def source_output_types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + """Return tag and packet types based on provided typespecs.""" + return self._table_stream.types(include_system_tags=include_system_tags) diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py index 6d8b5f1..63442e8 100644 --- a/src/orcapod/core/streams.py +++ b/src/orcapod/core/streams.py @@ -92,9 +92,9 @@ def map_packets( return MapPackets(name_map, drop_unmapped)(self, label=label) # type: ignore def batch( - self, + self: cp.Stream, batch_size: int = 0, - drop_last: bool = False, + drop_partial_batch: bool = False, label: str | None = None, ) -> cp.Stream: """ @@ -103,7 +103,23 @@ def batch( """ from orcapod.core.operators import Batch - return Batch(batch_size=batch_size, drop_last=drop_last)(self, label=label) # type: ignore + return Batch(batch_size=batch_size, drop_partial_batch=drop_partial_batch)( + self, label=label + ) # type: ignore + + def drop_tag_columns( + self: cp.Stream, tag_columns: str | Collection[str], label: str | None = None + ) -> cp.Stream: + from orcapod.core.operators import DropTagColumns + + return DropTagColumns(tag_columns)(self, label=label) + + def drop_packet_columns( + self: cp.Stream, packet_columns: str | Collection[str], label: str | None = None + ) -> cp.Stream: + from orcapod.core.operators import DropPacketColumns + + return DropPacketColumns(packet_columns)(self, label=label) class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase): @@ -111,6 +127,9 @@ class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase A stream that has a unique identity within the pipeline. 
""" + def pop(self) -> cp.Stream: + return self + def __init__( self, execution_engine: cp.ExecutionEngine | None = None, @@ -1156,9 +1175,15 @@ def as_table( ) if sort_by_tags: - output_table = output_table.sort_by( - [(column, "ascending") for column in self.keys()[0]] + # TODO: reimplement using polars natively + output_table = ( + pl.DataFrame(output_table) + .sort(by=[(column, "ascending") for column in self.keys()[0]]) + .to_arrow() ) + # output_table = output_table.sort_by( + # [(column, "ascending") for column in self.keys()[0]] + # ) return output_table @@ -1467,7 +1492,7 @@ def as_table( if self._cached_content_hash_column is None: content_hashes = [] for tag, packet in self.iter_packets(execution_engine=execution_engine): - content_hashes.append(packet.content_hash()) + content_hashes.append(packet.content_hash().to_string()) self._cached_content_hash_column = pa.array( content_hashes, type=pa.large_string() ) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index c2723e7..6518aed 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -1,6 +1,6 @@ from abc import abstractmethod from orcapod.core.kernels import KernelStream, WrappedKernel -from orcapod.core.sources import SourceBase +from orcapod.core.sources.base import SourceBase, InvocationBase from orcapod.core.pods import CachedPod from orcapod.protocols import core_protocols as cp, database_protocols as dbp from orcapod.types import PythonSchema @@ -21,7 +21,7 @@ class NodeBase( - SourceBase, + InvocationBase, ): """ Mixin class for pipeline nodes @@ -36,22 +36,33 @@ def __init__( ): super().__init__(**kwargs) self._cached_stream: KernelStream | None = None - self.input_streams = tuple(input_streams) - self.pipeline_path_prefix = pipeline_path_prefix + self._input_streams = tuple(input_streams) + self._pipeline_path_prefix = pipeline_path_prefix # compute invocation hash - note that empty () is passed into identity_structure to signify # identity structure of invocation with no input streams self.pipeline_node_hash = self.data_context.object_hasher.hash_object( self.identity_structure(()) ).to_string() tag_types, packet_types = self.types(include_system_tags=True) + self.tag_schema_hash = self.data_context.object_hasher.hash_object( tag_types ).to_string() + self.packet_schema_hash = self.data_context.object_hasher.hash_object( packet_types ).to_string() + self.pipeline_database = pipeline_database + @property + def upstreams(self) -> tuple[cp.Stream, ...]: + return self._input_streams + + def track_invocation(self, *streams: cp.Stream, label: str | None = None) -> None: + # Node invocation should not be tracked + return None + @property def contained_kernel(self) -> cp.Kernel: raise NotImplementedError( @@ -71,33 +82,34 @@ def pipeline_path(self) -> tuple[str, ...]: """ ... - def forward(self, *streams: cp.Stream) -> cp.Stream: + def validate_inputs(self, *streams: cp.Stream) -> None: + """Sources take no input streams.""" if len(streams) > 0: raise NotImplementedError( "At this moment, Node does not yet support handling additional input streams." 
) - # TODO: re-evaluate the use here + + def forward(self, *streams: cp.Stream) -> cp.Stream: + # TODO: re-evaluate the use here -- consider semi joining with input streams # super().validate_inputs(*self.input_streams) - return super().forward(*self.input_streams) # type: ignore[return-value] + return super().forward(*self.upstreams) # type: ignore[return-value] - def source_output_types( - self, include_system_tags: bool = False + def kernel_output_types( + self, *streams: cp.Stream, include_system_tags: bool = False ) -> tuple[PythonSchema, PythonSchema]: """ Return the output types of the node. This is used to determine the types of the output streams. """ return self.contained_kernel.output_types( - *self.input_streams, include_system_tags=include_system_tags + *self.upstreams, include_system_tags=include_system_tags ) - def source_identity_structure(self) -> Any: - """ - Return the identity structure of the node. - This is used to compute the invocation hash. - """ + def kernel_identity_structure( + self, streams: Collection[cp.Stream] | None = None + ) -> Any: # construct identity structure from the node's information and the - return self.contained_kernel.identity_structure(self.input_streams) + return self.contained_kernel.identity_structure(self.upstreams) def get_all_records( self, include_system_columns: bool = False @@ -171,12 +183,12 @@ def pipeline_path(self) -> tuple[str, ...]: This is used to store the run-associated tag info. """ return ( - self.pipeline_path_prefix # pipeline ID + self._pipeline_path_prefix # pipeline ID + self.reference # node ID + ( - self.pipeline_node_hash, # pipeline node ID - self.packet_schema_hash, # packet schema ID - self.tag_schema_hash, # tag schema ID + f"node:{self.pipeline_node_hash}", # pipeline node ID + f"packet:{self.packet_schema_hash}", # packet schema ID + f"tag:{self.tag_schema_hash}", # tag schema ID ) ) @@ -232,11 +244,11 @@ def pipeline_path(self) -> tuple[str, ...]: This is used to store the run-associated tag info. """ return ( - self.pipeline_path_prefix # pipeline ID + self._pipeline_path_prefix # pipeline ID + self.reference # node ID + ( - self.pipeline_node_hash, # pipeline node ID - self.tag_schema_hash, # tag schema ID + f"node:{self.pipeline_node_hash}", # pipeline node ID + f"tag:{self.tag_schema_hash}", # tag schema ID ) ) diff --git a/src/orcapod/protocols/core_protocols/streams.py b/src/orcapod/protocols/core_protocols/streams.py index aea9dfd..bada8c9 100644 --- a/src/orcapod/protocols/core_protocols/streams.py +++ b/src/orcapod/protocols/core_protocols/streams.py @@ -358,7 +358,8 @@ def as_table( ... def flow( - self, execution_engine: ExecutionEngine | None = None + self, + execution_engine: ExecutionEngine | None = None, ) -> Collection[tuple[Tag, Packet]]: """ Return the entire stream as a collection of (tag, packet) pairs. @@ -373,7 +374,7 @@ def flow( """ ... - def join(self, other_stream: "Stream") -> "Stream": + def join(self, other_stream: "Stream", label: str | None = None) -> "Stream": """ Join this stream with another stream. @@ -389,7 +390,7 @@ def join(self, other_stream: "Stream") -> "Stream": """ ... - def semi_join(self, other_stream: "Stream") -> "Stream": + def semi_join(self, other_stream: "Stream", label: str | None = None) -> "Stream": """ Perform a semi-join with another stream. @@ -406,7 +407,10 @@ def semi_join(self, other_stream: "Stream") -> "Stream": ... 
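# An illustrative sketch (not part of this diff) of the label-aware chaining
# the join/semi_join signatures above now allow. The DataFrameSource inputs
# and label strings are invented for illustration.

import polars as pl

from orcapod.core.sources import DataFrameSource

scores = DataFrameSource(
    pl.DataFrame({"subject": ["a", "b"], "score": [1.0, 2.0]}),
    tag_columns="subject",
)
groups = DataFrameSource(
    pl.DataFrame({"subject": ["a"], "group": ["ctrl"]}),
    tag_columns="subject",
)

# the optional label attaches a human-readable name to the resulting stream
combined = scores.join(groups, label="scores_with_group")
retained = scores.semi_join(groups, label="subjects_with_group")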
def map_tags( - self, name_map: Mapping[str, str], drop_unmapped: bool = True + self, + name_map: Mapping[str, str], + drop_unmapped: bool = True, + label: str | None = None, ) -> "Stream": """ Map tag names in this stream to new names based on the provided mapping. @@ -414,13 +418,59 @@ def map_tags( ... def map_packets( - self, name_map: Mapping[str, str], drop_unmapped: bool = True + self, + name_map: Mapping[str, str], + drop_unmapped: bool = True, + label: str | None = None, ) -> "Stream": """ Map packet names in this stream to new names based on the provided mapping. """ ... + def drop_tag_columns( + self, tag_columns: str | Collection[str], label: str | None = None + ) -> "Stream": + """ + Drop the specified tag columns from the stream. A ValueError is raised + if one or more specified tag columns do not exist in the stream. + """ + ... + + # TODO: check to make sure source columns are also dropped + def drop_packet_columns( + self, packet_columns: str | Collection[str], label: str | None = None + ) -> "Stream": + """ + Drop the specified packet columns from the stream. A ValueError is raised + if one or more specified packet columns do not exist in the stream. + """ + ... + + def batch( + self, + batch_size: int = 0, + drop_partial_batch: bool = False, + label: str | None = None, + ) -> "Stream": + """ + Batch the stream into groups of the specified size. + + This operation groups (tag, packet) pairs into batches for more + efficient processing. Each batch is represented as a single (tag, packet) + pair where the tag is a list of tags and the packet is a list of packets. + + Args: + batch_size: Number of (tag, packet) pairs per batch. If 0, all + pairs are included in a single batch. + drop_partial_batch: If True, drop the last batch if it has fewer + than batch_size pairs. + + Returns: + Self: New stream containing batched (tag, packet) pairs. + """ + ... + @runtime_checkable class LiveStream(Stream, Protocol): From b476274a21b8516c8b47abc52a935cc6c462e9a5 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Sun, 31 Aug 2025 17:24:10 -0700 Subject: [PATCH 219/224] refactor: repackage core streams into its own subpackage --- src/orcapod/core/streams.py | 1586 ----------------- src/orcapod/core/streams/__init__.py | 16 + src/orcapod/core/streams/base.py | 526 ++++++ .../core/streams/efficient_pod_stream.py | 362 ++++ src/orcapod/core/streams/kernel_stream.py | 189 ++ src/orcapod/core/streams/lazy_pod_stream.py | 228 +++ src/orcapod/core/streams/table_stream.py | 319 ++++ src/orcapod/core/streams/wrapped_stream.py | 86 + 8 files changed, 1726 insertions(+), 1586 deletions(-) delete mode 100644 src/orcapod/core/streams.py create mode 100644 src/orcapod/core/streams/__init__.py create mode 100644 src/orcapod/core/streams/base.py create mode 100644 src/orcapod/core/streams/efficient_pod_stream.py create mode 100644 src/orcapod/core/streams/kernel_stream.py create mode 100644 src/orcapod/core/streams/lazy_pod_stream.py create mode 100644 src/orcapod/core/streams/table_stream.py create mode 100644 src/orcapod/core/streams/wrapped_stream.py diff --git a/src/orcapod/core/streams.py b/src/orcapod/core/streams.py deleted file mode 100644 index 63442e8..0000000 --- a/src/orcapod/core/streams.py +++ /dev/null @@ -1,1586 +0,0 @@ -import logging -from abc import abstractmethod -from collections.abc import Collection, Iterator, Mapping -from datetime import datetime, timezone -from itertools import repeat -from pathlib import Path -from typing import TYPE_CHECKING, Any, cast - -from orcapod import contexts -from orcapod.core.base import LabeledContentIdentifiableBase -from orcapod.core.datagrams import ( - ArrowPacket, - ArrowTag, - DictTag, -) -from orcapod.core.system_constants import constants -from orcapod.protocols import core_protocols as cp -from orcapod.types import PythonSchema -from orcapod.utils import arrow_utils -from orcapod.utils.lazy_module import LazyModule - - -if TYPE_CHECKING: - import pyarrow as pa - import pyarrow.compute as pc - import polars as pl - import pandas as pd - import asyncio -else: - pa = LazyModule("pyarrow") - pc = LazyModule("pyarrow.compute") - pl = LazyModule("polars") - pd = LazyModule("pandas") - asyncio = LazyModule("asyncio") - - -# TODO: consider using this instead of making copy of dicts -# from types import MappingProxyType - -logger = logging.getLogger(__name__) - - -class OperatorStreamBaseMixin: - def join(self, other_stream: cp.Stream, label: str | None = None) -> cp.Stream: - """ - Joins this stream with another stream, returning a new stream that contains - the combined data from both streams. - """ - from orcapod.core.operators import Join - - return Join()(self, other_stream, label=label) # type: ignore - - def semi_join( - self, - other_stream: cp.Stream, - label: str | None = None, - ) -> cp.Stream: - """ - Performs a semi-join with another stream, returning a new stream that contains - only the packets from this stream that have matching tags in the other stream. - """ - from orcapod.core.operators import SemiJoin - - return SemiJoin()(self, other_stream, label=label) # type: ignore - - def map_tags( - self, - name_map: Mapping[str, str], - drop_unmapped: bool = True, - label: str | None = None, - ) -> cp.Stream: - """ - Maps the tags in this stream according to the provided name_map. - If drop_unmapped is True, any tags that are not in the name_map will be dropped. 
- """ - from orcapod.core.operators import MapTags - - return MapTags(name_map, drop_unmapped)(self, label=label) # type: ignore - - def map_packets( - self, - name_map: Mapping[str, str], - drop_unmapped: bool = True, - label: str | None = None, - ) -> cp.Stream: - """ - Maps the packets in this stream according to the provided packet_map. - If drop_unmapped is True, any packets that are not in the packet_map will be dropped. - """ - from orcapod.core.operators import MapPackets - - return MapPackets(name_map, drop_unmapped)(self, label=label) # type: ignore - - def batch( - self: cp.Stream, - batch_size: int = 0, - drop_partial_batch: bool = False, - label: str | None = None, - ) -> cp.Stream: - """ - Batch stream into fixed-size chunks, each of size batch_size. - If drop_last is True, any remaining elements that don't fit into a full batch will be dropped. - """ - from orcapod.core.operators import Batch - - return Batch(batch_size=batch_size, drop_partial_batch=drop_partial_batch)( - self, label=label - ) # type: ignore - - def drop_tag_columns( - self: cp.Stream, tag_columns: str | Collection[str], label: str | None = None - ) -> cp.Stream: - from orcapod.core.operators import DropTagColumns - - return DropTagColumns(tag_columns)(self, label=label) - - def drop_packet_columns( - self: cp.Stream, packet_columns: str | Collection[str], label: str | None = None - ) -> cp.Stream: - from orcapod.core.operators import DropPacketColumns - - return DropPacketColumns(packet_columns)(self, label=label) - - -class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase): - """ - A stream that has a unique identity within the pipeline. - """ - - def pop(self) -> cp.Stream: - return self - - def __init__( - self, - execution_engine: cp.ExecutionEngine | None = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self._last_modified: datetime | None = None - self._set_modified_time() - # note that this is not necessary for Stream protocol, but is provided - # for convenience to resolve semantic types and other context-specific information - self._execution_engine = execution_engine - - @property - def substream_identities(self) -> tuple[str, ...]: - """ - Returns the identities of the substreams that this stream is composed of. - This is used to identify the substreams in the computational graph. - """ - return (self.content_hash().to_hex(),) - - @property - def execution_engine(self) -> cp.ExecutionEngine | None: - """ - Returns the execution engine that is used to execute this stream. - This is typically used to track the execution context of the stream. - """ - return self._execution_engine - - @execution_engine.setter - def execution_engine(self, engine: cp.ExecutionEngine | None) -> None: - """ - Sets the execution engine for the stream. - This is typically used to track the execution context of the stream. - """ - self._execution_engine = engine - - def get_substream(self, substream_id: str) -> cp.Stream: - """ - Returns the substream with the given substream_id. - This is used to retrieve a specific substream from the stream. - """ - if substream_id == self.substream_identities[0]: - return self - else: - raise ValueError(f"Substream with ID {substream_id} not found.") - - @property - @abstractmethod - def source(self) -> cp.Kernel | None: - """ - The source of the stream, which is the kernel that generated the stream. - This is typically used to track the origin of the stream in the computational graph. - """ - ... 
- - @property - @abstractmethod - def upstreams(self) -> tuple[cp.Stream, ...]: - """ - The upstream streams that are used to generate this stream. - This is typically used to track the origin of the stream in the computational graph. - """ - ... - - def computed_label(self) -> str | None: - if self.source is not None: - # use the invocation operation label - return self.source.label - return None - - @abstractmethod - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: ... - - def tag_keys(self, include_system_tags: bool = False) -> tuple[str, ...]: - return self.keys(include_system_tags=include_system_tags)[0] - - def packet_keys(self) -> tuple[str, ...]: - return self.keys()[1] - - @abstractmethod - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: ... - - def tag_types(self, include_system_tags: bool = False) -> PythonSchema: - return self.types(include_system_tags=include_system_tags)[0] - - def packet_types(self) -> PythonSchema: - return self.types()[1] - - @property - def last_modified(self) -> datetime | None: - """ - Returns when the stream's content was last modified. - This is used to track the time when the stream was last accessed. - Returns None if the stream has not been accessed yet. - """ - return self._last_modified - - @property - def is_current(self) -> bool: - """ - Returns whether the stream is current. - A stream is current if the content is up-to-date with respect to its source. - This can be used to determine if a stream with non-None last_modified is up-to-date. - Note that for asynchronous streams, this status is not applicable and always returns False. - """ - if self.last_modified is None: - # If there is no last_modified timestamp, we cannot determine if the stream is current - return False - - # check if the source kernel has been modified - if self.source is not None and ( - self.source.last_modified is None - or self.source.last_modified > self.last_modified - ): - return False - - # check if all upstreams are current - for upstream in self.upstreams: - if ( - not upstream.is_current - or upstream.last_modified is None - or upstream.last_modified > self.last_modified - ): - return False - return True - - def _set_modified_time( - self, timestamp: datetime | None = None, invalidate: bool = False - ) -> None: - if invalidate: - self._last_modified = None - return - - if timestamp is not None: - self._last_modified = timestamp - else: - self._last_modified = datetime.now(timezone.utc) - - def __iter__( - self, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - return self.iter_packets() - - @abstractmethod - def iter_packets( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: ... - - @abstractmethod - def run( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> None: ... - - @abstractmethod - async def run_async( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> None: ... - - @abstractmethod - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": ... 
- - def as_polars_df( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pl.DataFrame | None": - """ - Convert the entire stream to a Polars DataFrame. - """ - return pl.DataFrame( - self.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - ) - - def as_df( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pl.DataFrame | None": - """ - Convert the entire stream to a Polars DataFrame. - """ - return self.as_polars_df( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - - def as_lazy_frame( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pl.LazyFrame | None": - """ - Convert the entire stream to a Polars LazyFrame. - """ - df = self.as_polars_df( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - if df is None: - return None - return df.lazy() - - def as_pandas_df( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - index_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pd.DataFrame | None": - df = self.as_polars_df( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - if df is None: - return None - tag_keys, _ = self.keys() - pdf = df.to_pandas() - if index_by_tags: - pdf = pdf.set_index(list(tag_keys)) - return pdf - - def flow( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Collection[tuple[cp.Tag, cp.Packet]]: - """ - Flow everything through the stream, returning the entire collection of - (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. - """ - return [e for e in self.iter_packets(execution_engine=execution_engine)] - - # def identity_structure(self) -> Any: - # """ - # Identity structure of a stream is deferred to the identity structure - # of the associated invocation, if present. - # A bare stream without invocation has no well-defined identity structure. - # Specialized stream subclasses should override this method to provide more meaningful identity structure - # """ - # ... - - -class StreamBase(StatefulStreamBase): - """ - A stream is a collection of tagged-packets that are generated by an operation. 
- The stream is iterable and can be used to access the packets in the stream. - - A stream has property `invocation` that is an instance of Invocation that generated the stream. - This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). - """ - - def __init__( - self, - source: cp.Kernel | None = None, - upstreams: tuple[cp.Stream, ...] = (), - data_context: str | contexts.DataContext | None = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self._source = source - self._upstreams = upstreams - - # if data context is not provided, use that of the source kernel - if data_context is None and source is not None: - # if source is provided, use its data context - data_context = source.data_context_key - super().__init__(data_context=data_context, **kwargs) - - @property - def source(self) -> cp.Kernel | None: - """ - The source of the stream, which is the kernel that generated the stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._source - - @property - def upstreams(self) -> tuple[cp.Stream, ...]: - """ - The upstream streams that are used to generate this stream. - This is typically used to track the origin of the stream in the computational graph. - """ - return self._upstreams - - def computed_label(self) -> str | None: - if self.source is not None: - # use the invocation operation label - return self.source.label - return None - - # @abstractmethod - # def iter_packets( - # self, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ... - - # @abstractmethod - # def run( - # self, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> None: ... - - # @abstractmethod - # async def run_async( - # self, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> None: ... - - # @abstractmethod - # def as_table( - # self, - # include_data_context: bool = False, - # include_source: bool = False, - # include_system_tags: bool = False, - # include_content_hash: bool | str = False, - # sort_by_tags: bool = True, - # execution_engine: dp.ExecutionEngine | None = None, - # ) -> "pa.Table": ... - - def identity_structure(self) -> Any: - """ - Identity structure of a stream is deferred to the identity structure - of the associated invocation, if present. - A bare stream without invocation has no well-defined identity structure. - Specialized stream subclasses should override this method to provide more meaningful identity structure - """ - if self.source is not None: - # if the stream is generated by an operation, use the identity structure from the invocation - return self.source.identity_structure(self.upstreams) - return super().identity_structure() - - -class ImmutableStream(StreamBase): - """ - A class of stream that is constructed from immutable/constant data and does not change over time. - Consequently, the identity of an unsourced stream should be based on the content of the stream itself. - """ - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._data_content_identity = None - - @abstractmethod - def data_content_identity_structure(self) -> Any: - """ - Returns a hash of the content of the stream. - This is used to identify the content of the stream. - """ - ... 
- - def identity_structure(self) -> Any: - if self.source is not None: - # if the stream is generated by an operation, use the identity structure from the invocation - return self.source.identity_structure(self.upstreams) - # otherwise, use the content of the stream as the identity structure - if self._data_content_identity is None: - self._data_content_identity = self.data_content_identity_structure() - return self._data_content_identity - - -class TableStream(ImmutableStream): - """ - An immutable stream based on a PyArrow Table. - This stream is designed to be used with data that is already in a tabular format, - such as data loaded from a file or database. The columns to be treated as tags are - specified at initialization, and the rest of the columns are treated as packets. - The stream is immutable, meaning that once it is created, it cannot be modified. - This is useful for ensuring that the data in the stream remains consistent and unchanging. - - The types of the tag and packet columns are inferred from the PyArrow Table schema. - """ - - def __init__( - self, - table: "pa.Table", - tag_columns: Collection[str] = (), - system_tag_columns: Collection[str] = (), - source_info: dict[str, str | None] | None = None, - source: cp.Kernel | None = None, - upstreams: tuple[cp.Stream, ...] = (), - **kwargs, - ) -> None: - super().__init__(source=source, upstreams=upstreams, **kwargs) - - data_table, data_context_table = arrow_utils.split_by_column_groups( - table, [constants.CONTEXT_KEY] - ) - if data_table is None: - # TODO: provide better error message - raise ValueError( - "Table must contain at least one column to be used as a stream." - ) - table = data_table - - if data_context_table is None: - data_context_table = pa.table( - { - constants.CONTEXT_KEY: pa.array( - [contexts.get_default_context_key()] * len(table), - pa.large_string(), - ) - } - ) - - prefix_info = {constants.SOURCE_PREFIX: source_info} - - # determine tag columns first and then exclude any source info - self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) - self._system_tag_columns = tuple( - c for c in table.column_names if c.startswith(constants.SYSTEM_TAG_PREFIX) - ) - if len(system_tag_columns) > 0: - # rename system_tag_columns - column_name_map = { - c: f"{constants.SYSTEM_TAG_PREFIX}{c}" for c in system_tag_columns - } - table = table.rename_columns( - [column_name_map.get(c, c) for c in table.column_names] - ) - - self._system_tag_columns += tuple( - f"{constants.SYSTEM_TAG_PREFIX}{c}" for c in system_tag_columns - ) - - self._all_tag_columns = self._tag_columns + self._system_tag_columns - if delta := set(tag_columns) - set(self._tag_columns): - raise ValueError( - f"Specified tag columns {delta} are not present in the table." - ) - table, prefix_tables = arrow_utils.prepare_prefixed_columns( - table, - prefix_info, - exclude_columns=self._all_tag_columns, - ) - # now table should only contain tag columns and packet columns - self._packet_columns = tuple( - c for c in table.column_names if c not in self._all_tag_columns - ) - self._table = table - self._source_info_table = prefix_tables[constants.SOURCE_PREFIX] - self._data_context_table = data_context_table - - if len(self._packet_columns) == 0: - raise ValueError( - "No packet columns found in the table. At least one packet column is required." 
- ) - - tag_schema = pa.schema( - f for f in self._table.schema if f.name in self._tag_columns - ) - system_tag_schema = pa.schema( - f for f in self._table.schema if f.name in self._system_tag_columns - ) - all_tag_schema = arrow_utils.join_arrow_schemas(tag_schema, system_tag_schema) - packet_schema = pa.schema( - f for f in self._table.schema if f.name in self._packet_columns - ) - - self._tag_schema = tag_schema - self._system_tag_schema = system_tag_schema - self._all_tag_schema = all_tag_schema - self._packet_schema = packet_schema - # self._tag_converter = SemanticConverter.from_semantic_schema( - # schemas.SemanticSchema.from_arrow_schema( - # tag_schema, self._data_context.semantic_type_registry - # ) - # ) - # self._packet_converter = SemanticConverter.from_semantic_schema( - # schemas.SemanticSchema.from_arrow_schema( - # packet_schema, self._data_context.semantic_type_registry - # ) - # ) - - self._cached_elements: list[tuple[cp.Tag, ArrowPacket]] | None = None - self._set_modified_time() # set modified time to now - - def data_content_identity_structure(self) -> Any: - """ - Returns a hash of the content of the stream. - This is used to identify the content of the stream. - """ - table_hash = self.data_context.arrow_hasher.hash_table( - self.as_table( - include_data_context=True, include_source=True, include_system_tags=True - ), - ) - return ( - self.__class__.__name__, - table_hash, - self._tag_columns, - ) - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - tag_columns = self._tag_columns - if include_system_tags: - tag_columns += self._system_tag_columns - return tag_columns, self._packet_columns - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - # TODO: consider using MappingProxyType to avoid copying the dicts - converter = self.data_context.type_converter - if include_system_tags: - tag_schema = self._all_tag_schema - else: - tag_schema = self._tag_schema - return ( - converter.arrow_schema_to_python_schema(tag_schema), - converter.arrow_schema_to_python_schema(self._packet_schema), - ) - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - """ - Returns the underlying table representation of the stream. - This is useful for converting the stream to a table format. 
- """ - output_table = self._table - if include_content_hash: - hash_column_name = ( - "_content_hash" - if include_content_hash is True - else include_content_hash - ) - content_hashes = [ - str(packet.content_hash()) for _, packet in self.iter_packets() - ] - output_table = output_table.append_column( - hash_column_name, pa.array(content_hashes, type=pa.large_string()) - ) - if not include_system_tags: - # Check in original implementation - output_table = output_table.drop_columns(list(self._system_tag_columns)) - table_stack = (output_table,) - if include_data_context: - table_stack += (self._data_context_table,) - if include_source: - table_stack += (self._source_info_table,) - - table = arrow_utils.hstack_tables(*table_stack) - - if sort_by_tags: - # TODO: cleanup the sorting tag selection logic - try: - target_tags = ( - self._all_tag_columns if include_system_tags else self._tag_columns - ) - return table.sort_by([(column, "ascending") for column in target_tags]) - except pa.ArrowTypeError: - # If sorting fails, fall back to unsorted table - return table - - return table - - def clear_cache(self) -> None: - """ - Resets the cached elements of the stream. - This is useful for re-iterating over the stream. - """ - self._cached_elements = None - - def iter_packets( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Iterator[tuple[cp.Tag, ArrowPacket]]: - """ - Iterates over the packets in the stream. - Each packet is represented as a tuple of (Tag, Packet). - """ - # TODO: make it work with table batch stream - if self._cached_elements is None: - self._cached_elements = [] - tag_present = len(self._all_tag_columns) > 0 - if tag_present: - tags = self._table.select(self._all_tag_columns) - tag_batches = tags.to_batches() - else: - tag_batches = repeat(DictTag({})) - - # TODO: come back and clean up this logic - - packets = self._table.select(self._packet_columns) - for tag_batch, packet_batch in zip(tag_batches, packets.to_batches()): - for i in range(len(packet_batch)): - if tag_present: - tag = ArrowTag( - tag_batch.slice(i, 1), # type: ignore - data_context=self.data_context, - ) - - else: - tag = cast(DictTag, tag_batch) - - self._cached_elements.append( - ( - tag, - ArrowPacket( - packet_batch.slice(i, 1), - source_info=self._source_info_table.slice( - i, 1 - ).to_pylist()[0], - data_context=self.data_context, - ), - ) - ) - yield from self._cached_elements - - def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: - """ - Runs the stream, which in this case is a no-op since the stream is immutable. - This is typically used to trigger any upstream computation of the stream. - """ - # No-op for immutable streams - pass - - async def run_async( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> None: - """ - Runs the stream asynchronously, which in this case is a no-op since the stream is immutable. - This is typically used to trigger any upstream computation of the stream. - """ - # No-op for immutable streams - pass - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}(table={self._table.column_names}, " - f"tag_columns={self._tag_columns})" - ) - - -class KernelStream(StreamBase): - """ - Recomputable stream that wraps a stream produced by a kernel to provide - an abstraction over the stream, taking the stream's source and upstreams as the basis of - recomputing the stream. - - This stream is used to represent the output of a kernel invocation. 
- """ - - def __init__( - self, - output_stream: cp.Stream | None = None, - source: cp.Kernel | None = None, - upstreams: tuple[ - cp.Stream, ... - ] = (), # if provided, this will override the upstreams of the output_stream - **kwargs, - ) -> None: - if (output_stream is None or output_stream.source is None) and source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - if source is None: - if output_stream is None or output_stream.source is None: - raise ValueError( - "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." - ) - source = output_stream.source - upstreams = upstreams or output_stream.upstreams - - super().__init__(source=source, upstreams=upstreams, **kwargs) - self.kernel = source - self._cached_stream = output_stream - - def clear_cache(self) -> None: - """ - Clears the cached stream. - This is useful for re-processing the stream with the same kernel. - """ - self._cached_stream = None - self._set_modified_time(invalidate=True) - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - tag_types, packet_types = self.kernel.output_types( - *self.upstreams, include_system_tags=include_system_tags - ) - return tuple(tag_types.keys()), tuple(packet_types.keys()) - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - return self.kernel.output_types( - *self.upstreams, include_system_tags=include_system_tags - ) - - @property - def is_current(self) -> bool: - if self._cached_stream is None or not super().is_current: - status = self.refresh() - if not status: # if it failed to update for whatever reason - return False - return True - - def refresh(self, force: bool = False) -> bool: - updated = False - if force or (self._cached_stream is not None and not super().is_current): - self.clear_cache() - - if self._cached_stream is None: - assert self.source is not None, ( - "Stream source must be set to recompute the stream." - ) - self._cached_stream = self.source.forward(*self.upstreams) - self._set_modified_time() - updated = True - - if self._cached_stream is None: - # TODO: use beter error type - raise ValueError( - "Stream could not be updated. Ensure that the source is valid and upstreams are correct." - ) - - return updated - - def invalidate(self) -> None: - """ - Invalidate the stream, marking it as needing recomputation. - This will clear the cached stream and set the last modified time to None. - """ - self.clear_cache() - self._set_modified_time(invalidate=True) - - @property - def last_modified(self) -> datetime | None: - if self._cached_stream is None: - return None - return self._cached_stream.last_modified - - def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." 
- ) - self._cached_stream.run(execution_engine=execution_engine) - - async def run_async( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> None: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - await self._cached_stream.run_async(execution_engine=execution_engine) - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - return self._cached_stream.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - - def iter_packets( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - self.refresh() - assert self._cached_stream is not None, ( - "Stream has not been updated or is empty." - ) - return self._cached_stream.iter_packets(execution_engine=execution_engine) - - def __repr__(self) -> str: - return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" - - -class LazyPodResultStream(StreamBase): - """ - A fixed stream that lazily processes packets from a prepared input stream. - This is what Pod.process() returns - it's static/fixed but efficient. - """ - - def __init__(self, pod: cp.Pod, prepared_stream: cp.Stream, **kwargs): - super().__init__(source=pod, upstreams=(prepared_stream,), **kwargs) - self.pod = pod - self.prepared_stream = prepared_stream - self._set_modified_time() # set modified time to when we obtain the iterator - # capture the immutable iterator from the prepared stream - self._prepared_stream_iterator = prepared_stream.iter_packets() - - # Packet-level caching (from your PodStream) - self._cached_output_packets: dict[int, tuple[cp.Tag, cp.Packet | None]] = {} - self._cached_output_table: pa.Table | None = None - self._cached_content_hash_column: pa.Array | None = None - - def iter_packets( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - if self._prepared_stream_iterator is not None: - for i, (tag, packet) in enumerate(self._prepared_stream_iterator): - if i in self._cached_output_packets: - # Use cached result - tag, packet = self._cached_output_packets[i] - if packet is not None: - yield tag, packet - else: - # Process packet - processed = self.pod.call( - tag, packet, execution_engine=execution_engine - ) - if processed is not None: - # Update shared cache for future iterators (optimization) - self._cached_output_packets[i] = processed - tag, packet = processed - if packet is not None: - yield tag, packet - - # Mark completion by releasing the iterator - self._prepared_stream_iterator = None - else: - # Yield from snapshot of complete cache - for i in range(len(self._cached_output_packets)): - tag, packet = self._cached_output_packets[i] - if packet is not None: - yield tag, packet - - async def run_async( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> None: - if self._prepared_stream_iterator is not None: - pending_call_lut = {} - for i, (tag, packet) in enumerate(self._prepared_stream_iterator): - if i not in 
self._cached_output_packets: - # Process packet - pending_call_lut[i] = self.pod.async_call( - tag, packet, execution_engine=execution_engine - ) - - indices = list(pending_call_lut.keys()) - pending_calls = [pending_call_lut[i] for i in indices] - - results = await asyncio.gather(*pending_calls) - for i, result in zip(indices, results): - self._cached_output_packets[i] = result - - # Mark completion by releasing the iterator - self._prepared_stream_iterator = None - - def run( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> None: - # Fallback to synchronous run - self.flow(execution_engine=execution_engine) - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - - tag_keys, _ = self.prepared_stream.keys(include_system_tags=include_system_tags) - packet_keys = tuple(self.pod.output_packet_types().keys()) - return tag_keys, packet_keys - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, _ = self.prepared_stream.types( - include_system_tags=include_system_tags - ) - # TODO: check if copying can be avoided - packet_typespec = dict(self.pod.output_packet_types()) - return tag_typespec, packet_typespec - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - if self._cached_output_table is None: - all_tags = [] - all_packets = [] - tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets(execution_engine=execution_engine): - if tag_schema is None: - tag_schema = tag.arrow_schema(include_system_tags=True) - if packet_schema is None: - packet_schema = packet.arrow_schema( - include_context=True, - include_source=True, - ) - all_tags.append(tag.as_dict(include_system_tags=True)) - # FIXME: using in the pinch conversion to str from path - # replace with an appropriate semantic converter-based approach! - dict_patcket = packet.as_dict(include_context=True, include_source=True) - for k, v in dict_patcket.items(): - if isinstance(v, Path): - dict_patcket[k] = str(v) - all_packets.append(dict_patcket) - - # TODO: re-verify the implemetation of this conversion - converter = self.data_context.type_converter - - struct_packets = converter.python_dicts_to_struct_dicts(all_packets) - all_tags_as_tables: pa.Table = pa.Table.from_pylist( - all_tags, schema=tag_schema - ) - all_packets_as_tables: pa.Table = pa.Table.from_pylist( - struct_packets, schema=packet_schema - ) - - self._cached_output_table = arrow_utils.hstack_tables( - all_tags_as_tables, all_packets_as_tables - ) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." 
- ) - - drop_columns = [] - if not include_system_tags: - # TODO: get system tags more effiicently - drop_columns.extend( - [ - c - for c in self._cached_output_table.column_names - if c.startswith(constants.SYSTEM_TAG_PREFIX) - ] - ) - if not include_source: - drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) - if not include_data_context: - drop_columns.append(constants.CONTEXT_KEY) - - output_table = self._cached_output_table.drop(drop_columns) - - # lazily prepare content hash column if requested - if include_content_hash: - if self._cached_content_hash_column is None: - content_hashes = [] - # TODO: verify that order will be preserved - for tag, packet in self.iter_packets(): - content_hashes.append(packet.content_hash().to_string()) - self._cached_content_hash_column = pa.array( - content_hashes, type=pa.large_string() - ) - assert self._cached_content_hash_column is not None, ( - "_cached_content_hash_column should not be None here." - ) - hash_column_name = ( - "_content_hash" - if include_content_hash is True - else include_content_hash - ) - output_table = output_table.append_column( - hash_column_name, self._cached_content_hash_column - ) - - if sort_by_tags: - # TODO: reimplement using polars natively - output_table = ( - pl.DataFrame(output_table) - .sort(by=[(column, "ascending") for column in self.keys()[0]]) - .to_arrow() - ) - # output_table = output_table.sort_by( - # [(column, "ascending") for column in self.keys()[0]] - # ) - return output_table - - -class EfficientPodResultStream(StreamBase): - """ - A fixed stream that lazily processes packets from a prepared input stream. - This is what Pod.process() returns - it's static/fixed but efficient. - """ - - # TODO: define interface for storage or pod storage - def __init__(self, pod: cp.CachedPod, input_stream: cp.Stream, **kwargs): - super().__init__(source=pod, upstreams=(input_stream,), **kwargs) - self.pod = pod - self.input_stream = input_stream - self._set_modified_time() # set modified time to when we obtain the iterator - # capture the immutable iterator from the input stream - - self._prepared_stream_iterator = input_stream.iter_packets() - - # Packet-level caching (from your PodStream) - self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None - self._cached_output_table: pa.Table | None = None - self._cached_content_hash_column: pa.Array | None = None - - async def run_async( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> None: - """ - Runs the stream, processing the input stream and preparing the output stream. - This is typically called before iterating over the packets. 
- """ - if self._cached_output_packets is None: - cached_results = [] - - # identify all entries in the input stream for which we still have not computed packets - target_entries = self.input_stream.as_table( - include_content_hash=constants.INPUT_PACKET_HASH, - include_source=True, - include_system_tags=True, - ) - existing_entries = self.pod.get_all_cached_outputs( - include_system_columns=True - ) - if existing_entries is None or existing_entries.num_rows == 0: - missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) - existing = None - else: - all_results = target_entries.join( - existing_entries.append_column( - "_exists", pa.array([True] * len(existing_entries)) - ), - keys=[constants.INPUT_PACKET_HASH], - join_type="left outer", - right_suffix="_right", - ) - # grab all columns from target_entries first - missing = ( - all_results.filter(pc.is_null(pc.field("_exists"))) - .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH]) - ) - - existing = ( - all_results.filter(pc.is_valid(pc.field("_exists"))) - .drop_columns(target_entries.column_names) - .drop_columns(["_exists"]) - ) - renamed = [ - c.removesuffix("_right") if c.endswith("_right") else c - for c in existing.column_names - ] - existing = existing.rename_columns(renamed) - - tag_keys = self.input_stream.keys()[0] - - if existing is not None and existing.num_rows > 0: - # If there are existing entries, we can cache them - existing_stream = TableStream(existing, tag_columns=tag_keys) - for tag, packet in existing_stream.iter_packets(): - cached_results.append((tag, packet)) - - pending_calls = [] - if missing is not None and missing.num_rows > 0: - for tag, packet in TableStream(missing, tag_columns=tag_keys): - # Since these packets are known to be missing, skip the cache lookup - pending = self.pod.async_call( - tag, - packet, - skip_cache_lookup=True, - execution_engine=execution_engine, - ) - pending_calls.append(pending) - import asyncio - - completed_calls = await asyncio.gather(*pending_calls) - for result in completed_calls: - cached_results.append(result) - - self._cached_output_packets = cached_results - self._set_modified_time() - - def run( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> None: - self.flow(execution_engine=execution_engine) - - def iter_packets( - self, execution_engine: cp.ExecutionEngine | None = None - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - """ - Processes the input stream and prepares the output stream. - This is typically called before iterating over the packets. 
- """ - if self._cached_output_packets is None: - cached_results = [] - - # identify all entries in the input stream for which we still have not computed packets - target_entries = self.input_stream.as_table( - include_system_tags=True, - include_source=True, - include_content_hash=constants.INPUT_PACKET_HASH, - execution_engine=execution_engine, - ) - existing_entries = self.pod.get_all_cached_outputs( - include_system_columns=True - ) - if existing_entries is None or existing_entries.num_rows == 0: - missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) - existing = None - else: - # missing = target_entries.join( - # existing_entries, - # keys=[constants.INPUT_PACKET_HASH], - # join_type="left anti", - # ) - # Single join that gives you both missing and existing - # More efficient - only bring the key column from existing_entries - # .select([constants.INPUT_PACKET_HASH]).append_column( - # "_exists", pa.array([True] * len(existing_entries)) - # ), - - # TODO: do more proper replacement operation - target_df = pl.DataFrame(target_entries) - existing_df = pl.DataFrame( - existing_entries.append_column( - "_exists", pa.array([True] * len(existing_entries)) - ) - ) - all_results_df = target_df.join( - existing_df, - on=constants.INPUT_PACKET_HASH, - how="left", - suffix="_right", - ) - all_results = all_results_df.to_arrow() - # all_results = target_entries.join( - # existing_entries.append_column( - # "_exists", pa.array([True] * len(existing_entries)) - # ), - # keys=[constants.INPUT_PACKET_HASH], - # join_type="left outer", - # right_suffix="_right", # rename the existing records in case of collision of output packet keys with input packet keys - # ) - # grab all columns from target_entries first - missing = ( - all_results.filter(pc.is_null(pc.field("_exists"))) - .select(target_entries.column_names) - .drop_columns([constants.INPUT_PACKET_HASH]) - ) - - existing = all_results.filter( - pc.is_valid(pc.field("_exists")) - ).drop_columns( - [ - "_exists", - constants.INPUT_PACKET_HASH, - constants.PACKET_RECORD_ID, - *self.input_stream.keys()[1], # remove the input packet keys - ] - # TODO: look into NOT fetching back the record ID - ) - renamed = [ - c.removesuffix("_right") if c.endswith("_right") else c - for c in existing.column_names - ] - existing = existing.rename_columns(renamed) - - tag_keys = self.input_stream.keys()[0] - - if existing is not None and existing.num_rows > 0: - # If there are existing entries, we can cache them - existing_stream = TableStream(existing, tag_columns=tag_keys) - for tag, packet in existing_stream.iter_packets(): - cached_results.append((tag, packet)) - yield tag, packet - - if missing is not None and missing.num_rows > 0: - for tag, packet in TableStream(missing, tag_columns=tag_keys): - # Since these packets are known to be missing, skip the cache lookup - tag, packet = self.pod.call( - tag, - packet, - skip_cache_lookup=True, - execution_engine=execution_engine, - ) - cached_results.append((tag, packet)) - if packet is not None: - yield tag, packet - - self._cached_output_packets = cached_results - self._set_modified_time() - else: - for tag, packet in self._cached_output_packets: - if packet is not None: - yield tag, packet - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. 
- """ - - tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) - packet_keys = tuple(self.pod.output_packet_types().keys()) - return tag_keys, packet_keys - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - tag_typespec, _ = self.input_stream.types( - include_system_tags=include_system_tags - ) - # TODO: check if copying can be avoided - packet_typespec = dict(self.pod.output_packet_types()) - return tag_typespec, packet_typespec - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - if self._cached_output_table is None: - all_tags = [] - all_packets = [] - tag_schema, packet_schema = None, None - for tag, packet in self.iter_packets(execution_engine=execution_engine): - if tag_schema is None: - tag_schema = tag.arrow_schema(include_system_tags=True) - if packet_schema is None: - packet_schema = packet.arrow_schema( - include_context=True, - include_source=True, - ) - all_tags.append(tag.as_dict(include_system_tags=True)) - # FIXME: using in the pinch conversion to str from path - # replace with an appropriate semantic converter-based approach! - dict_patcket = packet.as_dict(include_context=True, include_source=True) - for k, v in dict_patcket.items(): - if isinstance(v, Path): - dict_patcket[k] = str(v) - all_packets.append(dict_patcket) - - converter = self.data_context.type_converter - - struct_packets = converter.python_dicts_to_struct_dicts(all_packets) - all_tags_as_tables: pa.Table = pa.Table.from_pylist( - all_tags, schema=tag_schema - ) - all_packets_as_tables: pa.Table = pa.Table.from_pylist( - struct_packets, schema=packet_schema - ) - - self._cached_output_table = arrow_utils.hstack_tables( - all_tags_as_tables, all_packets_as_tables - ) - assert self._cached_output_table is not None, ( - "_cached_output_table should not be None here." - ) - - drop_columns = [] - if not include_source: - drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) - if not include_data_context: - drop_columns.append(constants.CONTEXT_KEY) - if not include_system_tags: - # TODO: come up with a more efficient approach - drop_columns.extend( - [ - c - for c in self._cached_output_table.column_names - if c.startswith(constants.SYSTEM_TAG_PREFIX) - ] - ) - - output_table = self._cached_output_table.drop_columns(drop_columns) - - # lazily prepare content hash column if requested - if include_content_hash: - if self._cached_content_hash_column is None: - content_hashes = [] - for tag, packet in self.iter_packets(execution_engine=execution_engine): - content_hashes.append(packet.content_hash().to_string()) - self._cached_content_hash_column = pa.array( - content_hashes, type=pa.large_string() - ) - assert self._cached_content_hash_column is not None, ( - "_cached_content_hash_column should not be None here." - ) - hash_column_name = ( - "_content_hash" - if include_content_hash is True - else include_content_hash - ) - output_table = output_table.append_column( - hash_column_name, self._cached_content_hash_column - ) - - if sort_by_tags: - try: - # TODO: consider having explicit tag/packet properties? 
- output_table = output_table.sort_by( - [(column, "ascending") for column in self.keys()[0]] - ) - except pa.ArrowTypeError: - pass - - return output_table - - -class WrappedStream(StreamBase): - def __init__( - self, - stream: cp.Stream, - source: cp.Kernel, - input_streams: tuple[cp.Stream, ...], - label: str | None = None, - **kwargs, - ) -> None: - super().__init__(source=source, upstreams=input_streams, label=label, **kwargs) - self._stream = stream - - def keys( - self, include_system_tags: bool = False - ) -> tuple[tuple[str, ...], tuple[str, ...]]: - """ - Returns the keys of the tag and packet columns in the stream. - This is useful for accessing the columns in the stream. - """ - return self._stream.keys(include_system_tags=include_system_tags) - - def types( - self, include_system_tags: bool = False - ) -> tuple[PythonSchema, PythonSchema]: - """ - Returns the types of the tag and packet columns in the stream. - This is useful for accessing the types of the columns in the stream. - """ - return self._stream.types(include_system_tags=include_system_tags) - - def as_table( - self, - include_data_context: bool = False, - include_source: bool = False, - include_system_tags: bool = False, - include_content_hash: bool | str = False, - sort_by_tags: bool = True, - execution_engine: cp.ExecutionEngine | None = None, - ) -> "pa.Table": - """ - Returns the underlying table representation of the stream. - This is useful for converting the stream to a table format. - """ - return self._stream.as_table( - include_data_context=include_data_context, - include_source=include_source, - include_system_tags=include_system_tags, - include_content_hash=include_content_hash, - sort_by_tags=sort_by_tags, - execution_engine=execution_engine, - ) - - def iter_packets( - self, - execution_engine: cp.ExecutionEngine | None = None, - ) -> Iterator[tuple[cp.Tag, cp.Packet]]: - """ - Iterates over the packets in the stream. - Each packet is represented as a tuple of (Tag, Packet). 
-        """
-        return self._stream.iter_packets(execution_engine=execution_engine)
-
-    def identity_structure(self) -> Any:
-        return self._stream.identity_structure()
diff --git a/src/orcapod/core/streams/__init__.py b/src/orcapod/core/streams/__init__.py
new file mode 100644
index 0000000..630d32f
--- /dev/null
+++ b/src/orcapod/core/streams/__init__.py
@@ -0,0 +1,16 @@
+from .base import StatefulStreamBase
+from .kernel_stream import KernelStream
+from .table_stream import TableStream
+from .lazy_pod_stream import LazyPodResultStream
+from .efficient_pod_stream import EfficientPodResultStream
+from .wrapped_stream import WrappedStream
+
+
+__all__ = [
+    "StatefulStreamBase",
+    "KernelStream",
+    "TableStream",
+    "LazyPodResultStream",
+    "EfficientPodResultStream",
+    "WrappedStream",
+]
diff --git a/src/orcapod/core/streams/base.py b/src/orcapod/core/streams/base.py
new file mode 100644
index 0000000..0977ef1
--- /dev/null
+++ b/src/orcapod/core/streams/base.py
@@ -0,0 +1,526 @@
+import logging
+from abc import abstractmethod
+from collections.abc import Collection, Iterator, Mapping
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
+
+from orcapod import contexts
+from orcapod.core.base import LabeledContentIdentifiableBase
+from orcapod.protocols import core_protocols as cp
+from orcapod.types import PythonSchema
+from orcapod.utils.lazy_module import LazyModule
+
+
+if TYPE_CHECKING:
+    import pyarrow as pa
+    import pyarrow.compute as pc
+    import polars as pl
+    import pandas as pd
+else:
+    pa = LazyModule("pyarrow")
+    pc = LazyModule("pyarrow.compute")
+    pl = LazyModule("polars")
+    pd = LazyModule("pandas")
+
+
+# TODO: consider using this instead of making copy of dicts
+# from types import MappingProxyType
+
+logger = logging.getLogger(__name__)
+
+
+class OperatorStreamBaseMixin:
+    def join(self, other_stream: cp.Stream, label: str | None = None) -> cp.Stream:
+        """
+        Joins this stream with another stream, returning a new stream that contains
+        the combined data from both streams.
+        """
+        from orcapod.core.operators import Join
+
+        return Join()(self, other_stream, label=label)  # type: ignore
+
+    def semi_join(
+        self,
+        other_stream: cp.Stream,
+        label: str | None = None,
+    ) -> cp.Stream:
+        """
+        Performs a semi-join with another stream, returning a new stream that contains
+        only the packets from this stream that have matching tags in the other stream.
+        """
+        from orcapod.core.operators import SemiJoin
+
+        return SemiJoin()(self, other_stream, label=label)  # type: ignore
+
+    def map_tags(
+        self,
+        name_map: Mapping[str, str],
+        drop_unmapped: bool = True,
+        label: str | None = None,
+    ) -> cp.Stream:
+        """
+        Maps the tags in this stream according to the provided name_map.
+        If drop_unmapped is True, any tags that are not in the name_map will be dropped.
+        """
+        from orcapod.core.operators import MapTags
+
+        return MapTags(name_map, drop_unmapped)(self, label=label)  # type: ignore
+
+    def map_packets(
+        self,
+        name_map: Mapping[str, str],
+        drop_unmapped: bool = True,
+        label: str | None = None,
+    ) -> cp.Stream:
+        """
+        Maps the packet keys in this stream according to the provided name_map.
+        If drop_unmapped is True, any packet keys that are not in the name_map will be dropped.
+        """
+        from orcapod.core.operators import MapPackets
+
+        return MapPackets(name_map, drop_unmapped)(self, label=label)  # type: ignore
+
+    def batch(
+        self: cp.Stream,
+        batch_size: int = 0,
+        drop_partial_batch: bool = False,
+        label: str | None = None,
+    ) -> cp.Stream:
+        """
+        Batch the stream into fixed-size chunks, each of size batch_size.
+        If drop_partial_batch is True, any remaining elements that don't fit into a full batch will be dropped.
+        """
+        from orcapod.core.operators import Batch
+
+        return Batch(batch_size=batch_size, drop_partial_batch=drop_partial_batch)(
+            self, label=label
+        )  # type: ignore
+
+    def drop_tag_columns(
+        self: cp.Stream, tag_columns: str | Collection[str], label: str | None = None
+    ) -> cp.Stream:
+        from orcapod.core.operators import DropTagColumns
+
+        return DropTagColumns(tag_columns)(self, label=label)
+
+    def drop_packet_columns(
+        self: cp.Stream, packet_columns: str | Collection[str], label: str | None = None
+    ) -> cp.Stream:
+        from orcapod.core.operators import DropPacketColumns
+
+        return DropPacketColumns(packet_columns)(self, label=label)
+
+
+class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase):
+    """
+    A stream that has a unique identity within the pipeline.
+    """
+
+    def pop(self) -> cp.Stream:
+        return self
+
+    def __init__(
+        self,
+        execution_engine: cp.ExecutionEngine | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self._last_modified: datetime | None = None
+        self._set_modified_time()
+        # note that this is not necessary for Stream protocol, but is provided
+        # for convenience to resolve semantic types and other context-specific information
+        self._execution_engine = execution_engine
+
+    @property
+    def substream_identities(self) -> tuple[str, ...]:
+        """
+        Returns the identities of the substreams that this stream is composed of.
+        This is used to identify the substreams in the computational graph.
+        """
+        return (self.content_hash().to_hex(),)
+
+    @property
+    def execution_engine(self) -> cp.ExecutionEngine | None:
+        """
+        Returns the execution engine that is used to execute this stream.
+        This is typically used to track the execution context of the stream.
+        """
+        return self._execution_engine
+
+    @execution_engine.setter
+    def execution_engine(self, engine: cp.ExecutionEngine | None) -> None:
+        """
+        Sets the execution engine for the stream.
+        This is typically used to track the execution context of the stream.
+        """
+        self._execution_engine = engine
+
+    def get_substream(self, substream_id: str) -> cp.Stream:
+        """
+        Returns the substream with the given substream_id.
+        This is used to retrieve a specific substream from the stream.
+        """
+        if substream_id == self.substream_identities[0]:
+            return self
+        else:
+            raise ValueError(f"Substream with ID {substream_id} not found.")
+
+    @property
+    @abstractmethod
+    def source(self) -> cp.Kernel | None:
+        """
+        The source of the stream, which is the kernel that generated the stream.
+        This is typically used to track the origin of the stream in the computational graph.
+        """
+        ...
+
+    @property
+    @abstractmethod
+    def upstreams(self) -> tuple[cp.Stream, ...]:
+        """
+        The upstream streams that are used to generate this stream.
+        This is typically used to track the origin of the stream in the computational graph.
+        """
+        ...
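To make the fluent helpers above concrete, here is a rough usage sketch (editorial illustration, not part of the patch). It assumes the TableStream class added later in this patch, and it assumes Join matches rows on their shared tag columns; the column names are invented for the example.

    import pyarrow as pa
    from orcapod.core.streams import TableStream

    # Two small immutable streams keyed by the same tag column ("subject").
    scores = TableStream(
        pa.table({"subject": ["a", "b"], "score": [0.9, 0.7]}),
        tag_columns=["subject"],
    )
    images = TableStream(
        pa.table({"subject": ["a", "b"], "image_path": ["a.png", "b.png"]}),
        tag_columns=["subject"],
    )

    # Each mixin method returns a new stream, so calls can be chained;
    # drop_unmapped=False keeps columns that are not mentioned in the map.
    combined = (
        scores.join(images, label="scores_with_images")
        .map_tags({"subject": "subject_id"}, drop_unmapped=False)
        .map_packets({"image_path": "path"}, drop_unmapped=False)
    )
    print(combined.as_table(sort_by_tags=True))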
+ + def computed_label(self) -> str | None: + if self.source is not None: + # use the invocation operation label + return self.source.label + return None + + @abstractmethod + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: ... + + def tag_keys(self, include_system_tags: bool = False) -> tuple[str, ...]: + return self.keys(include_system_tags=include_system_tags)[0] + + def packet_keys(self) -> tuple[str, ...]: + return self.keys()[1] + + @abstractmethod + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: ... + + def tag_types(self, include_system_tags: bool = False) -> PythonSchema: + return self.types(include_system_tags=include_system_tags)[0] + + def packet_types(self) -> PythonSchema: + return self.types()[1] + + @property + def last_modified(self) -> datetime | None: + """ + Returns when the stream's content was last modified. + This is used to track the time when the stream was last accessed. + Returns None if the stream has not been accessed yet. + """ + return self._last_modified + + @property + def is_current(self) -> bool: + """ + Returns whether the stream is current. + A stream is current if the content is up-to-date with respect to its source. + This can be used to determine if a stream with non-None last_modified is up-to-date. + Note that for asynchronous streams, this status is not applicable and always returns False. + """ + if self.last_modified is None: + # If there is no last_modified timestamp, we cannot determine if the stream is current + return False + + # check if the source kernel has been modified + if self.source is not None and ( + self.source.last_modified is None + or self.source.last_modified > self.last_modified + ): + return False + + # check if all upstreams are current + for upstream in self.upstreams: + if ( + not upstream.is_current + or upstream.last_modified is None + or upstream.last_modified > self.last_modified + ): + return False + return True + + def _set_modified_time( + self, timestamp: datetime | None = None, invalidate: bool = False + ) -> None: + if invalidate: + self._last_modified = None + return + + if timestamp is not None: + self._last_modified = timestamp + else: + self._last_modified = datetime.now(timezone.utc) + + def __iter__( + self, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + return self.iter_packets() + + @abstractmethod + def iter_packets( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: ... + + @abstractmethod + def run( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> None: ... + + @abstractmethod + async def run_async( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> None: ... + + @abstractmethod + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": ... + + def as_polars_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pl.DataFrame | None": + """ + Convert the entire stream to a Polars DataFrame. 
+ """ + return pl.DataFrame( + self.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + ) + + def as_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pl.DataFrame | None": + """ + Convert the entire stream to a Polars DataFrame. + """ + return self.as_polars_df( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + + def as_lazy_frame( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pl.LazyFrame | None": + """ + Convert the entire stream to a Polars LazyFrame. + """ + df = self.as_polars_df( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + if df is None: + return None + return df.lazy() + + def as_pandas_df( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + index_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pd.DataFrame | None": + df = self.as_polars_df( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + if df is None: + return None + tag_keys, _ = self.keys() + pdf = df.to_pandas() + if index_by_tags: + pdf = pdf.set_index(list(tag_keys)) + return pdf + + def flow( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Collection[tuple[cp.Tag, cp.Packet]]: + """ + Flow everything through the stream, returning the entire collection of + (Tag, Packet) as a collection. This will tigger any upstream computation of the stream. + """ + return [e for e in self.iter_packets(execution_engine=execution_engine)] + + # def identity_structure(self) -> Any: + # """ + # Identity structure of a stream is deferred to the identity structure + # of the associated invocation, if present. + # A bare stream without invocation has no well-defined identity structure. + # Specialized stream subclasses should override this method to provide more meaningful identity structure + # """ + # ... + + +class StreamBase(StatefulStreamBase): + """ + A stream is a collection of tagged-packets that are generated by an operation. + The stream is iterable and can be used to access the packets in the stream. + + A stream has property `invocation` that is an instance of Invocation that generated the stream. + This may be None if the stream is not generated by a kernel (i.e. directly instantiated by a user). 
+    """
+
+    def __init__(
+        self,
+        source: cp.Kernel | None = None,
+        upstreams: tuple[cp.Stream, ...] = (),
+        data_context: str | contexts.DataContext | None = None,
+        **kwargs,
+    ) -> None:
+        self._source = source
+        self._upstreams = upstreams
+
+        # if data context is not provided, use that of the source kernel;
+        # defer base-class initialization until the data context is resolved
+        if data_context is None and source is not None:
+            # if source is provided, use its data context
+            data_context = source.data_context_key
+        super().__init__(data_context=data_context, **kwargs)
+
+    @property
+    def source(self) -> cp.Kernel | None:
+        """
+        The source of the stream, which is the kernel that generated the stream.
+        This is typically used to track the origin of the stream in the computational graph.
+        """
+        return self._source
+
+    @property
+    def upstreams(self) -> tuple[cp.Stream, ...]:
+        """
+        The upstream streams that are used to generate this stream.
+        This is typically used to track the origin of the stream in the computational graph.
+        """
+        return self._upstreams
+
+    def computed_label(self) -> str | None:
+        if self.source is not None:
+            # use the invocation operation label
+            return self.source.label
+        return None
+
+    # @abstractmethod
+    # def iter_packets(
+    #     self,
+    #     execution_engine: dp.ExecutionEngine | None = None,
+    # ) -> Iterator[tuple[dp.Tag, dp.Packet]]: ...
+
+    # @abstractmethod
+    # def run(
+    #     self,
+    #     execution_engine: dp.ExecutionEngine | None = None,
+    # ) -> None: ...
+
+    # @abstractmethod
+    # async def run_async(
+    #     self,
+    #     execution_engine: dp.ExecutionEngine | None = None,
+    # ) -> None: ...
+
+    # @abstractmethod
+    # def as_table(
+    #     self,
+    #     include_data_context: bool = False,
+    #     include_source: bool = False,
+    #     include_system_tags: bool = False,
+    #     include_content_hash: bool | str = False,
+    #     sort_by_tags: bool = True,
+    #     execution_engine: dp.ExecutionEngine | None = None,
+    # ) -> "pa.Table": ...
+
+    def identity_structure(self) -> Any:
+        """
+        Identity structure of a stream is deferred to the identity structure
+        of the associated invocation, if present.
+        A bare stream without invocation has no well-defined identity structure.
+        Specialized stream subclasses should override this method to provide more meaningful identity structure
+        """
+        if self.source is not None:
+            # if the stream is generated by an operation, use the identity structure from the invocation
+            return self.source.identity_structure(self.upstreams)
+        return super().identity_structure()
+
+
+class ImmutableStream(StreamBase):
+    """
+    A class of stream that is constructed from immutable/constant data and does not change over time.
+    Consequently, the identity of an unsourced stream should be based on the content of the stream itself.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._data_content_identity = None
+
+    @abstractmethod
+    def data_content_identity_structure(self) -> Any:
+        """
+        Returns a hash of the content of the stream.
+        This is used to identify the content of the stream.
+        """
+        ...
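As a minimal sketch of the identity rules above (again an editorial illustration, not part of the patch): an unsourced ImmutableStream derives its identity from its content via data_content_identity_structure(), whereas a kernel-produced stream inherits its identity from source.identity_structure(upstreams). The example leans on the TableStream subclass introduced later in this patch; the expectation that the two digests match assumes the configured hasher is deterministic and that both streams fall back to the same default source info and data context.

    import pyarrow as pa
    from orcapod.core.streams import TableStream

    tbl = pa.table({"subject": ["a", "b"], "score": [0.9, 0.7]})

    # Neither stream has a source kernel, so identity_structure() falls back to
    # data_content_identity_structure(), which hashes the underlying table.
    s1 = TableStream(tbl, tag_columns=["subject"])
    s2 = TableStream(tbl, tag_columns=["subject"])

    print(s1.content_hash().to_hex())
    print(s2.content_hash().to_hex())  # expected to match the digest above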
+ + def identity_structure(self) -> Any: + if self.source is not None: + # if the stream is generated by an operation, use the identity structure from the invocation + return self.source.identity_structure(self.upstreams) + # otherwise, use the content of the stream as the identity structure + if self._data_content_identity is None: + self._data_content_identity = self.data_content_identity_structure() + return self._data_content_identity diff --git a/src/orcapod/core/streams/efficient_pod_stream.py b/src/orcapod/core/streams/efficient_pod_stream.py new file mode 100644 index 0000000..dca2a90 --- /dev/null +++ b/src/orcapod/core/streams/efficient_pod_stream.py @@ -0,0 +1,362 @@ +import logging +from collections.abc import Iterator +from pathlib import Path +from typing import TYPE_CHECKING + +from orcapod.core.system_constants import constants +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import StreamBase +from orcapod.core.streams.table_stream import TableStream + + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc + import polars as pl + +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + pl = LazyModule("polars") + + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class EfficientPodResultStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. + This is what Pod.process() returns - it's static/fixed but efficient. + """ + + # TODO: define interface for storage or pod storage + def __init__(self, pod: cp.CachedPod, input_stream: cp.Stream, **kwargs): + super().__init__(source=pod, upstreams=(input_stream,), **kwargs) + self.pod = pod + self.input_stream = input_stream + self._set_modified_time() # set modified time to when we obtain the iterator + # capture the immutable iterator from the input stream + + self._prepared_stream_iterator = input_stream.iter_packets() + + # Packet-level caching (from your PodStream) + self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None + self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None + + async def run_async( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> None: + """ + Runs the stream, processing the input stream and preparing the output stream. + This is typically called before iterating over the packets. 
+ """ + if self._cached_output_packets is None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_content_hash=constants.INPUT_PACKET_HASH, + include_source=True, + include_system_tags=True, + ) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + all_results = target_entries.join( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ), + keys=[constants.INPUT_PACKET_HASH], + join_type="left outer", + right_suffix="_right", + ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = ( + all_results.filter(pc.is_valid(pc.field("_exists"))) + .drop_columns(target_entries.column_names) + .drop_columns(["_exists"]) + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + + pending_calls = [] + if missing is not None and missing.num_rows > 0: + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + pending = self.pod.async_call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) + pending_calls.append(pending) + import asyncio + + completed_calls = await asyncio.gather(*pending_calls) + for result in completed_calls: + cached_results.append(result) + + self._cached_output_packets = cached_results + self._set_modified_time() + + def run( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> None: + self.flow(execution_engine=execution_engine) + + def iter_packets( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + """ + Processes the input stream and prepares the output stream. + This is typically called before iterating over the packets. 
+ """ + if self._cached_output_packets is None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_system_tags=True, + include_source=True, + include_content_hash=constants.INPUT_PACKET_HASH, + execution_engine=execution_engine, + ) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + # missing = target_entries.join( + # existing_entries, + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left anti", + # ) + # Single join that gives you both missing and existing + # More efficient - only bring the key column from existing_entries + # .select([constants.INPUT_PACKET_HASH]).append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + + # TODO: do more proper replacement operation + target_df = pl.DataFrame(target_entries) + existing_df = pl.DataFrame( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ) + ) + all_results_df = target_df.join( + existing_df, + on=constants.INPUT_PACKET_HASH, + how="left", + suffix="_right", + ) + all_results = all_results_df.to_arrow() + # all_results = target_entries.join( + # existing_entries.append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left outer", + # right_suffix="_right", # rename the existing records in case of collision of output packet keys with input packet keys + # ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = all_results.filter( + pc.is_valid(pc.field("_exists")) + ).drop_columns( + [ + "_exists", + constants.INPUT_PACKET_HASH, + constants.PACKET_RECORD_ID, + *self.input_stream.keys()[1], # remove the input packet keys + ] + # TODO: look into NOT fetching back the record ID + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + yield tag, packet + + if missing is not None and missing.num_rows > 0: + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + tag, packet = self.pod.call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) + cached_results.append((tag, packet)) + if packet is not None: + yield tag, packet + + self._cached_output_packets = cached_results + self._set_modified_time() + else: + for tag, packet in self._cached_output_packets: + if packet is not None: + yield tag, packet + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. 
+ """ + + tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) + packet_keys = tuple(self.pod.output_packet_types().keys()) + return tag_keys, packet_keys + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, _ = self.input_stream.types( + include_system_tags=include_system_tags + ) + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_packet_types()) + return tag_typespec, packet_typespec + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets(execution_engine=execution_engine): + if tag_schema is None: + tag_schema = tag.arrow_schema(include_system_tags=True) + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_context=True, + include_source=True, + ) + all_tags.append(tag.as_dict(include_system_tags=True)) + # FIXME: using in the pinch conversion to str from path + # replace with an appropriate semantic converter-based approach! + dict_patcket = packet.as_dict(include_context=True, include_source=True) + for k, v in dict_patcket.items(): + if isinstance(v, Path): + dict_patcket[k] = str(v) + all_packets.append(dict_patcket) + + converter = self.data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema + ) + all_packets_as_tables: pa.Table = pa.Table.from_pylist( + struct_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." + ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + if not include_system_tags: + # TODO: come up with a more efficient approach + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + + output_table = self._cached_output_table.drop_columns(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(execution_engine=execution_engine): + content_hashes.append(packet.content_hash().to_string()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + + if sort_by_tags: + try: + # TODO: consider having explicit tag/packet properties? 
+ output_table = output_table.sort_by( + [(column, "ascending") for column in self.keys()[0]] + ) + except pa.ArrowTypeError: + pass + + return output_table diff --git a/src/orcapod/core/streams/kernel_stream.py b/src/orcapod/core/streams/kernel_stream.py new file mode 100644 index 0000000..40908db --- /dev/null +++ b/src/orcapod/core/streams/kernel_stream.py @@ -0,0 +1,189 @@ +import logging +from collections.abc import Iterator +from datetime import datetime +from typing import TYPE_CHECKING + +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import StreamBase + + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc + import polars as pl + import pandas as pd + import asyncio +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + pl = LazyModule("polars") + pd = LazyModule("pandas") + asyncio = LazyModule("asyncio") + + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class KernelStream(StreamBase): + """ + Recomputable stream that wraps a stream produced by a kernel to provide + an abstraction over the stream, taking the stream's source and upstreams as the basis of + recomputing the stream. + + This stream is used to represent the output of a kernel invocation. + """ + + def __init__( + self, + output_stream: cp.Stream | None = None, + source: cp.Kernel | None = None, + upstreams: tuple[ + cp.Stream, ... + ] = (), # if provided, this will override the upstreams of the output_stream + **kwargs, + ) -> None: + if (output_stream is None or output_stream.source is None) and source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + if source is None: + if output_stream is None or output_stream.source is None: + raise ValueError( + "Either output_stream must have a kernel assigned to it or source must be provided in order to be recomputable." + ) + source = output_stream.source + upstreams = upstreams or output_stream.upstreams + + super().__init__(source=source, upstreams=upstreams, **kwargs) + self.kernel = source + self._cached_stream = output_stream + + def clear_cache(self) -> None: + """ + Clears the cached stream. + This is useful for re-processing the stream with the same kernel. + """ + self._cached_stream = None + self._set_modified_time(invalidate=True) + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + tag_types, packet_types = self.kernel.output_types( + *self.upstreams, include_system_tags=include_system_tags + ) + return tuple(tag_types.keys()), tuple(packet_types.keys()) + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. 
+ """ + return self.kernel.output_types( + *self.upstreams, include_system_tags=include_system_tags + ) + + @property + def is_current(self) -> bool: + if self._cached_stream is None or not super().is_current: + status = self.refresh() + if not status: # if it failed to update for whatever reason + return False + return True + + def refresh(self, force: bool = False) -> bool: + updated = False + if force or (self._cached_stream is not None and not super().is_current): + self.clear_cache() + + if self._cached_stream is None: + assert self.source is not None, ( + "Stream source must be set to recompute the stream." + ) + self._cached_stream = self.source.forward(*self.upstreams) + self._set_modified_time() + updated = True + + if self._cached_stream is None: + # TODO: use beter error type + raise ValueError( + "Stream could not be updated. Ensure that the source is valid and upstreams are correct." + ) + + return updated + + def invalidate(self) -> None: + """ + Invalidate the stream, marking it as needing recomputation. + This will clear the cached stream and set the last modified time to None. + """ + self.clear_cache() + self._set_modified_time(invalidate=True) + + @property + def last_modified(self) -> datetime | None: + if self._cached_stream is None: + return None + return self._cached_stream.last_modified + + def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + self._cached_stream.run(execution_engine=execution_engine) + + async def run_async( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> None: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + await self._cached_stream.run_async(execution_engine=execution_engine) + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." + ) + return self._cached_stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + + def iter_packets( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + self.refresh() + assert self._cached_stream is not None, ( + "Stream has not been updated or is empty." 
+ ) + return self._cached_stream.iter_packets(execution_engine=execution_engine) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(kernel={self.source}, upstreams={self.upstreams})" diff --git a/src/orcapod/core/streams/lazy_pod_stream.py b/src/orcapod/core/streams/lazy_pod_stream.py new file mode 100644 index 0000000..a80cd97 --- /dev/null +++ b/src/orcapod/core/streams/lazy_pod_stream.py @@ -0,0 +1,228 @@ +import logging +from collections.abc import Iterator +from pathlib import Path +from typing import TYPE_CHECKING + +from orcapod.core.system_constants import constants +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import StreamBase + + +if TYPE_CHECKING: + import pyarrow as pa + import polars as pl + import asyncio +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + asyncio = LazyModule("asyncio") + + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class LazyPodResultStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. + This is what Pod.process() returns - it's static/fixed but efficient. + """ + + def __init__(self, pod: cp.Pod, prepared_stream: cp.Stream, **kwargs): + super().__init__(source=pod, upstreams=(prepared_stream,), **kwargs) + self.pod = pod + self.prepared_stream = prepared_stream + self._set_modified_time() # set modified time to when we obtain the iterator + # capture the immutable iterator from the prepared stream + self._prepared_stream_iterator = prepared_stream.iter_packets() + + # Packet-level caching (from your PodStream) + self._cached_output_packets: dict[int, tuple[cp.Tag, cp.Packet | None]] = {} + self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None + + def iter_packets( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + if self._prepared_stream_iterator is not None: + for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + if i in self._cached_output_packets: + # Use cached result + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + else: + # Process packet + processed = self.pod.call( + tag, packet, execution_engine=execution_engine + ) + if processed is not None: + # Update shared cache for future iterators (optimization) + self._cached_output_packets[i] = processed + tag, packet = processed + if packet is not None: + yield tag, packet + + # Mark completion by releasing the iterator + self._prepared_stream_iterator = None + else: + # Yield from snapshot of complete cache + for i in range(len(self._cached_output_packets)): + tag, packet = self._cached_output_packets[i] + if packet is not None: + yield tag, packet + + async def run_async( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> None: + if self._prepared_stream_iterator is not None: + pending_call_lut = {} + for i, (tag, packet) in enumerate(self._prepared_stream_iterator): + if i not in self._cached_output_packets: + # Process packet + pending_call_lut[i] = self.pod.async_call( + tag, packet, execution_engine=execution_engine + ) + + indices = list(pending_call_lut.keys()) + pending_calls = [pending_call_lut[i] for i in indices] + + results = await asyncio.gather(*pending_calls) + for 
i, result in zip(indices, results): + self._cached_output_packets[i] = result + + # Mark completion by releasing the iterator + self._prepared_stream_iterator = None + + def run( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> None: + # Fallback to synchronous run + self.flow(execution_engine=execution_engine) + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + + tag_keys, _ = self.prepared_stream.keys(include_system_tags=include_system_tags) + packet_keys = tuple(self.pod.output_packet_types().keys()) + return tag_keys, packet_keys + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, _ = self.prepared_stream.types( + include_system_tags=include_system_tags + ) + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_packet_types()) + return tag_typespec, packet_typespec + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets(execution_engine=execution_engine): + if tag_schema is None: + tag_schema = tag.arrow_schema(include_system_tags=True) + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_context=True, + include_source=True, + ) + all_tags.append(tag.as_dict(include_system_tags=True)) + # FIXME: using in the pinch conversion to str from path + # replace with an appropriate semantic converter-based approach! + dict_patcket = packet.as_dict(include_context=True, include_source=True) + for k, v in dict_patcket.items(): + if isinstance(v, Path): + dict_patcket[k] = str(v) + all_packets.append(dict_patcket) + + # TODO: re-verify the implemetation of this conversion + converter = self.data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema + ) + all_packets_as_tables: pa.Table = pa.Table.from_pylist( + struct_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." 
+ ) + + drop_columns = [] + if not include_system_tags: + # TODO: get system tags more effiicently + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + + output_table = self._cached_output_table.drop(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + # TODO: verify that order will be preserved + for tag, packet in self.iter_packets(): + content_hashes.append(packet.content_hash().to_string()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + + if sort_by_tags: + # TODO: reimplement using polars natively + output_table = ( + pl.DataFrame(output_table) + .sort(by=self.keys()[0], descending=False) + .to_arrow() + ) + # output_table = output_table.sort_by( + # [(column, "ascending") for column in self.keys()[0]] + # ) + return output_table diff --git a/src/orcapod/core/streams/table_stream.py b/src/orcapod/core/streams/table_stream.py new file mode 100644 index 0000000..51cd466 --- /dev/null +++ b/src/orcapod/core/streams/table_stream.py @@ -0,0 +1,319 @@ +import logging +from collections.abc import Collection, Iterator +from itertools import repeat +from typing import TYPE_CHECKING, Any, cast + +from orcapod import contexts +from orcapod.core.datagrams import ( + ArrowPacket, + ArrowTag, + DictTag, +) +from orcapod.core.system_constants import constants +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import ImmutableStream + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc + import polars as pl + import pandas as pd +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + pl = LazyModule("polars") + pd = LazyModule("pandas") + +logger = logging.getLogger(__name__) + + +class TableStream(ImmutableStream): + """ + An immutable stream based on a PyArrow Table. + This stream is designed to be used with data that is already in a tabular format, + such as data loaded from a file or database. The columns to be treated as tags are + specified at initialization, and the rest of the columns are treated as packets. + The stream is immutable, meaning that once it is created, it cannot be modified. + This is useful for ensuring that the data in the stream remains consistent and unchanging. + + The types of the tag and packet columns are inferred from the PyArrow Table schema. + """ + + def __init__( + self, + table: "pa.Table", + tag_columns: Collection[str] = (), + system_tag_columns: Collection[str] = (), + source_info: dict[str, str | None] | None = None, + source: cp.Kernel | None = None, + upstreams: tuple[cp.Stream, ...] 
= (), + **kwargs, + ) -> None: + super().__init__(source=source, upstreams=upstreams, **kwargs) + + data_table, data_context_table = arrow_utils.split_by_column_groups( + table, [constants.CONTEXT_KEY] + ) + if data_table is None: + # TODO: provide better error message + raise ValueError( + "Table must contain at least one column to be used as a stream." + ) + table = data_table + + if data_context_table is None: + data_context_table = pa.table( + { + constants.CONTEXT_KEY: pa.array( + [contexts.get_default_context_key()] * len(table), + pa.large_string(), + ) + } + ) + + prefix_info = {constants.SOURCE_PREFIX: source_info} + + # determine tag columns first and then exclude any source info + self._tag_columns = tuple(c for c in tag_columns if c in table.column_names) + self._system_tag_columns = tuple( + c for c in table.column_names if c.startswith(constants.SYSTEM_TAG_PREFIX) + ) + if len(system_tag_columns) > 0: + # rename system_tag_columns + column_name_map = { + c: f"{constants.SYSTEM_TAG_PREFIX}{c}" for c in system_tag_columns + } + table = table.rename_columns( + [column_name_map.get(c, c) for c in table.column_names] + ) + + self._system_tag_columns += tuple( + f"{constants.SYSTEM_TAG_PREFIX}{c}" for c in system_tag_columns + ) + + self._all_tag_columns = self._tag_columns + self._system_tag_columns + if delta := set(tag_columns) - set(self._tag_columns): + raise ValueError( + f"Specified tag columns {delta} are not present in the table." + ) + table, prefix_tables = arrow_utils.prepare_prefixed_columns( + table, + prefix_info, + exclude_columns=self._all_tag_columns, + ) + # now table should only contain tag columns and packet columns + self._packet_columns = tuple( + c for c in table.column_names if c not in self._all_tag_columns + ) + self._table = table + self._source_info_table = prefix_tables[constants.SOURCE_PREFIX] + self._data_context_table = data_context_table + + if len(self._packet_columns) == 0: + raise ValueError( + "No packet columns found in the table. At least one packet column is required." + ) + + tag_schema = pa.schema( + f for f in self._table.schema if f.name in self._tag_columns + ) + system_tag_schema = pa.schema( + f for f in self._table.schema if f.name in self._system_tag_columns + ) + all_tag_schema = arrow_utils.join_arrow_schemas(tag_schema, system_tag_schema) + packet_schema = pa.schema( + f for f in self._table.schema if f.name in self._packet_columns + ) + + self._tag_schema = tag_schema + self._system_tag_schema = system_tag_schema + self._all_tag_schema = all_tag_schema + self._packet_schema = packet_schema + # self._tag_converter = SemanticConverter.from_semantic_schema( + # schemas.SemanticSchema.from_arrow_schema( + # tag_schema, self._data_context.semantic_type_registry + # ) + # ) + # self._packet_converter = SemanticConverter.from_semantic_schema( + # schemas.SemanticSchema.from_arrow_schema( + # packet_schema, self._data_context.semantic_type_registry + # ) + # ) + + self._cached_elements: list[tuple[cp.Tag, ArrowPacket]] | None = None + self._set_modified_time() # set modified time to now + + def data_content_identity_structure(self) -> Any: + """ + Returns a hash of the content of the stream. + This is used to identify the content of the stream. 
+ """ + table_hash = self.data_context.arrow_hasher.hash_table( + self.as_table( + include_data_context=True, include_source=True, include_system_tags=True + ), + ) + return ( + self.__class__.__name__, + table_hash, + self._tag_columns, + ) + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + tag_columns = self._tag_columns + if include_system_tags: + tag_columns += self._system_tag_columns + return tag_columns, self._packet_columns + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. + """ + # TODO: consider using MappingProxyType to avoid copying the dicts + converter = self.data_context.type_converter + if include_system_tags: + tag_schema = self._all_tag_schema + else: + tag_schema = self._tag_schema + return ( + converter.arrow_schema_to_python_schema(tag_schema), + converter.arrow_schema_to_python_schema(self._packet_schema), + ) + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": + """ + Returns the underlying table representation of the stream. + This is useful for converting the stream to a table format. + """ + output_table = self._table + if include_content_hash: + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + content_hashes = [ + str(packet.content_hash()) for _, packet in self.iter_packets() + ] + output_table = output_table.append_column( + hash_column_name, pa.array(content_hashes, type=pa.large_string()) + ) + if not include_system_tags: + # Check in original implementation + output_table = output_table.drop_columns(list(self._system_tag_columns)) + table_stack = (output_table,) + if include_data_context: + table_stack += (self._data_context_table,) + if include_source: + table_stack += (self._source_info_table,) + + table = arrow_utils.hstack_tables(*table_stack) + + if sort_by_tags: + # TODO: cleanup the sorting tag selection logic + try: + target_tags = ( + self._all_tag_columns if include_system_tags else self._tag_columns + ) + return table.sort_by([(column, "ascending") for column in target_tags]) + except pa.ArrowTypeError: + # If sorting fails, fall back to unsorted table + return table + + return table + + def clear_cache(self) -> None: + """ + Resets the cached elements of the stream. + This is useful for re-iterating over the stream. + """ + self._cached_elements = None + + def iter_packets( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Iterator[tuple[cp.Tag, ArrowPacket]]: + """ + Iterates over the packets in the stream. + Each packet is represented as a tuple of (Tag, Packet). 
+ """ + # TODO: make it work with table batch stream + if self._cached_elements is None: + self._cached_elements = [] + tag_present = len(self._all_tag_columns) > 0 + if tag_present: + tags = self._table.select(self._all_tag_columns) + tag_batches = tags.to_batches() + else: + tag_batches = repeat(DictTag({})) + + # TODO: come back and clean up this logic + + packets = self._table.select(self._packet_columns) + for tag_batch, packet_batch in zip(tag_batches, packets.to_batches()): + for i in range(len(packet_batch)): + if tag_present: + tag = ArrowTag( + tag_batch.slice(i, 1), # type: ignore + data_context=self.data_context, + ) + + else: + tag = cast(DictTag, tag_batch) + + self._cached_elements.append( + ( + tag, + ArrowPacket( + packet_batch.slice(i, 1), + source_info=self._source_info_table.slice( + i, 1 + ).to_pylist()[0], + data_context=self.data_context, + ), + ) + ) + yield from self._cached_elements + + def run(self, execution_engine: cp.ExecutionEngine | None = None) -> None: + """ + Runs the stream, which in this case is a no-op since the stream is immutable. + This is typically used to trigger any upstream computation of the stream. + """ + # No-op for immutable streams + pass + + async def run_async( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> None: + """ + Runs the stream asynchronously, which in this case is a no-op since the stream is immutable. + This is typically used to trigger any upstream computation of the stream. + """ + # No-op for immutable streams + pass + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(table={self._table.column_names}, " + f"tag_columns={self._tag_columns})" + ) diff --git a/src/orcapod/core/streams/wrapped_stream.py b/src/orcapod/core/streams/wrapped_stream.py new file mode 100644 index 0000000..3f14203 --- /dev/null +++ b/src/orcapod/core/streams/wrapped_stream.py @@ -0,0 +1,86 @@ +import logging +from collections.abc import Iterator +from typing import TYPE_CHECKING, Any + +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import StreamBase + + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = logging.getLogger(__name__) + + +class WrappedStream(StreamBase): + def __init__( + self, + stream: cp.Stream, + source: cp.Kernel, + input_streams: tuple[cp.Stream, ...], + label: str | None = None, + **kwargs, + ) -> None: + super().__init__(source=source, upstreams=input_streams, label=label, **kwargs) + self._stream = stream + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. + """ + return self._stream.keys(include_system_tags=include_system_tags) + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + """ + Returns the types of the tag and packet columns in the stream. + This is useful for accessing the types of the columns in the stream. 
+ """ + return self._stream.types(include_system_tags=include_system_tags) + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": + """ + Returns the underlying table representation of the stream. + This is useful for converting the stream to a table format. + """ + return self._stream.as_table( + include_data_context=include_data_context, + include_source=include_source, + include_system_tags=include_system_tags, + include_content_hash=include_content_hash, + sort_by_tags=sort_by_tags, + execution_engine=execution_engine, + ) + + def iter_packets( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + """ + Iterates over the packets in the stream. + Each packet is represented as a tuple of (Tag, Packet). + """ + return self._stream.iter_packets(execution_engine=execution_engine) + + def identity_structure(self) -> Any: + return self._stream.identity_structure() From f2472bfce30a520fd94c2893dde44bd0b5ff8b29 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 31 Aug 2025 18:18:21 -0700 Subject: [PATCH 220/224] feat: add pipeline dag plotting --- pyproject.toml | 1 + src/orcapod/pipeline/graph.py | 217 ++++++++++++++++++++++++++++++++++ uv.lock | 11 ++ 3 files changed, 229 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 14996e0..5c243d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "beartype>=0.21.0", "deltalake>=1.0.2", "selection-pipeline", + "graphviz>=0.21", ] readme = "README.md" requires-python = ">=3.11.0" diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 666b66e..9fcd198 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -6,8 +6,17 @@ from orcapod.protocols import database_protocols as dbp from typing import Any from collections.abc import Collection +import os +import tempfile import logging import asyncio +from typing import TYPE_CHECKING +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import networkx as nx +else: + nx = LazyModule("networkx") def synchronous_run(async_func, *args, **kwargs): @@ -109,11 +118,16 @@ def compile(self) -> None: invocation_to_stream_lut = {} G = self.generate_graph() + node_graph = nx.DiGraph() for invocation in nx.topological_sort(G): input_streams = [ invocation_to_stream_lut[parent] for parent in invocation.parents() ] + node = self.wrap_invocation(invocation, new_input_streams=input_streams) + for parent in node.upstreams: + node_graph.add_edge(parent.source, node) + invocation_to_stream_lut[invocation] = node() name_candidates.setdefault(node.label, []).append(node) @@ -127,6 +141,13 @@ def compile(self) -> None: else: self.nodes[label] = nodes[0] + self.label_lut = {v: k for k, v in self.nodes.items()} + + self.graph = node_graph + + def show_graph(self, **kwargs) -> None: + render_graph(self.graph, self.label_lut, **kwargs) + def run( self, execution_engine: cp.ExecutionEngine | None = None, @@ -217,3 +238,199 @@ def rename(self, old_name: str, new_name: str) -> None: node.label = new_name self.nodes[new_name] = node logger.info(f"Node '{old_name}' renamed to '{new_name}'") + + +# import networkx as nx +# # import graphviz +# import matplotlib.pyplot as plt +# import matplotlib.image as mpimg +# import tempfile +# import os + + 
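For context on how the renderer defined below is meant to be driven: the pipeline's show_graph() above simply forwards its compiled node graph and label lookup table to the module-level render_graph helper. A minimal usage sketch follows; the import path assumes the module is importable as orcapod.pipeline.graph, the node names and labels are illustrative only, and graphviz (plus the system Graphviz binaries) must be installed for the non-raw path.

import networkx as nx

from orcapod.pipeline.graph import render_graph  # helper added in this patch

g = nx.DiGraph()
g.add_edge("load", "filter")
g.add_edge("filter", "summarize")

# raw_output=True skips matplotlib rendering and just returns the DOT source,
# which is convenient for tests or for feeding an external Graphviz install.
dot_source = render_graph(
    g,
    label_lut={"load": "Load CSV", "filter": "Filter rows", "summarize": "Summarize"},
    show=False,
    raw_output=True,
)
print(dot_source)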
+class GraphRenderer: + """Simple renderer for NetworkX graphs using Graphviz DOT format""" + + def __init__(self): + """Initialize the renderer""" + pass + + def _sanitize_node_id(self, node_id: Any) -> str: + """Convert node_id to a valid DOT identifier using hash""" + return f"node_{hash(node_id)}" + + def _get_node_label( + self, node_id: Any, label_lut: dict[Any, str] | None = None + ) -> str: + """Get label for a node""" + if label_lut and node_id in label_lut: + return label_lut[node_id] + return str(node_id) + + def generate_dot( + self, + graph: "nx.DiGraph", + label_lut: dict[Any, str] | None = None, + rankdir: str = "TB", + node_shape: str = "box", + node_style: str = "filled", + node_color: str = "lightblue", + edge_color: str = "black", + dpi: int = 150, + ) -> str: + """ + Generate DOT syntax from NetworkX graph + + Args: + graph: NetworkX DiGraph to render + label_lut: Optional dictionary mapping node_id -> display_label + rankdir: Graph direction ('TB', 'BT', 'LR', 'RL') + node_shape: Shape for all nodes + node_style: Style for all nodes + node_color: Fill color for all nodes + edge_color: Color for all edges + dpi: Resolution for rendered image (default 150) + + Returns: + DOT format string + """ + try: + import graphviz + except ImportError as e: + raise ImportError( + "Graphviz is not installed. Please install graphviz to render graph of the pipeline." + ) from e + + dot = graphviz.Digraph(comment="NetworkX Graph") + + # Set graph attributes + dot.attr(rankdir=rankdir, dpi=str(dpi)) + dot.attr("node", shape=node_shape, style=node_style, fillcolor=node_color) + dot.attr("edge", color=edge_color) + + # Add nodes + for node_id in graph.nodes(): + sanitized_id = self._sanitize_node_id(node_id) + label = self._get_node_label(node_id, label_lut) + dot.node(sanitized_id, label=label) + + # Add edges + for source, target in graph.edges(): + source_id = self._sanitize_node_id(source) + target_id = self._sanitize_node_id(target) + dot.edge(source_id, target_id) + + return dot.source + + def render_graph( + self, + graph: nx.DiGraph, + label_lut: dict[Any, str] | None = None, + show: bool = True, + output_path: str | None = None, + raw_output: bool = False, + rankdir: str = "TB", + figsize: tuple = (6, 4), + dpi: int = 150, + **style_kwargs, + ) -> str | None: + """ + Render NetworkX graph using Graphviz + + Args: + graph: NetworkX DiGraph to render + label_lut: Optional dictionary mapping node_id -> display_label + show: Display the graph using matplotlib + output_path: Save graph to file (e.g., 'graph.png', 'graph.pdf') + raw_output: Return DOT syntax instead of rendering + rankdir: Graph direction ('TB', 'BT', 'LR', 'RL') + figsize: Figure size for matplotlib display + dpi: Resolution for rendered image (default 150) + **style_kwargs: Additional styling (node_color, edge_color, node_shape, etc.) + + Returns: + DOT syntax if raw_output=True, None otherwise + """ + try: + import graphviz + except ImportError as e: + raise ImportError( + "Graphviz is not installed. Please install graphviz to render graph of the pipeline." 
+ ) from e + + if raw_output: + return self.generate_dot(graph, label_lut, rankdir, dpi=dpi, **style_kwargs) + + # Create Graphviz object + dot = graphviz.Digraph(comment="NetworkX Graph") + dot.attr(rankdir=rankdir, dpi=str(dpi)) + + # Apply styling + node_shape = style_kwargs.get("node_shape", "box") + node_style = style_kwargs.get("node_style", "filled") + node_color = style_kwargs.get("node_color", "lightblue") + edge_color = style_kwargs.get("edge_color", "black") + + dot.attr("node", shape=node_shape, style=node_style, fillcolor=node_color) + dot.attr("edge", color=edge_color) + + # Add nodes with labels + for node_id in graph.nodes(): + sanitized_id = self._sanitize_node_id(node_id) + label = self._get_node_label(node_id, label_lut) + dot.node(sanitized_id, label=label) + + # Add edges + for source, target in graph.edges(): + source_id = self._sanitize_node_id(source) + target_id = self._sanitize_node_id(target) + dot.edge(source_id, target_id) + + # Handle output + if output_path: + # Save to file + name, ext = os.path.splitext(output_path) + format_type = ext[1:] if ext else "png" + dot.render(name, format=format_type, cleanup=True) + print(f"Graph saved to {output_path}") + + if show: + # Display with matplotlib + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + dot.render(tmp.name[:-4], format="png", cleanup=True) + + import matplotlib.pyplot as plt + import matplotlib.image as mpimg + + # Display with matplotlib + img = mpimg.imread(tmp.name) + plt.figure(figsize=figsize) + plt.imshow(img) + plt.axis("off") + plt.title("Graph Visualization") + plt.tight_layout() + plt.show() + + # Clean up + os.unlink(tmp.name) + + return None + + +# Convenience function for quick rendering +def render_graph( + graph: nx.DiGraph, label_lut: dict[Any, str] | None = None, **kwargs +) -> str | None: + """ + Convenience function to quickly render a NetworkX graph + + Args: + graph: NetworkX DiGraph to render + label_lut: Optional dictionary mapping node_id -> display_label + **kwargs: All other arguments passed to GraphRenderer.render_graph() + + Returns: + DOT syntax if raw_output=True, None otherwise + """ + renderer = GraphRenderer() + return renderer.render_graph(graph, label_lut, **kwargs) diff --git a/uv.lock b/uv.lock index 891c272..900ac2f 100644 --- a/uv.lock +++ b/uv.lock @@ -1275,6 +1275,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" }, ] +[[package]] +name = "graphviz" +version = "0.21" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/b3/3ac91e9be6b761a4b30d66ff165e54439dcd48b83f4e20d644867215f6ca/graphviz-0.21.tar.gz", hash = "sha256:20743e7183be82aaaa8ad6c93f8893c923bd6658a04c32ee115edb3c8a835f78", size = 200434, upload-time = "2025-06-15T09:35:05.824Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl", hash = "sha256:54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42", size = 47300, upload-time = "2025-06-15T09:35:04.433Z" }, +] + [[package]] name = "griffe" version = "1.13.0" @@ -2514,6 +2523,7 @@ source = { editable = "." 
} dependencies = [ { name = "beartype" }, { name = "deltalake" }, + { name = "graphviz" }, { name = "matplotlib" }, { name = "networkx" }, { name = "pandas" }, @@ -2575,6 +2585,7 @@ dev = [ requires-dist = [ { name = "beartype", specifier = ">=0.21.0" }, { name = "deltalake", specifier = ">=1.0.2" }, + { name = "graphviz", specifier = ">=0.21" }, { name = "ipywidgets", marker = "extra == 'ray'", specifier = ">=8.1.7" }, { name = "matplotlib", specifier = ">=3.10.3" }, { name = "networkx" }, From 99763dcd777aea18ce12fc29ef9746819b606448 Mon Sep 17 00:00:00 2001 From: "Edgar Y. Walker" Date: Sun, 31 Aug 2025 18:24:14 -0700 Subject: [PATCH 221/224] fix: compatibility of delta table database with database protocol --- src/orcapod/databases/delta_lake_databases.py | 2 +- src/orcapod/protocols/database_protocols.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/orcapod/databases/delta_lake_databases.py b/src/orcapod/databases/delta_lake_databases.py index e8aba77..1ffd5d4 100644 --- a/src/orcapod/databases/delta_lake_databases.py +++ b/src/orcapod/databases/delta_lake_databases.py @@ -586,7 +586,7 @@ def get_records_with_column_value( column_values: Collection[tuple[str, Any]] | Mapping[str, Any], record_id_column: str | None = None, flush: bool = False, - ): + ) -> "pa.Table | None": if flush: self.flush_batch(record_path) # check if record_id is found in pending batches diff --git a/src/orcapod/protocols/database_protocols.py b/src/orcapod/protocols/database_protocols.py index dd37903..1bf9eac 100644 --- a/src/orcapod/protocols/database_protocols.py +++ b/src/orcapod/protocols/database_protocols.py @@ -51,7 +51,7 @@ def get_records_by_ids( def get_records_with_column_value( self, record_path: tuple[str, ...], - column_name_value: Collection[tuple[str, Any]] | Mapping[str, Any], + column_values: Collection[tuple[str, Any]] | Mapping[str, Any], record_id_column: str | None = None, flush: bool = False, ) -> "pa.Table | None": ... From ee0b1a18f35bc1ce26b387bcb43ed903593e093a Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Mon, 1 Sep 2025 12:17:54 -0700 Subject: [PATCH 222/224] refactor: clean up type use and add graph support --- src/orcapod/core/datagrams/arrow_datagram.py | 15 +- .../core/datagrams/arrow_tag_packet.py | 22 +- src/orcapod/core/datagrams/base.py | 14 +- src/orcapod/core/datagrams/dict_datagram.py | 8 +- src/orcapod/core/datagrams/dict_tag_packet.py | 19 +- src/orcapod/core/operators/mappers.py | 15 +- src/orcapod/core/pods.py | 4 +- src/orcapod/core/sources/data_frame_source.py | 3 - src/orcapod/core/streams/__init__.py | 4 +- src/orcapod/core/streams/base.py | 9 + ...ent_pod_stream.py => cached_pod_stream.py} | 76 +- src/orcapod/core/streams/pod_node_stream.py | 435 +++++++ .../basic_delta_lake_arrow_database.py | 1008 +++++++++++++++++ src/orcapod/databases/delta_lake_databases.py | 987 +--------------- src/orcapod/pipeline/graph.py | 631 +++++++++-- src/orcapod/pipeline/nodes.py | 6 + .../semantic_types/semantic_registry.py | 12 +- src/orcapod/semantic_types/type_inference.py | 3 +- .../semantic_types/universal_converter.py | 54 +- src/orcapod/utils/arrow_utils.py | 25 +- src/orcapod/utils/types_utils.py | 73 +- 21 files changed, 2238 insertions(+), 1185 deletions(-) rename src/orcapod/core/streams/{efficient_pod_stream.py => cached_pod_stream.py} (83%) create mode 100644 src/orcapod/core/streams/pod_node_stream.py create mode 100644 src/orcapod/databases/basic_delta_lake_arrow_database.py diff --git a/src/orcapod/core/datagrams/arrow_datagram.py b/src/orcapod/core/datagrams/arrow_datagram.py index db34ad6..52c046e 100644 --- a/src/orcapod/core/datagrams/arrow_datagram.py +++ b/src/orcapod/core/datagrams/arrow_datagram.py @@ -1,8 +1,7 @@ import logging from collections.abc import Collection, Iterator, Mapping -from typing import Self +from typing import Self, TYPE_CHECKING -import pyarrow as pa from orcapod import contexts from orcapod.core.datagrams.base import BaseDatagram @@ -10,6 +9,12 @@ from orcapod.types import DataValue, PythonSchema from orcapod.protocols.hashing_protocols import ContentHash from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") logger = logging.getLogger(__name__) DEBUG = False @@ -49,7 +54,7 @@ class ArrowDatagram(BaseDatagram): def __init__( self, - table: pa.Table, + table: "pa.Table", meta_info: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, ) -> None: @@ -272,7 +277,7 @@ def arrow_schema( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> pa.Schema: + ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -395,7 +400,7 @@ def as_table( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> pa.Table: + ) -> "pa.Table": """ Convert the datagram to an Arrow table. 
diff --git a/src/orcapod/core/datagrams/arrow_tag_packet.py b/src/orcapod/core/datagrams/arrow_tag_packet.py index b5d93cd..24d2185 100644 --- a/src/orcapod/core/datagrams/arrow_tag_packet.py +++ b/src/orcapod/core/datagrams/arrow_tag_packet.py @@ -1,10 +1,8 @@ import logging from collections.abc import Collection, Mapping -from typing import Self +from typing import Self, TYPE_CHECKING -import pyarrow as pa - from orcapod.core.system_constants import constants from orcapod import contexts from orcapod.semantic_types import infer_python_schema_from_pylist_data @@ -13,9 +11,15 @@ from orcapod.utils import arrow_utils from orcapod.core.datagrams.arrow_datagram import ArrowDatagram +from orcapod.utils.lazy_module import LazyModule logger = logging.getLogger(__name__) +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + class ArrowTag(ArrowDatagram): """ @@ -35,7 +39,7 @@ class ArrowTag(ArrowDatagram): def __init__( self, - table: pa.Table, + table: "pa.Table", system_tags: Mapping[str, DataValue] | None = None, data_context: str | contexts.DataContext | None = None, ) -> None: @@ -109,7 +113,7 @@ def arrow_schema( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_system_tags: bool = False, - ) -> pa.Schema: + ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -162,7 +166,7 @@ def as_table( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_system_tags: bool = False, - ) -> pa.Table: + ) -> "pa.Table": table = super().as_table( include_all_info=include_all_info, include_meta_columns=include_meta_columns, @@ -239,7 +243,7 @@ class ArrowPacket(ArrowDatagram): def __init__( self, - table: pa.Table | pa.RecordBatch, + table: "pa.Table | pa.RecordBatch", meta_info: Mapping[str, DataValue] | None = None, source_info: Mapping[str, str | None] | None = None, data_context: str | contexts.DataContext | None = None, @@ -321,7 +325,7 @@ def arrow_schema( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> pa.Schema: + ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -379,7 +383,7 @@ def as_table( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> pa.Table: + ) -> "pa.Table": table = super().as_table( include_all_info=include_all_info, include_meta_columns=include_meta_columns, diff --git a/src/orcapod/core/datagrams/base.py b/src/orcapod/core/datagrams/base.py index 48cbc53..ec68860 100644 --- a/src/orcapod/core/datagrams/base.py +++ b/src/orcapod/core/datagrams/base.py @@ -19,17 +19,21 @@ import logging from abc import abstractmethod from collections.abc import Collection, Iterator, Mapping -from typing import Self, TypeAlias +from typing import Self, TypeAlias, TYPE_CHECKING from orcapod import contexts from orcapod.core.base import ContentIdentifiableBase from orcapod.protocols.hashing_protocols import ContentHash -import pyarrow as pa - +from orcapod.utils.lazy_module import LazyModule from orcapod.types import DataValue, PythonSchema logger = logging.getLogger(__name__) +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + # A conveniece packet-like type that defines a value that can be # converted to a packet. 
It's broader than Packet and a simple mapping # from string keys to DataValue (e.g., int, float, str) can be regarded @@ -188,7 +192,7 @@ def arrow_schema( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> pa.Schema: + ) -> "pa.Schema": """Return the PyArrow schema for this datagram.""" ... @@ -214,7 +218,7 @@ def as_table( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> pa.Table: + ) -> "pa.Table": """Convert the datagram to an Arrow table.""" ... diff --git a/src/orcapod/core/datagrams/dict_datagram.py b/src/orcapod/core/datagrams/dict_datagram.py index ec9825e..642a5b2 100644 --- a/src/orcapod/core/datagrams/dict_datagram.py +++ b/src/orcapod/core/datagrams/dict_datagram.py @@ -258,7 +258,7 @@ def arrow_schema( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> pa.Schema: + ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. @@ -407,7 +407,7 @@ def as_arrow_compatible_dict( [python_dict], python_schema=python_schema )[0] - def _get_meta_arrow_table(self) -> pa.Table: + def _get_meta_arrow_table(self) -> "pa.Table": if self._cached_meta_table is None: arrow_schema = self._get_meta_arrow_schema() self._cached_meta_table = pa.Table.from_pylist( @@ -419,7 +419,7 @@ def _get_meta_arrow_table(self) -> pa.Table: ) return self._cached_meta_table - def _get_meta_arrow_schema(self) -> pa.Schema: + def _get_meta_arrow_schema(self) -> "pa.Schema": if self._cached_meta_arrow_schema is None: self._cached_meta_arrow_schema = ( self._data_context.type_converter.python_schema_to_arrow_schema( @@ -437,7 +437,7 @@ def as_table( include_all_info: bool = False, include_meta_columns: bool | Collection[str] = False, include_context: bool = False, - ) -> pa.Table: + ) -> "pa.Table": """ Convert the datagram to an Arrow table. diff --git a/src/orcapod/core/datagrams/dict_tag_packet.py b/src/orcapod/core/datagrams/dict_tag_packet.py index afc8aa2..11e6d66 100644 --- a/src/orcapod/core/datagrams/dict_tag_packet.py +++ b/src/orcapod/core/datagrams/dict_tag_packet.py @@ -1,8 +1,7 @@ import logging from collections.abc import Collection, Mapping -from typing import Self +from typing import Self, TYPE_CHECKING -import pyarrow as pa from orcapod.core.system_constants import constants from orcapod import contexts @@ -10,6 +9,12 @@ from orcapod.utils import arrow_utils from orcapod.semantic_types import infer_python_schema_from_pylist_data from orcapod.types import DataValue, PythonSchema, PythonSchemaLike +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") logger = logging.getLogger(__name__) @@ -72,7 +77,7 @@ def as_table( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_system_tags: bool = False, - ) -> pa.Table: + ) -> "pa.Table": """Convert the packet to an Arrow table.""" table = super().as_table( include_all_info=include_all_info, @@ -158,7 +163,7 @@ def arrow_schema( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_system_tags: bool = False, - ) -> pa.Schema: + ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. 
@@ -291,7 +296,7 @@ def __init__( self._cached_source_info_schema: pa.Schema | None = None @property - def _source_info_arrow_schema(self) -> pa.Schema: + def _source_info_arrow_schema(self) -> "pa.Schema": if self._cached_source_info_schema is None: self._cached_source_info_schema = ( self._converter.python_schema_to_arrow_schema( @@ -312,7 +317,7 @@ def as_table( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> pa.Table: + ) -> "pa.Table": """Convert the packet to an Arrow table.""" table = super().as_table( include_all_info=include_all_info, @@ -441,7 +446,7 @@ def arrow_schema( include_meta_columns: bool | Collection[str] = False, include_context: bool = False, include_source: bool = False, - ) -> pa.Schema: + ) -> "pa.Schema": """ Return the PyArrow schema for this datagram. diff --git a/src/orcapod/core/operators/mappers.py b/src/orcapod/core/operators/mappers.py index 5e33598..5500e1b 100644 --- a/src/orcapod/core/operators/mappers.py +++ b/src/orcapod/core/operators/mappers.py @@ -144,9 +144,11 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: table = stream.as_table(include_source=True, include_system_tags=True) name_map = { - tc: self.name_map.get(tc, tc) for tc in tag_columns + tc: self.name_map.get(tc, tc) + for tc in tag_columns + if tc in self.name_map or not self.drop_unmapped } # rename the tag as necessary - new_tag_columns = [name_map[tc] for tc in tag_columns] + new_tag_columns = list(name_map.values()) for c in packet_columns: name_map[c] = c # no renaming on packet columns @@ -195,6 +197,15 @@ def op_output_types( # Create new packet typespec with renamed keys new_tag_typespec = {self.name_map.get(k, k): v for k, v in tag_typespec.items()} + # Create new packet typespec with renamed keys + new_tag_typespec = { + self.name_map.get(k, k): v + for k, v in tag_typespec.items() + if k in self.name_map or not self.drop_unmapped + } + + return new_tag_typespec, packet_typespec + return new_tag_typespec, packet_typespec def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: diff --git a/src/orcapod/core/pods.py b/src/orcapod/core/pods.py index 5593b16..d9855d9 100644 --- a/src/orcapod/core/pods.py +++ b/src/orcapod/core/pods.py @@ -13,7 +13,7 @@ ) from orcapod.core.kernels import KernelStream, TrackedKernelBase from orcapod.core.operators import Join -from orcapod.core.streams import EfficientPodResultStream, LazyPodResultStream +from orcapod.core.streams import CachedPodStream, LazyPodResultStream from orcapod.core.system_constants import constants from orcapod.hashing.hash_utils import get_function_components, get_function_signature from orcapod.protocols import core_protocols as cp @@ -723,7 +723,7 @@ async def async_call( def forward(self, *streams: cp.Stream) -> cp.Stream: assert len(streams) == 1, "PodBase.forward expects exactly one input stream" - return EfficientPodResultStream(pod=self, input_stream=streams[0]) + return CachedPodStream(pod=self, input_stream=streams[0]) def record_packet( self, diff --git a/src/orcapod/core/sources/data_frame_source.py b/src/orcapod/core/sources/data_frame_source.py index 54cd2ba..35af11d 100644 --- a/src/orcapod/core/sources/data_frame_source.py +++ b/src/orcapod/core/sources/data_frame_source.py @@ -1,9 +1,6 @@ from collections.abc import Collection from typing import TYPE_CHECKING, Any -import polars - - from orcapod.core.streams import TableStream from orcapod.protocols import core_protocols as cp from orcapod.types import 
PythonSchema diff --git a/src/orcapod/core/streams/__init__.py b/src/orcapod/core/streams/__init__.py index 630d32f..86f64bb 100644 --- a/src/orcapod/core/streams/__init__.py +++ b/src/orcapod/core/streams/__init__.py @@ -2,7 +2,7 @@ from .kernel_stream import KernelStream from .table_stream import TableStream from .lazy_pod_stream import LazyPodResultStream -from .efficient_pod_stream import EfficientPodResultStream +from .cached_pod_stream import CachedPodStream from .wrapped_stream import WrappedStream @@ -11,6 +11,6 @@ "KernelStream", "TableStream", "LazyPodResultStream", - "EfficientPodResultStream", + "CachedPodStream", "WrappedStream", ] diff --git a/src/orcapod/core/streams/base.py b/src/orcapod/core/streams/base.py index 0977ef1..082225a 100644 --- a/src/orcapod/core/streams/base.py +++ b/src/orcapod/core/streams/base.py @@ -398,6 +398,15 @@ def flow( """ return [e for e in self.iter_packets(execution_engine=execution_engine)] + def _repr_html_(self) -> str: + df = self.as_polars_df() + if df is not None: + tag_map = {t: f"*{t}" for t in self.tag_keys()} + # TODO: construct repr html better + df = df.rename(tag_map) + return f"{self.__class__.__name__}[{self.label}]\n" + df._repr_html_() + return "" + # def identity_structure(self) -> Any: # """ # Identity structure of a stream is deferred to the identity structure diff --git a/src/orcapod/core/streams/efficient_pod_stream.py b/src/orcapod/core/streams/cached_pod_stream.py similarity index 83% rename from src/orcapod/core/streams/efficient_pod_stream.py rename to src/orcapod/core/streams/cached_pod_stream.py index dca2a90..4f85f30 100644 --- a/src/orcapod/core/streams/efficient_pod_stream.py +++ b/src/orcapod/core/streams/cached_pod_stream.py @@ -29,7 +29,7 @@ logger = logging.getLogger(__name__) -class EfficientPodResultStream(StreamBase): +class CachedPodStream(StreamBase): """ A fixed stream that lazily processes packets from a prepared input stream. This is what Pod.process() returns - it's static/fixed but efficient. 
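The rewritten run() in the hunk below decides what still needs computing by tagging every cached output row with an _exists marker, left-joining the requested inputs against the cache on the input-packet hash, and splitting the joined table into reusable hits and rows that must still be run through the pod. A standalone polars sketch of that split, using illustrative column names (_input_hash, x, result) rather than the real system columns:

import polars as pl

requested = pl.DataFrame({"_input_hash": ["a", "b", "c"], "x": [1, 2, 3]})
cached = pl.DataFrame({"_input_hash": ["a", "c"], "result": [10, 30]})

joined = requested.join(
    cached.with_columns(pl.lit(True).alias("_exists")),
    on="_input_hash",
    how="left",
)

# Rows whose cached outputs can be reused as-is.
hits = joined.filter(pl.col("_exists").is_not_null()).drop("_exists")
# Rows with no cached output; keep only the original input columns.
missing = joined.filter(pl.col("_exists").is_null()).select(requested.columns)

print(hits)
print(missing)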
@@ -131,7 +131,79 @@ def run( self, execution_engine: cp.ExecutionEngine | None = None, ) -> None: - self.flow(execution_engine=execution_engine) + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_system_tags=True, + include_source=True, + include_content_hash=constants.INPUT_PACKET_HASH, + execution_engine=execution_engine, + ) + existing_entries = self.pod.get_all_cached_outputs(include_system_columns=True) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + # TODO: do more proper replacement operation + target_df = pl.DataFrame(target_entries) + existing_df = pl.DataFrame( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ) + ) + all_results_df = target_df.join( + existing_df, + on=constants.INPUT_PACKET_HASH, + how="left", + suffix="_right", + ) + all_results = all_results_df.to_arrow() + + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = all_results.filter( + pc.is_valid(pc.field("_exists")) + ).drop_columns( + [ + "_exists", + constants.INPUT_PACKET_HASH, + constants.PACKET_RECORD_ID, + *self.input_stream.keys()[1], # remove the input packet keys + ] + # TODO: look into NOT fetching back the record ID + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + + if missing is not None and missing.num_rows > 0: + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + tag, packet = self.pod.call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) + cached_results.append((tag, packet)) + + self._cached_output_packets = cached_results + self._set_modified_time() def iter_packets( self, execution_engine: cp.ExecutionEngine | None = None diff --git a/src/orcapod/core/streams/pod_node_stream.py b/src/orcapod/core/streams/pod_node_stream.py new file mode 100644 index 0000000..4ddb5c8 --- /dev/null +++ b/src/orcapod/core/streams/pod_node_stream.py @@ -0,0 +1,435 @@ +import logging +from collections.abc import Iterator +from pathlib import Path +from typing import TYPE_CHECKING + +from orcapod.core.system_constants import constants +from orcapod.protocols import core_protocols as cp +from orcapod.types import PythonSchema +from orcapod.utils import arrow_utils +from orcapod.utils.lazy_module import LazyModule +from orcapod.core.streams.base import StreamBase +from orcapod.core.streams.table_stream import TableStream +from orcapod.protocols.pipeline_protocols import PodNode + + +if TYPE_CHECKING: + import pyarrow as pa + import pyarrow.compute as pc + import polars as pl + +else: + pa = LazyModule("pyarrow") + pc = LazyModule("pyarrow.compute") + pl = LazyModule("polars") + + +# TODO: consider using this instead of making copy of dicts +# from types import MappingProxyType + +logger = 
logging.getLogger(__name__) + + +class PodNodeStream(StreamBase): + """ + A fixed stream that lazily processes packets from a prepared input stream. + This is what Pod.process() returns - it's static/fixed but efficient. + """ + + # TODO: define interface for storage or pod storage + def __init__(self, pod_node: PodNode, input_stream: cp.Stream, **kwargs): + super().__init__(source=pod_node, upstreams=(input_stream,), **kwargs) + self.pod_node = pod_node + self.input_stream = input_stream + self._set_modified_time() # set modified time to when we obtain the iterator + # capture the immutable iterator from the input stream + + self._prepared_stream_iterator = input_stream.iter_packets() + + # Packet-level caching (from your PodStream) + self._cached_output_packets: list[tuple[cp.Tag, cp.Packet | None]] | None = None + self._cached_output_table: pa.Table | None = None + self._cached_content_hash_column: pa.Array | None = None + + async def run_async( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> None: + """ + Runs the stream, processing the input stream and preparing the output stream. + This is typically called before iterating over the packets. + """ + if self._cached_output_packets is None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_content_hash=constants.INPUT_PACKET_HASH, + include_source=True, + include_system_tags=True, + ) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + all_results = target_entries.join( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ), + keys=[constants.INPUT_PACKET_HASH], + join_type="left outer", + right_suffix="_right", + ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = ( + all_results.filter(pc.is_valid(pc.field("_exists"))) + .drop_columns(target_entries.column_names) + .drop_columns(["_exists"]) + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + + pending_calls = [] + if missing is not None and missing.num_rows > 0: + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + pending = self.pod.async_call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) + pending_calls.append(pending) + import asyncio + + completed_calls = await asyncio.gather(*pending_calls) + for result in completed_calls: + cached_results.append(result) + + self._cached_output_packets = cached_results + self._set_modified_time() + + def run( + self, + execution_engine: cp.ExecutionEngine | None = None, + ) -> None: + cached_results = [] + + # identify all entries in the input stream for which we still have not 
computed packets + target_entries = self.input_stream.as_table( + include_system_tags=True, + include_source=True, + include_content_hash=constants.INPUT_PACKET_HASH, + execution_engine=execution_engine, + ) + existing_entries = self.pod.get_all_cached_outputs(include_system_columns=True) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + # TODO: do more proper replacement operation + target_df = pl.DataFrame(target_entries) + existing_df = pl.DataFrame( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ) + ) + all_results_df = target_df.join( + existing_df, + on=constants.INPUT_PACKET_HASH, + how="left", + suffix="_right", + ) + all_results = all_results_df.to_arrow() + + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = all_results.filter( + pc.is_valid(pc.field("_exists")) + ).drop_columns( + [ + "_exists", + constants.INPUT_PACKET_HASH, + constants.PACKET_RECORD_ID, + *self.input_stream.keys()[1], # remove the input packet keys + ] + # TODO: look into NOT fetching back the record ID + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + + if missing is not None and missing.num_rows > 0: + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + tag, packet = self.pod.call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) + cached_results.append((tag, packet)) + + self._cached_output_packets = cached_results + self._set_modified_time() + + def iter_packets( + self, execution_engine: cp.ExecutionEngine | None = None + ) -> Iterator[tuple[cp.Tag, cp.Packet]]: + """ + Processes the input stream and prepares the output stream. + This is typically called before iterating over the packets. 
+ """ + if self._cached_output_packets is None: + cached_results = [] + + # identify all entries in the input stream for which we still have not computed packets + target_entries = self.input_stream.as_table( + include_system_tags=True, + include_source=True, + include_content_hash=constants.INPUT_PACKET_HASH, + execution_engine=execution_engine, + ) + existing_entries = self.pod.get_all_cached_outputs( + include_system_columns=True + ) + if existing_entries is None or existing_entries.num_rows == 0: + missing = target_entries.drop_columns([constants.INPUT_PACKET_HASH]) + existing = None + else: + # missing = target_entries.join( + # existing_entries, + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left anti", + # ) + # Single join that gives you both missing and existing + # More efficient - only bring the key column from existing_entries + # .select([constants.INPUT_PACKET_HASH]).append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + + # TODO: do more proper replacement operation + target_df = pl.DataFrame(target_entries) + existing_df = pl.DataFrame( + existing_entries.append_column( + "_exists", pa.array([True] * len(existing_entries)) + ) + ) + all_results_df = target_df.join( + existing_df, + on=constants.INPUT_PACKET_HASH, + how="left", + suffix="_right", + ) + all_results = all_results_df.to_arrow() + # all_results = target_entries.join( + # existing_entries.append_column( + # "_exists", pa.array([True] * len(existing_entries)) + # ), + # keys=[constants.INPUT_PACKET_HASH], + # join_type="left outer", + # right_suffix="_right", # rename the existing records in case of collision of output packet keys with input packet keys + # ) + # grab all columns from target_entries first + missing = ( + all_results.filter(pc.is_null(pc.field("_exists"))) + .select(target_entries.column_names) + .drop_columns([constants.INPUT_PACKET_HASH]) + ) + + existing = all_results.filter( + pc.is_valid(pc.field("_exists")) + ).drop_columns( + [ + "_exists", + constants.INPUT_PACKET_HASH, + constants.PACKET_RECORD_ID, + *self.input_stream.keys()[1], # remove the input packet keys + ] + # TODO: look into NOT fetching back the record ID + ) + renamed = [ + c.removesuffix("_right") if c.endswith("_right") else c + for c in existing.column_names + ] + existing = existing.rename_columns(renamed) + + tag_keys = self.input_stream.keys()[0] + + if existing is not None and existing.num_rows > 0: + # If there are existing entries, we can cache them + existing_stream = TableStream(existing, tag_columns=tag_keys) + for tag, packet in existing_stream.iter_packets(): + cached_results.append((tag, packet)) + yield tag, packet + + if missing is not None and missing.num_rows > 0: + for tag, packet in TableStream(missing, tag_columns=tag_keys): + # Since these packets are known to be missing, skip the cache lookup + tag, packet = self.pod.call( + tag, + packet, + skip_cache_lookup=True, + execution_engine=execution_engine, + ) + cached_results.append((tag, packet)) + if packet is not None: + yield tag, packet + + self._cached_output_packets = cached_results + self._set_modified_time() + else: + for tag, packet in self._cached_output_packets: + if packet is not None: + yield tag, packet + + def keys( + self, include_system_tags: bool = False + ) -> tuple[tuple[str, ...], tuple[str, ...]]: + """ + Returns the keys of the tag and packet columns in the stream. + This is useful for accessing the columns in the stream. 
+ """ + + tag_keys, _ = self.input_stream.keys(include_system_tags=include_system_tags) + packet_keys = tuple(self.pod.output_packet_types().keys()) + return tag_keys, packet_keys + + def types( + self, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_typespec, _ = self.input_stream.types( + include_system_tags=include_system_tags + ) + # TODO: check if copying can be avoided + packet_typespec = dict(self.pod.output_packet_types()) + return tag_typespec, packet_typespec + + def as_table( + self, + include_data_context: bool = False, + include_source: bool = False, + include_system_tags: bool = False, + include_content_hash: bool | str = False, + sort_by_tags: bool = True, + execution_engine: cp.ExecutionEngine | None = None, + ) -> "pa.Table": + if self._cached_output_table is None: + all_tags = [] + all_packets = [] + tag_schema, packet_schema = None, None + for tag, packet in self.iter_packets(execution_engine=execution_engine): + if tag_schema is None: + tag_schema = tag.arrow_schema(include_system_tags=True) + if packet_schema is None: + packet_schema = packet.arrow_schema( + include_context=True, + include_source=True, + ) + all_tags.append(tag.as_dict(include_system_tags=True)) + # FIXME: using in the pinch conversion to str from path + # replace with an appropriate semantic converter-based approach! + dict_patcket = packet.as_dict(include_context=True, include_source=True) + for k, v in dict_patcket.items(): + if isinstance(v, Path): + dict_patcket[k] = str(v) + all_packets.append(dict_patcket) + + converter = self.data_context.type_converter + + struct_packets = converter.python_dicts_to_struct_dicts(all_packets) + all_tags_as_tables: pa.Table = pa.Table.from_pylist( + all_tags, schema=tag_schema + ) + all_packets_as_tables: pa.Table = pa.Table.from_pylist( + struct_packets, schema=packet_schema + ) + + self._cached_output_table = arrow_utils.hstack_tables( + all_tags_as_tables, all_packets_as_tables + ) + assert self._cached_output_table is not None, ( + "_cached_output_table should not be None here." + ) + + drop_columns = [] + if not include_source: + drop_columns.extend(f"{constants.SOURCE_PREFIX}{c}" for c in self.keys()[1]) + if not include_data_context: + drop_columns.append(constants.CONTEXT_KEY) + if not include_system_tags: + # TODO: come up with a more efficient approach + drop_columns.extend( + [ + c + for c in self._cached_output_table.column_names + if c.startswith(constants.SYSTEM_TAG_PREFIX) + ] + ) + + output_table = self._cached_output_table.drop_columns(drop_columns) + + # lazily prepare content hash column if requested + if include_content_hash: + if self._cached_content_hash_column is None: + content_hashes = [] + for tag, packet in self.iter_packets(execution_engine=execution_engine): + content_hashes.append(packet.content_hash().to_string()) + self._cached_content_hash_column = pa.array( + content_hashes, type=pa.large_string() + ) + assert self._cached_content_hash_column is not None, ( + "_cached_content_hash_column should not be None here." + ) + hash_column_name = ( + "_content_hash" + if include_content_hash is True + else include_content_hash + ) + output_table = output_table.append_column( + hash_column_name, self._cached_content_hash_column + ) + + if sort_by_tags: + try: + # TODO: consider having explicit tag/packet properties? 
+ output_table = output_table.sort_by( + [(column, "ascending") for column in self.keys()[0]] + ) + except pa.ArrowTypeError: + pass + + return output_table diff --git a/src/orcapod/databases/basic_delta_lake_arrow_database.py b/src/orcapod/databases/basic_delta_lake_arrow_database.py new file mode 100644 index 0000000..412d247 --- /dev/null +++ b/src/orcapod/databases/basic_delta_lake_arrow_database.py @@ -0,0 +1,1008 @@ +import logging +from collections import defaultdict +from collections.abc import Collection, Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, cast + +from deltalake import DeltaTable, write_deltalake +from deltalake.exceptions import TableNotFoundError + +from orcapod.core import constants +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import polars as pl + import pyarrow as pa + import pyarrow.compute as pc +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + pc = LazyModule("pyarrow.compute") + +# Module-level logger +logger = logging.getLogger(__name__) + + +class BasicDeltaTableArrowStore: + """ + A basic Delta Table-based Arrow data store with flexible hierarchical path support. + This store does NOT implement lazy loading or streaming capabilities, therefore + being "basic" in that sense. It is designed for simple use cases where data is written + in batches and read back as complete tables. It is worth noting that the Delta table + structure created by this store IS compatible with more advanced Delta Table-based + data stores (to be implemented) that will support lazy loading and streaming. + + Uses tuple-based source paths for robust parameter handling: + - ("source_name", "source_id") -> source_name/source_id/ + - ("org", "project", "dataset") -> org/project/dataset/ + - ("year", "month", "day", "experiment") -> year/month/day/experiment/ + """ + + RECORD_ID_COLUMN = f"{constants.META_PREFIX}record_id" + + def __init__( + self, + base_path: str | Path, + duplicate_entry_behavior: str = "error", + create_base_path: bool = True, + max_hierarchy_depth: int = 10, + batch_size: int = 100, + ): + """ + Initialize the BasicDeltaTableArrowStore. 
+ + Args: + base_path: Base directory path where Delta tables will be stored + duplicate_entry_behavior: How to handle duplicate record_ids: + - 'error': Raise ValueError when record_id already exists + - 'overwrite': Replace existing entry with new data + create_base_path: Whether to create the base path if it doesn't exist + max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) + batch_size: Number of records to batch before writing to Delta table + """ + # Validate duplicate behavior + if duplicate_entry_behavior not in ["error", "overwrite"]: + raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") + + self.duplicate_entry_behavior = duplicate_entry_behavior + self.base_path = Path(base_path) + self.max_hierarchy_depth = max_hierarchy_depth + self.batch_size = batch_size + + if create_base_path: + self.base_path.mkdir(parents=True, exist_ok=True) + elif not self.base_path.exists(): + raise ValueError( + f"Base path {self.base_path} does not exist and create_base_path=False" + ) + + # Cache for Delta tables to avoid repeated initialization + self._delta_table_cache: dict[str, DeltaTable] = {} + + # Batch management + self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) + + logger.info( + f"Initialized DeltaTableArrowDataStore at {self.base_path} " + f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " + f"batch_size={batch_size}, as" + ) + + def flush(self) -> None: + """ + Flush all pending batches immediately. + + This method is called to ensure all pending data is written to the Delta tables. + """ + try: + self.flush_all_batches() + except Exception as e: + logger.error(f"Error during flush: {e}") + + def flush_batch(self, record_path: tuple[str, ...]) -> None: + """ + Flush pending batch for a specific source path. 
+ + Args: + record_path: Tuple of path components + """ + logger.debug("Flushing triggered!!") + source_key = self._get_source_key(record_path) + + if ( + source_key not in self._pending_batches + or not self._pending_batches[source_key] + ): + return + + # Get all pending records + pending_tables = self._pending_batches[source_key] + self._pending_batches[source_key] = {} + + try: + # Combine all tables in the batch + combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() + + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + # Check if table exists + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # TODO: reconsider mode="overwrite" here + write_deltalake( + table_path, + combined_table, + mode="overwrite", + ) + logger.debug( + f"Created new Delta table for {source_key} with {len(combined_table)} records" + ) + else: + if self.duplicate_entry_behavior == "overwrite": + # Get entry IDs from the batch + record_ids = combined_table.column( + self.RECORD_ID_COLUMN + ).to_pylist() + unique_record_ids = cast(list[str], list(set(record_ids))) + + # Delete existing records with these IDs + if unique_record_ids: + record_ids_str = "', '".join(unique_record_ids) + delete_predicate = ( + f"{self.RECORD_ID_COLUMN} IN ('{record_ids_str}')" + ) + try: + delta_table.delete(delete_predicate) + logger.debug( + f"Deleted {len(unique_record_ids)} existing records from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing records to delete from {source_key}: {e}" + ) + + # otherwise, only insert if same record_id does not exist yet + delta_table.merge( + source=combined_table, + predicate=f"target.{self.RECORD_ID_COLUMN} = source.{self.RECORD_ID_COLUMN}", + source_alias="source", + target_alias="target", + ).when_not_matched_insert_all().execute() + + logger.debug( + f"Appended batch of {len(combined_table)} records to {source_key}" + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + # Put the tables back in the pending queue + self._pending_batches[source_key] = pending_tables + raise + + def flush_all_batches(self) -> None: + """Flush all pending batches.""" + source_keys = list(self._pending_batches.keys()) + + # TODO: capture and re-raise exceptions at the end + for source_key in source_keys: + record_path = tuple(source_key.split("/")) + try: + self.flush_batch(record_path) + except Exception as e: + logger.error(f"Error flushing batch for {source_key}: {e}") + + def __del__(self): + """Cleanup when object is destroyed.""" + self.flush() + + def _validate_record_path(self, record_path: tuple[str, ...]) -> None: + # TODO: consider removing this as path creation can be tried directly + """ + Validate source path components. 
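Continuing the sketch above, the deferred-write behavior of flush_batch and flush_all_batches can be exercised as follows (record ids and the printed dictionary are illustrative, not actual output):

# Add a few records below batch_size: they stay in the in-memory pending batch.
for i in range(3):
    row = pa.table({"value": [i]})
    store.add_record(("my_source", "run_002"), record_id=f"rec-{i:04d}", data=row)

print(store.get_pending_batch_info())       # e.g. {'my_source/run_002': 3}

# Write the pending batch for this path now; existing record_ids are either
# replaced or left untouched depending on duplicate_entry_behavior.
store.flush_batch(("my_source", "run_002"))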
+ + Args: + record_path: Tuple of path components + + Raises: + ValueError: If path is invalid + """ + if not record_path: + raise ValueError("Source path cannot be empty") + + if len(record_path) > self.max_hierarchy_depth: + raise ValueError( + f"Source path depth {len(record_path)} exceeds maximum {self.max_hierarchy_depth}" + ) + + # Validate path components + for i, component in enumerate(record_path): + if not component or not isinstance(component, str): + raise ValueError( + f"Source path component {i} is invalid: {repr(component)}" + ) + + # Check for filesystem-unsafe characters + unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] + if any(char in component for char in unsafe_chars): + raise ValueError( + f"Source path {record_path} component {component} contains invalid characters: {repr(component)}" + ) + + def _get_source_key(self, record_path: tuple[str, ...]) -> str: + """Generate cache key for source storage.""" + return "/".join(record_path) + + def _get_table_path(self, record_path: tuple[str, ...]) -> Path: + """Get the filesystem path for a given source path.""" + path = self.base_path + for subpath in record_path: + path = path / subpath + return path + + def _get_existing_delta_table( + self, record_path: tuple[str, ...] + ) -> DeltaTable | None: + """ + Get or create a Delta table, handling schema initialization properly. + + Args: + record_path: Tuple of path components + + Returns: + DeltaTable instance or None if table doesn't exist + """ + source_key = self._get_source_key(record_path) + table_path = self._get_table_path(record_path) + + # Check cache first + if dt := self._delta_table_cache.get(source_key): + return dt + + try: + # Try to load existing table + delta_table = DeltaTable(str(table_path)) + self._delta_table_cache[source_key] = delta_table + logger.debug(f"Loaded existing Delta table for {source_key}") + return delta_table + except TableNotFoundError: + # Table doesn't exist + return None + except Exception as e: + logger.error(f"Error loading Delta table for {source_key}: {e}") + # Try to clear any corrupted cache and retry once + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + return None + + def _ensure_record_id_column( + self, arrow_data: "pa.Table", record_id: str + ) -> "pa.Table": + """Ensure the table has an record id column.""" + if self.RECORD_ID_COLUMN not in arrow_data.column_names: + # Add record_id column at the beginning + key_array = pa.array([record_id] * len(arrow_data), type=pa.large_string()) + arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) + return arrow_data + + def _remove_record_id_column(self, arrow_data: "pa.Table") -> "pa.Table": + """Remove the record id column if it exists.""" + if self.RECORD_ID_COLUMN in arrow_data.column_names: + column_names = arrow_data.column_names + indices_to_keep = [ + i + for i, name in enumerate(column_names) + if name != self.RECORD_ID_COLUMN + ] + arrow_data = arrow_data.select(indices_to_keep) + return arrow_data + + def _handle_record_id_column( + self, arrow_data: "pa.Table", record_id_column: str | None = None + ) -> "pa.Table": + """ + Handle record_id column based on add_record_id_column parameter. 
+
+        Args:
+            arrow_data: Arrow table with record id column
+            record_id_column: If None or empty, the record id column is removed;
+                otherwise the record id column is renamed to this name.
+        """
+        if not record_id_column:
+            # Remove the record id column
+            return self._remove_record_id_column(arrow_data)
+
+        # Rename record id column
+        if self.RECORD_ID_COLUMN in arrow_data.column_names:
+            schema = arrow_data.schema
+            new_names = [
+                record_id_column if name == self.RECORD_ID_COLUMN else name
+                for name in schema.names
+            ]
+            return arrow_data.rename_columns(new_names)
+        else:
+            raise ValueError(
+                f"Record ID column '{self.RECORD_ID_COLUMN}' not found in the table and cannot be renamed."
+            )
+
+    def _create_record_id_filter(self, record_id: str) -> list:
+        """
+        Create a proper filter expression for Delta Lake.
+
+        Args:
+            record_id: The entry ID to filter by
+
+        Returns:
+            List containing the filter expression for Delta Lake
+        """
+        return [(self.RECORD_ID_COLUMN, "=", record_id)]
+
+    def _create_record_ids_filter(self, record_ids: list[str]) -> list:
+        """
+        Create a proper filter expression for multiple entry IDs.
+
+        Args:
+            record_ids: List of entry IDs to filter by
+
+        Returns:
+            List containing the filter expression for Delta Lake
+        """
+        return [(self.RECORD_ID_COLUMN, "in", record_ids)]
+
+    def _read_table_with_filter(
+        self,
+        delta_table: DeltaTable,
+        filters: list | None = None,
+    ) -> "pa.Table":
+        """
+        Read table using to_pyarrow_dataset with original schema preservation.
+
+        Args:
+            delta_table: The Delta table to read from
+            filters: Optional filters to apply
+
+        Returns:
+            Arrow table with preserved schema
+        """
+        # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading
+        dataset = delta_table.to_pyarrow_dataset(as_large_types=True)
+        if filters:
+            # Apply filters at dataset level for better performance
+            import pyarrow.compute as pc
+
+            filter_expr = None
+            for filt in filters:
+                if len(filt) == 3:
+                    col, op, val = filt
+                    if op == "=":
+                        expr = pc.equal(pc.field(col), pa.scalar(val))  # type: ignore
+                    elif op == "in":
+                        expr = pc.is_in(pc.field(col), pa.array(val))  # type: ignore
+                    else:
+                        logger.warning(
+                            f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient."
+                        )
+                        # Fallback to table-level filtering on the fully materialized table
+                        return delta_table.to_pyarrow_table(filters=filters)
+
+                    if filter_expr is None:
+                        filter_expr = expr
+                    else:
+                        filter_expr = pc.and_(filter_expr, expr)  # type: ignore
+
+            if filter_expr is not None:
+                return dataset.to_table(filter=filter_expr)
+
+        return dataset.to_table()
+
+    def add_record(
+        self,
+        record_path: tuple[str, ...],
+        record_id: str,
+        data: "pa.Table",
+        ignore_duplicates: bool | None = None,
+        overwrite_existing: bool = False,
+        force_flush: bool = False,
+    ) -> "pa.Table":
+        self._validate_record_path(record_path)
+        source_key = self._get_source_key(record_path)
+
+        # Check for existing entry
+        if ignore_duplicates is None:
+            ignore_duplicates = self.duplicate_entry_behavior != "error"
+        if not ignore_duplicates:
+            pending_table = self._pending_batches[source_key].get(record_id, None)
+            if pending_table is not None:
+                raise ValueError(
+                    f"Entry '{record_id}' already exists in pending batch for {source_key}. "
+                    f"Use duplicate_entry_behavior='overwrite' to allow updates."
+                )
+            existing_record = self.get_record_by_id(record_path, record_id, flush=False)
+            if existing_record is not None:
+                raise ValueError(
+                    f"Entry '{record_id}' already exists in {'/'.join(record_path)}. "
+                    f"Use duplicate_entry_behavior='overwrite' to allow updates."
+ ) + + # Add record_id column to the data + data_with_record_id = self._ensure_record_id_column(data, record_id) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table - save original schema first + write_deltalake(str(table_path), data_with_record_id, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + else: + if self.duplicate_entry_behavior == "overwrite": + try: + delta_table.delete( + f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" + ) + logger.debug( + f"Deleted existing record {record_id} from {source_key}" + ) + except Exception as e: + logger.debug( + f"No existing record to delete for {record_id}: {e}" + ) + + write_deltalake( + table_path, + data_with_record_id, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + else: + # Add to the batch for later flushing + self._pending_batches[source_key][record_id] = data_with_record_id + batch_size = len(self._pending_batches[source_key]) + + # Check if we need to flush + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added record {record_id} to {source_key}") + return data + + def add_records( + self, + record_path: tuple[str, ...], + records: "pa.Table", + record_id_column: str | None = None, + ignore_duplicates: bool | None = None, + overwrite_existing: bool = False, + force_flush: bool = False, + ) -> list[str]: + """ + Add multiple records to the Delta table, using one column as record_id. + + Args: + record_path: Path tuple identifying the table location + records: PyArrow table containing the records to add + record_id_column: Column name to use as record_id (defaults to first column) + ignore_duplicates: Whether to ignore duplicate entries + overwrite_existing: Whether to overwrite existing records with same ID + force_flush: Whether to write immediately instead of batching + + Returns: + List of record IDs that were added + """ + self._validate_record_path(record_path) + source_key = self._get_source_key(record_path) + + # Determine record_id column + if record_id_column is None: + record_id_column = records.column_names[0] + + # Validate that the record_id column exists + if record_id_column not in records.column_names: + raise ValueError( + f"Record ID column '{record_id_column}' not found in table. 
" + f"Available columns: {records.column_names}" + ) + + # Rename the record_id column to the standard name + column_mapping = {record_id_column: self.RECORD_ID_COLUMN} + records_renamed = records.rename_columns( + [column_mapping.get(col, col) for col in records.column_names] + ) + + # Get unique record IDs from the data + record_ids_array = records_renamed[self.RECORD_ID_COLUMN] + unique_record_ids = pc.unique(record_ids_array).to_pylist() + + # Set default behavior for duplicates + if ignore_duplicates is None: + ignore_duplicates = self.duplicate_entry_behavior != "error" + + added_record_ids = [] + + # Check for duplicates if needed + if not ignore_duplicates: + # Check pending batches + pending_duplicates = [] + for record_id in unique_record_ids: + if record_id in self._pending_batches[source_key]: + pending_duplicates.append(record_id) + + if pending_duplicates: + raise ValueError( + f"Records {pending_duplicates} already exist in pending batch for {source_key}. " + f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." + ) + + # Check existing table + existing_duplicates = [] + try: + for record_id in unique_record_ids: + existing_record = self.get_record_by_id( + record_path, str(record_id), flush=False + ) + if existing_record is not None: + existing_duplicates.append(record_id) + except Exception as e: + logger.debug(f"Error checking existing records: {e}") + + if existing_duplicates: + raise ValueError( + f"Records {existing_duplicates} already exist in {'/'.join(record_path)}. " + f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." + ) + + if force_flush: + # Write immediately + table_path = self._get_table_path(record_path) + table_path.mkdir(parents=True, exist_ok=True) + + delta_table = self._get_existing_delta_table(record_path) + + if delta_table is None: + # Create new table + write_deltalake(str(table_path), records_renamed, mode="overwrite") + logger.debug(f"Created new Delta table for {source_key}") + added_record_ids = unique_record_ids + else: + # Handle existing table + if self.duplicate_entry_behavior == "overwrite" or overwrite_existing: + # Delete existing records with matching IDs + try: + # Create SQL condition for multiple record IDs + escaped_ids = [ + str(rid).replace("'", "''") for rid in unique_record_ids + ] + id_list = "', '".join(escaped_ids) + delete_condition = f"{self.RECORD_ID_COLUMN} IN ('{id_list}')" + + delta_table.delete(delete_condition) + logger.debug( + f"Deleted existing records {unique_record_ids} from {source_key}" + ) + except Exception as e: + logger.debug(f"No existing records to delete: {e}") + + # Filter out duplicates if not overwriting + if not ( + self.duplicate_entry_behavior == "overwrite" or overwrite_existing + ): + # Get existing record IDs + try: + existing_table = delta_table.to_pyarrow_table() + if len(existing_table) > 0: + existing_ids = pc.unique( + existing_table[self.RECORD_ID_COLUMN] + ) + + # Filter out records that already exist + mask = pc.invert( + pc.is_in( + records_renamed[self.RECORD_ID_COLUMN], existing_ids + ) + ) + records_renamed = pc.filter(records_renamed, mask) # type: ignore + + # Update the list of record IDs that will actually be added + if len(records_renamed) > 0: + added_record_ids = pc.unique( + records_renamed[self.RECORD_ID_COLUMN] + ).to_pylist() + else: + added_record_ids = [] + else: + added_record_ids = unique_record_ids + except Exception as e: + logger.debug(f"Error filtering duplicates: {e}") + added_record_ids = 
unique_record_ids + else: + added_record_ids = unique_record_ids + + # Append the (possibly filtered) records + if len(records_renamed) > 0: + write_deltalake( + table_path, + records_renamed, + mode="append", + schema_mode="merge", + ) + + # Update cache + self._delta_table_cache[source_key] = DeltaTable(str(table_path)) + + else: + # Add to batches for later flushing + # Group records by record_id for individual batch entries + for record_id in unique_record_ids: + # Filter records for this specific record_id + mask = pc.equal(records_renamed[self.RECORD_ID_COLUMN], record_id) # type: ignore + single_record = pc.filter(records_renamed, mask) # type: ignore + + # Add to pending batch (will overwrite if duplicate_entry_behavior allows) + if ( + self.duplicate_entry_behavior == "overwrite" + or overwrite_existing + or record_id not in self._pending_batches[source_key] + ): + self._pending_batches[source_key][str(record_id)] = single_record + added_record_ids.append(record_id) + elif ignore_duplicates: + logger.debug(f"Ignoring duplicate record {record_id}") + else: + # This should have been caught earlier, but just in case + logger.warning(f"Skipping duplicate record {record_id}") + + # Check if we need to flush + batch_size = len(self._pending_batches[source_key]) + if batch_size >= self.batch_size: + self.flush_batch(record_path) + + logger.debug(f"Added {len(added_record_ids)} records to {source_key}") + return [str(rid) for rid in added_record_ids] + + def get_record_by_id( + self, + record_path: tuple[str, ...], + record_id: str, + record_id_column: str | None = None, + flush: bool = False, + ) -> "pa.Table | None": + """ + Get a specific record by record_id with schema preservation. + + Args: + record_path: Tuple of path components + record_id: Unique identifier for the record + + Returns: + Arrow table for the record or None if not found + """ + + if flush: + self.flush_batch(record_path) + self._validate_record_path(record_path) + + # check if record_id is found in pending batches + source_key = self._get_source_key(record_path) + if record_id in self._pending_batches[source_key]: + # Return the pending record after removing the entry id column + return self._remove_record_id_column( + self._pending_batches[source_key][record_id] + ) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read + filter_expr = self._create_record_id_filter(record_id) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Handle (remove/rename) the record id column before returning + return self._handle_record_id_column(result, record_id_column) + + except Exception as e: + logger.error( + f"Error getting record {record_id} from {'/'.join(record_path)}: {e}" + ) + raise e + + def get_all_records( + self, + record_path: tuple[str, ...], + record_id_column: str | None = None, + retrieve_pending: bool = True, + flush: bool = False, + ) -> "pa.Table | None": + """ + Retrieve all records for a given source path as a single table with schema preservation. + + Args: + record_path: Tuple of path components + record_id_column: If not None or empty, record id is returned in the result with the specified column name + + Returns: + Arrow table containing all records with original schema, or None if no records found + """ + # TODO: this currently reads everything into memory and then return. 
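The bulk add_records path above takes one column of the incoming table as the record id. A sketch of how it might be called, continuing the earlier example (column names and values are hypothetical):

batch = pa.table(
    {
        "record_id": ["rec-a", "rec-b", "rec-c"],
        "score": [0.1, 0.2, 0.3],
    }
)

# "record_id" is renamed internally to the store's RECORD_ID_COLUMN; with
# force_flush=True the rows are written to the Delta table immediately.
added = store.add_records(
    ("my_source", "run_003"),
    batch,
    record_id_column="record_id",
    force_flush=True,
)
print(added)  # e.g. ['rec-a', 'rec-b', 'rec-c']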
Consider implementation that performs everything lazily + + if flush: + self.flush_batch(record_path) + self._validate_record_path(record_path) + + collected_tables = [] + if retrieve_pending: + # Check if there are pending records in the batch + for record_id, arrow_table in self._pending_batches[ + self._get_source_key(record_path) + ].items(): + collected_tables.append( + self._ensure_record_id_column(arrow_table, record_id) + ) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is not None: + try: + # Use filter-based read + result = self._read_table_with_filter(delta_table) + + if len(result) != 0: + collected_tables.append(result) + + except Exception as e: + logger.error( + f"Error getting all records from {'/'.join(record_path)}: {e}" + ) + if collected_tables: + total_table = pa.concat_tables(collected_tables) + + # Handle record_id column based on parameter + return self._handle_record_id_column(total_table, record_id_column) + + return None + + def get_records_by_ids( + self, + record_path: tuple[str, ...], + record_ids: "list[str] | pl.Series | pa.Array", + record_id_column: str | None = None, + flush: bool = False, + ) -> "pa.Table | None": + """ + Retrieve records by entry IDs as a single table with schema preservation. + + Args: + record_path: Tuple of path components + record_ids: Entry IDs to retrieve + add_record_id_column: Control entry ID column inclusion + preserve_input_order: If True, return results in input order with nulls for missing + + Returns: + Arrow table containing all found records with original schema, or None if no records found + """ + + if flush: + self.flush_batch(record_path) + + self._validate_record_path(record_path) + + # Convert input to list of strings for consistency + if isinstance(record_ids, list): + if not record_ids: + return None + record_ids_list = record_ids + elif isinstance(record_ids, pl.Series): + if len(record_ids) == 0: + return None + record_ids_list = record_ids.to_list() + elif isinstance(record_ids, (pa.Array, pa.ChunkedArray)): + if len(record_ids) == 0: + return None + record_ids_list = record_ids.to_pylist() + else: + raise TypeError( + f"record_ids must be list[str], pl.Series, or pa.Array, got {type(record_ids)}" + ) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return None + + try: + # Use schema-preserving read with filters + filter_expr = self._create_record_ids_filter( + cast(list[str], record_ids_list) + ) + result = self._read_table_with_filter(delta_table, filters=filter_expr) + + if len(result) == 0: + return None + + # Handle record_id column based on parameter + return self._handle_record_id_column(result, record_id_column) + + except Exception as e: + logger.error( + f"Error getting records by IDs from {'/'.join(record_path)}: {e}" + ) + return None + + def get_pending_batch_info(self) -> dict[str, int]: + """ + Get information about pending batches. + + Returns: + Dictionary mapping source keys to number of pending records + """ + return { + source_key: len(tables) + for source_key, tables in self._pending_batches.items() + if tables + } + + def list_sources(self) -> list[tuple[str, ...]]: + """ + List all available source paths. 
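Retrieval mirrors the write side. A sketch of reading back what was stored in the earlier examples (record ids are the hypothetical ones used above):

# Single record: returns a pyarrow Table or None.
one = store.get_record_by_id(("my_source", "run_003"), "rec-a")

# Several records at once, keeping the id under a caller-chosen column name.
some = store.get_records_by_ids(
    ("my_source", "run_003"),
    ["rec-a", "rec-c"],
    record_id_column="record_id",
)

# Everything under a path, including still-pending (unflushed) records.
all_rows = store.get_all_records(("my_source", "run_003"), record_id_column="record_id")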
+ + Returns: + List of source path tuples + """ + sources = [] + + def _scan_directory(current_path: Path, path_components: tuple[str, ...]): + """Recursively scan for Delta tables.""" + for item in current_path.iterdir(): + if not item.is_dir(): + continue + + new_path_components = path_components + (item.name,) + + # Check if this directory contains a Delta table + try: + DeltaTable(str(item)) + sources.append(new_path_components) + except TableNotFoundError: + # Not a Delta table, continue scanning subdirectories + if len(new_path_components) < self.max_hierarchy_depth: + _scan_directory(item, new_path_components) + + _scan_directory(self.base_path, ()) + return sources + + def delete_source(self, record_path: tuple[str, ...]) -> bool: + """ + Delete an entire source (all records for a source path). + + Args: + record_path: Tuple of path components + + Returns: + True if source was deleted, False if it didn't exist + """ + self._validate_record_path(record_path) + + # Flush any pending batches first + self.flush_batch(record_path) + + table_path = self._get_table_path(record_path) + source_key = self._get_source_key(record_path) + + if not table_path.exists(): + return False + + try: + # Remove from caches + if source_key in self._delta_table_cache: + del self._delta_table_cache[source_key] + + # Remove directory + import shutil + + shutil.rmtree(table_path) + + logger.info(f"Deleted source {source_key}") + return True + + except Exception as e: + logger.error(f"Error deleting source {source_key}: {e}") + return False + + def delete_record(self, record_path: tuple[str, ...], record_id: str) -> bool: + """ + Delete a specific record. + + Args: + record_path: Tuple of path components + record_id: ID of the record to delete + + Returns: + True if record was deleted, False if it didn't exist + """ + self._validate_record_path(record_path) + + # Flush any pending batches first + self.flush_batch(record_path) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return False + + try: + # Check if record exists using proper filter + filter_expr = self._create_record_id_filter(record_id) + existing = self._read_table_with_filter(delta_table, filters=filter_expr) + if len(existing) == 0: + return False + + # Delete the record using SQL-style predicate (this is correct for delete operations) + delta_table.delete( + f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" + ) + + # Update cache + source_key = self._get_source_key(record_path) + self._delta_table_cache[source_key] = delta_table + + logger.debug(f"Deleted record {record_id} from {'/'.join(record_path)}") + return True + + except Exception as e: + logger.error( + f"Error deleting record {record_id} from {'/'.join(record_path)}: {e}" + ) + return False + + def get_table_info(self, record_path: tuple[str, ...]) -> dict[str, Any] | None: + """ + Get metadata information about a Delta table. 
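The housekeeping helpers above can be combined as in this sketch (paths and record ids continue the hypothetical examples; the printed fields come from get_table_info):

# Enumerate every path under base_path that holds a Delta table.
for path in store.list_sources():
    info = store.get_table_info(path)
    if info is not None:
        print(path, info["version"], info["pending_records"])

# Remove a single record, or the whole table for a path.
store.delete_record(("my_source", "run_003"), "rec-b")
store.delete_source(("my_source", "run_002"))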
+ + Args: + record_path: Tuple of path components + + Returns: + Dictionary with table metadata, or None if table doesn't exist + """ + self._validate_record_path(record_path) + + delta_table = self._get_existing_delta_table(record_path) + if delta_table is None: + return None + + try: + # Get basic info + schema = delta_table.schema() + history = delta_table.history() + source_key = self._get_source_key(record_path) + + # Add pending batch info + pending_info = self.get_pending_batch_info() + pending_count = pending_info.get(source_key, 0) + + return { + "path": str(self._get_table_path(record_path)), + "record_path": record_path, + "schema": schema, + "version": delta_table.version(), + "num_files": len(delta_table.files()), + "history_length": len(history), + "latest_commit": history[0] if history else None, + "pending_records": pending_count, + } + + except Exception as e: + logger.error(f"Error getting table info for {'/'.join(record_path)}: {e}") + return None diff --git a/src/orcapod/databases/delta_lake_databases.py b/src/orcapod/databases/delta_lake_databases.py index 1ffd5d4..ab02802 100644 --- a/src/orcapod/databases/delta_lake_databases.py +++ b/src/orcapod/databases/delta_lake_databases.py @@ -707,7 +707,7 @@ def get_records_by_ids( Returns: Arrow table containing all found records with original schema, or None if no records found """ - record_key = self._get_record_key(record_path) + _ = self._get_record_key(record_path) if flush: self.flush_batch(record_path) @@ -867,988 +867,3 @@ def flush_batch(self, record_path: tuple[str, ...]) -> None: self._pending_batches[record_key] = pending_batch self._pending_record_ids[record_key] = pending_ids raise - - -class BasicDeltaTableArrowStore: - """ - A basic Delta Table-based Arrow data store with flexible hierarchical path support. - This store does NOT implement lazy loading or streaming capabilities, therefore - being "basic" in that sense. It is designed for simple use cases where data is written - in batches and read back as complete tables. It is worth noting that the Delta table - structure created by this store IS compatible with more advanced Delta Table-based - data stores (to be implemented) that will support lazy loading and streaming. - - Uses tuple-based source paths for robust parameter handling: - - ("source_name", "source_id") -> source_name/source_id/ - - ("org", "project", "dataset") -> org/project/dataset/ - - ("year", "month", "day", "experiment") -> year/month/day/experiment/ - """ - - RECORD_ID_COLUMN = f"{constants.META_PREFIX}record_id" - - def __init__( - self, - base_path: str | Path, - duplicate_entry_behavior: str = "error", - create_base_path: bool = True, - max_hierarchy_depth: int = 10, - batch_size: int = 100, - ): - """ - Initialize the BasicDeltaTableArrowStore. 
- - Args: - base_path: Base directory path where Delta tables will be stored - duplicate_entry_behavior: How to handle duplicate record_ids: - - 'error': Raise ValueError when record_id already exists - - 'overwrite': Replace existing entry with new data - create_base_path: Whether to create the base path if it doesn't exist - max_hierarchy_depth: Maximum allowed depth for source paths (safety limit) - batch_size: Number of records to batch before writing to Delta table - """ - # Validate duplicate behavior - if duplicate_entry_behavior not in ["error", "overwrite"]: - raise ValueError("duplicate_entry_behavior must be 'error' or 'overwrite'") - - self.duplicate_entry_behavior = duplicate_entry_behavior - self.base_path = Path(base_path) - self.max_hierarchy_depth = max_hierarchy_depth - self.batch_size = batch_size - - if create_base_path: - self.base_path.mkdir(parents=True, exist_ok=True) - elif not self.base_path.exists(): - raise ValueError( - f"Base path {self.base_path} does not exist and create_base_path=False" - ) - - # Cache for Delta tables to avoid repeated initialization - self._delta_table_cache: dict[str, DeltaTable] = {} - - # Batch management - self._pending_batches: dict[str, dict[str, pa.Table]] = defaultdict(dict) - - logger.info( - f"Initialized DeltaTableArrowDataStore at {self.base_path} " - f"with duplicate_entry_behavior='{duplicate_entry_behavior}', " - f"batch_size={batch_size}, as" - ) - - def flush(self) -> None: - """ - Flush all pending batches immediately. - - This method is called to ensure all pending data is written to the Delta tables. - """ - try: - self.flush_all_batches() - except Exception as e: - logger.error(f"Error during flush: {e}") - - def flush_batch(self, record_path: tuple[str, ...]) -> None: - """ - Flush pending batch for a specific source path. 
- - Args: - record_path: Tuple of path components - """ - logger.debug("Flushing triggered!!") - source_key = self._get_source_key(record_path) - - if ( - source_key not in self._pending_batches - or not self._pending_batches[source_key] - ): - return - - # Get all pending records - pending_tables = self._pending_batches[source_key] - self._pending_batches[source_key] = {} - - try: - # Combine all tables in the batch - combined_table = pa.concat_tables(pending_tables.values()).combine_chunks() - - table_path = self._get_table_path(record_path) - table_path.mkdir(parents=True, exist_ok=True) - - # Check if table exists - delta_table = self._get_existing_delta_table(record_path) - - if delta_table is None: - # TODO: reconsider mode="overwrite" here - write_deltalake( - table_path, - combined_table, - mode="overwrite", - ) - logger.debug( - f"Created new Delta table for {source_key} with {len(combined_table)} records" - ) - else: - if self.duplicate_entry_behavior == "overwrite": - # Get entry IDs from the batch - record_ids = combined_table.column( - self.RECORD_ID_COLUMN - ).to_pylist() - unique_record_ids = cast(list[str], list(set(record_ids))) - - # Delete existing records with these IDs - if unique_record_ids: - record_ids_str = "', '".join(unique_record_ids) - delete_predicate = ( - f"{self.RECORD_ID_COLUMN} IN ('{record_ids_str}')" - ) - try: - delta_table.delete(delete_predicate) - logger.debug( - f"Deleted {len(unique_record_ids)} existing records from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing records to delete from {source_key}: {e}" - ) - - # otherwise, only insert if same record_id does not exist yet - delta_table.merge( - source=combined_table, - predicate=f"target.{self.RECORD_ID_COLUMN} = source.{self.RECORD_ID_COLUMN}", - source_alias="source", - target_alias="target", - ).when_not_matched_insert_all().execute() - - logger.debug( - f"Appended batch of {len(combined_table)} records to {source_key}" - ) - - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - # Put the tables back in the pending queue - self._pending_batches[source_key] = pending_tables - raise - - def flush_all_batches(self) -> None: - """Flush all pending batches.""" - source_keys = list(self._pending_batches.keys()) - - # TODO: capture and re-raise exceptions at the end - for source_key in source_keys: - record_path = tuple(source_key.split("/")) - try: - self.flush_batch(record_path) - except Exception as e: - logger.error(f"Error flushing batch for {source_key}: {e}") - - def __del__(self): - """Cleanup when object is destroyed.""" - self.flush() - - def _validate_record_path(self, record_path: tuple[str, ...]) -> None: - # TODO: consider removing this as path creation can be tried directly - """ - Validate source path components. 
- - Args: - record_path: Tuple of path components - - Raises: - ValueError: If path is invalid - """ - if not record_path: - raise ValueError("Source path cannot be empty") - - if len(record_path) > self.max_hierarchy_depth: - raise ValueError( - f"Source path depth {len(record_path)} exceeds maximum {self.max_hierarchy_depth}" - ) - - # Validate path components - for i, component in enumerate(record_path): - if not component or not isinstance(component, str): - raise ValueError( - f"Source path component {i} is invalid: {repr(component)}" - ) - - # Check for filesystem-unsafe characters - unsafe_chars = ["/", "\\", ":", "*", "?", '"', "<", ">", "|", "\0"] - if any(char in component for char in unsafe_chars): - raise ValueError( - f"Source path {record_path} component {component} contains invalid characters: {repr(component)}" - ) - - def _get_source_key(self, record_path: tuple[str, ...]) -> str: - """Generate cache key for source storage.""" - return "/".join(record_path) - - def _get_table_path(self, record_path: tuple[str, ...]) -> Path: - """Get the filesystem path for a given source path.""" - path = self.base_path - for subpath in record_path: - path = path / subpath - return path - - def _get_existing_delta_table( - self, record_path: tuple[str, ...] - ) -> DeltaTable | None: - """ - Get or create a Delta table, handling schema initialization properly. - - Args: - record_path: Tuple of path components - - Returns: - DeltaTable instance or None if table doesn't exist - """ - source_key = self._get_source_key(record_path) - table_path = self._get_table_path(record_path) - - # Check cache first - if dt := self._delta_table_cache.get(source_key): - return dt - - try: - # Try to load existing table - delta_table = DeltaTable(str(table_path)) - self._delta_table_cache[source_key] = delta_table - logger.debug(f"Loaded existing Delta table for {source_key}") - return delta_table - except TableNotFoundError: - # Table doesn't exist - return None - except Exception as e: - logger.error(f"Error loading Delta table for {source_key}: {e}") - # Try to clear any corrupted cache and retry once - if source_key in self._delta_table_cache: - del self._delta_table_cache[source_key] - return None - - def _ensure_record_id_column( - self, arrow_data: "pa.Table", record_id: str - ) -> "pa.Table": - """Ensure the table has an record id column.""" - if self.RECORD_ID_COLUMN not in arrow_data.column_names: - # Add record_id column at the beginning - key_array = pa.array([record_id] * len(arrow_data), type=pa.large_string()) - arrow_data = arrow_data.add_column(0, self.RECORD_ID_COLUMN, key_array) - return arrow_data - - def _remove_record_id_column(self, arrow_data: "pa.Table") -> "pa.Table": - """Remove the record id column if it exists.""" - if self.RECORD_ID_COLUMN in arrow_data.column_names: - column_names = arrow_data.column_names - indices_to_keep = [ - i - for i, name in enumerate(column_names) - if name != self.RECORD_ID_COLUMN - ] - arrow_data = arrow_data.select(indices_to_keep) - return arrow_data - - def _handle_record_id_column( - self, arrow_data: "pa.Table", record_id_column: str | None = None - ) -> "pa.Table": - """ - Handle record_id column based on add_record_id_column parameter. 
- - Args: - arrow_data: Arrow table with record id column - record_id_column: Control entry ID column inclusion: - - """ - if not record_id_column: - # Remove the record id column - return self._remove_record_id_column(arrow_data) - - # Rename record id column - if self.RECORD_ID_COLUMN in arrow_data.column_names: - schema = arrow_data.schema - new_names = [ - record_id_column if name == self.RECORD_ID_COLUMN else name - for name in schema.names - ] - return arrow_data.rename_columns(new_names) - else: - raise ValueError( - f"Record ID column '{self.RECORD_ID_COLUMN}' not found in the table and cannot be renamed." - ) - - def _create_record_id_filter(self, record_id: str) -> list: - """ - Create a proper filter expression for Delta Lake. - - Args: - record_id: The entry ID to filter by - - Returns: - List containing the filter expression for Delta Lake - """ - return [(self.RECORD_ID_COLUMN, "=", record_id)] - - def _create_record_ids_filter(self, record_ids: list[str]) -> list: - """ - Create a proper filter expression for multiple entry IDs. - - Args: - record_ids: List of entry IDs to filter by - - Returns: - List containing the filter expression for Delta Lake - """ - return [(self.RECORD_ID_COLUMN, "in", record_ids)] - - def _read_table_with_filter( - self, - delta_table: DeltaTable, - filters: list | None = None, - ) -> "pa.Table": - """ - Read table using to_pyarrow_dataset with original schema preservation. - - Args: - delta_table: The Delta table to read from - filters: Optional filters to apply - - Returns: - Arrow table with preserved schema - """ - # Use to_pyarrow_dataset with as_large_types for Polars compatible arrow table loading - dataset = delta_table.to_pyarrow_dataset(as_large_types=True) - if filters: - # Apply filters at dataset level for better performance - import pyarrow.compute as pc - - filter_expr = None - for filt in filters: - if len(filt) == 3: - col, op, val = filt - if op == "=": - expr = pc.equal(pc.field(col), pa.scalar(val)) # type: ignore - elif op == "in": - expr = pc.is_in(pc.field(col), pa.array(val)) # type: ignore - else: - logger.warning( - f"Unsupported filter operation: {op}. Falling back to table-level filter application which may be less efficient." - ) - # Fallback to table-level filtering - return dataset.to_table()(filters=filters) - - if filter_expr is None: - filter_expr = expr - else: - filter_expr = pc.and_(filter_expr, expr) # type: ignore - - if filter_expr is not None: - return dataset.to_table(filter=filter_expr) - - return dataset.to_table() - - def add_record( - self, - record_path: tuple[str, ...], - record_id: str, - data: "pa.Table", - ignore_duplicates: bool | None = None, - overwrite_existing: bool = False, - force_flush: bool = False, - ) -> "pa.Table": - self._validate_record_path(record_path) - source_key = self._get_source_key(record_path) - - # Check for existing entry - if ignore_duplicates is None: - ignore_duplicates = self.duplicate_entry_behavior != "error" - if not ignore_duplicates: - pending_table = self._pending_batches[source_key].get(record_id, None) - if pending_table is not None: - raise ValueError( - f"Entry '{record_id}' already exists in pending batch for {source_key}. " - f"Use duplicate_entry_behavior='overwrite' to allow updates." - ) - existing_record = self.get_record_by_id(record_path, record_id, flush=False) - if existing_record is not None: - raise ValueError( - f"Entry '{record_id}' already exists in {'/'.join(record_path)}. " - f"Use duplicate_entry_behavior='overwrite' to allow updates." 
- ) - - # Add record_id column to the data - data_with_record_id = self._ensure_record_id_column(data, record_id) - - if force_flush: - # Write immediately - table_path = self._get_table_path(record_path) - table_path.mkdir(parents=True, exist_ok=True) - - delta_table = self._get_existing_delta_table(record_path) - - if delta_table is None: - # Create new table - save original schema first - write_deltalake(str(table_path), data_with_record_id, mode="overwrite") - logger.debug(f"Created new Delta table for {source_key}") - else: - if self.duplicate_entry_behavior == "overwrite": - try: - delta_table.delete( - f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" - ) - logger.debug( - f"Deleted existing record {record_id} from {source_key}" - ) - except Exception as e: - logger.debug( - f"No existing record to delete for {record_id}: {e}" - ) - - write_deltalake( - table_path, - data_with_record_id, - mode="append", - schema_mode="merge", - ) - - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - else: - # Add to the batch for later flushing - self._pending_batches[source_key][record_id] = data_with_record_id - batch_size = len(self._pending_batches[source_key]) - - # Check if we need to flush - if batch_size >= self.batch_size: - self.flush_batch(record_path) - - logger.debug(f"Added record {record_id} to {source_key}") - return data - - def add_records( - self, - record_path: tuple[str, ...], - records: "pa.Table", - record_id_column: str | None = None, - ignore_duplicates: bool | None = None, - overwrite_existing: bool = False, - force_flush: bool = False, - ) -> list[str]: - """ - Add multiple records to the Delta table, using one column as record_id. - - Args: - record_path: Path tuple identifying the table location - records: PyArrow table containing the records to add - record_id_column: Column name to use as record_id (defaults to first column) - ignore_duplicates: Whether to ignore duplicate entries - overwrite_existing: Whether to overwrite existing records with same ID - force_flush: Whether to write immediately instead of batching - - Returns: - List of record IDs that were added - """ - self._validate_record_path(record_path) - source_key = self._get_source_key(record_path) - - # Determine record_id column - if record_id_column is None: - record_id_column = records.column_names[0] - - # Validate that the record_id column exists - if record_id_column not in records.column_names: - raise ValueError( - f"Record ID column '{record_id_column}' not found in table. 
" - f"Available columns: {records.column_names}" - ) - - # Rename the record_id column to the standard name - column_mapping = {record_id_column: self.RECORD_ID_COLUMN} - records_renamed = records.rename_columns( - [column_mapping.get(col, col) for col in records.column_names] - ) - - # Get unique record IDs from the data - record_ids_array = records_renamed[self.RECORD_ID_COLUMN] - unique_record_ids = pc.unique(record_ids_array).to_pylist() - - # Set default behavior for duplicates - if ignore_duplicates is None: - ignore_duplicates = self.duplicate_entry_behavior != "error" - - added_record_ids = [] - - # Check for duplicates if needed - if not ignore_duplicates: - # Check pending batches - pending_duplicates = [] - for record_id in unique_record_ids: - if record_id in self._pending_batches[source_key]: - pending_duplicates.append(record_id) - - if pending_duplicates: - raise ValueError( - f"Records {pending_duplicates} already exist in pending batch for {source_key}. " - f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." - ) - - # Check existing table - existing_duplicates = [] - try: - for record_id in unique_record_ids: - existing_record = self.get_record_by_id( - record_path, str(record_id), flush=False - ) - if existing_record is not None: - existing_duplicates.append(record_id) - except Exception as e: - logger.debug(f"Error checking existing records: {e}") - - if existing_duplicates: - raise ValueError( - f"Records {existing_duplicates} already exist in {'/'.join(record_path)}. " - f"Use ignore_duplicates=True or duplicate_entry_behavior='overwrite' to allow updates." - ) - - if force_flush: - # Write immediately - table_path = self._get_table_path(record_path) - table_path.mkdir(parents=True, exist_ok=True) - - delta_table = self._get_existing_delta_table(record_path) - - if delta_table is None: - # Create new table - write_deltalake(str(table_path), records_renamed, mode="overwrite") - logger.debug(f"Created new Delta table for {source_key}") - added_record_ids = unique_record_ids - else: - # Handle existing table - if self.duplicate_entry_behavior == "overwrite" or overwrite_existing: - # Delete existing records with matching IDs - try: - # Create SQL condition for multiple record IDs - escaped_ids = [ - str(rid).replace("'", "''") for rid in unique_record_ids - ] - id_list = "', '".join(escaped_ids) - delete_condition = f"{self.RECORD_ID_COLUMN} IN ('{id_list}')" - - delta_table.delete(delete_condition) - logger.debug( - f"Deleted existing records {unique_record_ids} from {source_key}" - ) - except Exception as e: - logger.debug(f"No existing records to delete: {e}") - - # Filter out duplicates if not overwriting - if not ( - self.duplicate_entry_behavior == "overwrite" or overwrite_existing - ): - # Get existing record IDs - try: - existing_table = delta_table.to_pyarrow_table() - if len(existing_table) > 0: - existing_ids = pc.unique( - existing_table[self.RECORD_ID_COLUMN] - ) - - # Filter out records that already exist - mask = pc.invert( - pc.is_in( - records_renamed[self.RECORD_ID_COLUMN], existing_ids - ) - ) - records_renamed = pc.filter(records_renamed, mask) # type: ignore - - # Update the list of record IDs that will actually be added - if len(records_renamed) > 0: - added_record_ids = pc.unique( - records_renamed[self.RECORD_ID_COLUMN] - ).to_pylist() - else: - added_record_ids = [] - else: - added_record_ids = unique_record_ids - except Exception as e: - logger.debug(f"Error filtering duplicates: {e}") - added_record_ids = 
unique_record_ids - else: - added_record_ids = unique_record_ids - - # Append the (possibly filtered) records - if len(records_renamed) > 0: - write_deltalake( - table_path, - records_renamed, - mode="append", - schema_mode="merge", - ) - - # Update cache - self._delta_table_cache[source_key] = DeltaTable(str(table_path)) - - else: - # Add to batches for later flushing - # Group records by record_id for individual batch entries - for record_id in unique_record_ids: - # Filter records for this specific record_id - mask = pc.equal(records_renamed[self.RECORD_ID_COLUMN], record_id) # type: ignore - single_record = pc.filter(records_renamed, mask) # type: ignore - - # Add to pending batch (will overwrite if duplicate_entry_behavior allows) - if ( - self.duplicate_entry_behavior == "overwrite" - or overwrite_existing - or record_id not in self._pending_batches[source_key] - ): - self._pending_batches[source_key][str(record_id)] = single_record - added_record_ids.append(record_id) - elif ignore_duplicates: - logger.debug(f"Ignoring duplicate record {record_id}") - else: - # This should have been caught earlier, but just in case - logger.warning(f"Skipping duplicate record {record_id}") - - # Check if we need to flush - batch_size = len(self._pending_batches[source_key]) - if batch_size >= self.batch_size: - self.flush_batch(record_path) - - logger.debug(f"Added {len(added_record_ids)} records to {source_key}") - return [str(rid) for rid in added_record_ids] - - def get_record_by_id( - self, - record_path: tuple[str, ...], - record_id: str, - record_id_column: str | None = None, - flush: bool = False, - ) -> "pa.Table | None": - """ - Get a specific record by record_id with schema preservation. - - Args: - record_path: Tuple of path components - record_id: Unique identifier for the record - - Returns: - Arrow table for the record or None if not found - """ - - if flush: - self.flush_batch(record_path) - self._validate_record_path(record_path) - - # check if record_id is found in pending batches - source_key = self._get_source_key(record_path) - if record_id in self._pending_batches[source_key]: - # Return the pending record after removing the entry id column - return self._remove_record_id_column( - self._pending_batches[source_key][record_id] - ) - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is None: - return None - - try: - # Use schema-preserving read - filter_expr = self._create_record_id_filter(record_id) - result = self._read_table_with_filter(delta_table, filters=filter_expr) - - if len(result) == 0: - return None - - # Handle (remove/rename) the record id column before returning - return self._handle_record_id_column(result, record_id_column) - - except Exception as e: - logger.error( - f"Error getting record {record_id} from {'/'.join(record_path)}: {e}" - ) - raise e - - def get_all_records( - self, - record_path: tuple[str, ...], - record_id_column: str | None = None, - retrieve_pending: bool = True, - flush: bool = False, - ) -> "pa.Table | None": - """ - Retrieve all records for a given source path as a single table with schema preservation. - - Args: - record_path: Tuple of path components - record_id_column: If not None or empty, record id is returned in the result with the specified column name - - Returns: - Arrow table containing all records with original schema, or None if no records found - """ - # TODO: this currently reads everything into memory and then return. 
Consider implementation that performs everything lazily - - if flush: - self.flush_batch(record_path) - self._validate_record_path(record_path) - - collected_tables = [] - if retrieve_pending: - # Check if there are pending records in the batch - for record_id, arrow_table in self._pending_batches[ - self._get_source_key(record_path) - ].items(): - collected_tables.append( - self._ensure_record_id_column(arrow_table, record_id) - ) - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is not None: - try: - # Use filter-based read - result = self._read_table_with_filter(delta_table) - - if len(result) != 0: - collected_tables.append(result) - - except Exception as e: - logger.error( - f"Error getting all records from {'/'.join(record_path)}: {e}" - ) - if collected_tables: - total_table = pa.concat_tables(collected_tables) - - # Handle record_id column based on parameter - return self._handle_record_id_column(total_table, record_id_column) - - return None - - def get_records_by_ids( - self, - record_path: tuple[str, ...], - record_ids: "list[str] | pl.Series | pa.Array", - record_id_column: str | None = None, - flush: bool = False, - ) -> "pa.Table | None": - """ - Retrieve records by entry IDs as a single table with schema preservation. - - Args: - record_path: Tuple of path components - record_ids: Entry IDs to retrieve - add_record_id_column: Control entry ID column inclusion - preserve_input_order: If True, return results in input order with nulls for missing - - Returns: - Arrow table containing all found records with original schema, or None if no records found - """ - - if flush: - self.flush_batch(record_path) - - self._validate_record_path(record_path) - - # Convert input to list of strings for consistency - if isinstance(record_ids, list): - if not record_ids: - return None - record_ids_list = record_ids - elif isinstance(record_ids, pl.Series): - if len(record_ids) == 0: - return None - record_ids_list = record_ids.to_list() - elif isinstance(record_ids, (pa.Array, pa.ChunkedArray)): - if len(record_ids) == 0: - return None - record_ids_list = record_ids.to_pylist() - else: - raise TypeError( - f"record_ids must be list[str], pl.Series, or pa.Array, got {type(record_ids)}" - ) - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is None: - return None - - try: - # Use schema-preserving read with filters - filter_expr = self._create_record_ids_filter( - cast(list[str], record_ids_list) - ) - result = self._read_table_with_filter(delta_table, filters=filter_expr) - - if len(result) == 0: - return None - - # Handle record_id column based on parameter - return self._handle_record_id_column(result, record_id_column) - - except Exception as e: - logger.error( - f"Error getting records by IDs from {'/'.join(record_path)}: {e}" - ) - return None - - def get_pending_batch_info(self) -> dict[str, int]: - """ - Get information about pending batches. - - Returns: - Dictionary mapping source keys to number of pending records - """ - return { - source_key: len(tables) - for source_key, tables in self._pending_batches.items() - if tables - } - - def list_sources(self) -> list[tuple[str, ...]]: - """ - List all available source paths. 
- - Returns: - List of source path tuples - """ - sources = [] - - def _scan_directory(current_path: Path, path_components: tuple[str, ...]): - """Recursively scan for Delta tables.""" - for item in current_path.iterdir(): - if not item.is_dir(): - continue - - new_path_components = path_components + (item.name,) - - # Check if this directory contains a Delta table - try: - DeltaTable(str(item)) - sources.append(new_path_components) - except TableNotFoundError: - # Not a Delta table, continue scanning subdirectories - if len(new_path_components) < self.max_hierarchy_depth: - _scan_directory(item, new_path_components) - - _scan_directory(self.base_path, ()) - return sources - - def delete_source(self, record_path: tuple[str, ...]) -> bool: - """ - Delete an entire source (all records for a source path). - - Args: - record_path: Tuple of path components - - Returns: - True if source was deleted, False if it didn't exist - """ - self._validate_record_path(record_path) - - # Flush any pending batches first - self.flush_batch(record_path) - - table_path = self._get_table_path(record_path) - source_key = self._get_source_key(record_path) - - if not table_path.exists(): - return False - - try: - # Remove from caches - if source_key in self._delta_table_cache: - del self._delta_table_cache[source_key] - - # Remove directory - import shutil - - shutil.rmtree(table_path) - - logger.info(f"Deleted source {source_key}") - return True - - except Exception as e: - logger.error(f"Error deleting source {source_key}: {e}") - return False - - def delete_record(self, record_path: tuple[str, ...], record_id: str) -> bool: - """ - Delete a specific record. - - Args: - record_path: Tuple of path components - record_id: ID of the record to delete - - Returns: - True if record was deleted, False if it didn't exist - """ - self._validate_record_path(record_path) - - # Flush any pending batches first - self.flush_batch(record_path) - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is None: - return False - - try: - # Check if record exists using proper filter - filter_expr = self._create_record_id_filter(record_id) - existing = self._read_table_with_filter(delta_table, filters=filter_expr) - if len(existing) == 0: - return False - - # Delete the record using SQL-style predicate (this is correct for delete operations) - delta_table.delete( - f"{self.RECORD_ID_COLUMN} = '{record_id.replace(chr(39), chr(39) + chr(39))}'" - ) - - # Update cache - source_key = self._get_source_key(record_path) - self._delta_table_cache[source_key] = delta_table - - logger.debug(f"Deleted record {record_id} from {'/'.join(record_path)}") - return True - - except Exception as e: - logger.error( - f"Error deleting record {record_id} from {'/'.join(record_path)}: {e}" - ) - return False - - def get_table_info(self, record_path: tuple[str, ...]) -> dict[str, Any] | None: - """ - Get metadata information about a Delta table. 
- - Args: - record_path: Tuple of path components - - Returns: - Dictionary with table metadata, or None if table doesn't exist - """ - self._validate_record_path(record_path) - - delta_table = self._get_existing_delta_table(record_path) - if delta_table is None: - return None - - try: - # Get basic info - schema = delta_table.schema() - history = delta_table.history() - source_key = self._get_source_key(record_path) - - # Add pending batch info - pending_info = self.get_pending_batch_info() - pending_count = pending_info.get(source_key, 0) - - return { - "path": str(self._get_table_path(record_path)), - "record_path": record_path, - "schema": schema, - "version": delta_table.version(), - "num_files": len(delta_table.files()), - "history_length": len(history), - "latest_commit": history[0] if history else None, - "pending_records": pending_count, - } - - except Exception as e: - logger.error(f"Error getting table info for {'/'.join(record_path)}: {e}") - return None diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 9fcd198..8e2bb67 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -46,6 +46,24 @@ def run_in_thread(): logger = logging.getLogger(__name__) +class GraphNode: + def __init__(self, label: str, id: int, kernel_type: str): + self.label = label + self.id = id + self.kernel_type = kernel_type + + def __hash__(self): + return hash((self.id, self.kernel_type)) + + def __eq__(self, other): + if not isinstance(other, GraphNode): + return NotImplemented + return (self.id, self.kernel_type) == ( + other.id, + other.kernel_type, + ) + + class Pipeline(GraphTracker): """ Represents a pipeline in the system. @@ -125,6 +143,7 @@ def compile(self) -> None: ] node = self.wrap_invocation(invocation, new_input_streams=input_streams) + for parent in node.upstreams: node_graph.add_edge(parent.source, node) @@ -138,15 +157,17 @@ def compile(self) -> None: logger.info(f"Collision detected for label '{label}': {nodes}") for i, node in enumerate(nodes, start=1): self.nodes[f"{label}_{i}"] = node + node.label = f"{label}_{i}" else: self.nodes[label] = nodes[0] + nodes[0].label = label self.label_lut = {v: k for k, v in self.nodes.items()} self.graph = node_graph def show_graph(self, **kwargs) -> None: - render_graph(self.graph, self.label_lut, **kwargs) + render_graph(self.graph, **kwargs) def run( self, @@ -194,6 +215,7 @@ def wrap_invocation( pipeline_database=self.pipeline_database, pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, + kernel_type="pod", ) elif invocation in self.invocation_to_source_lut: source = self.invocation_to_source_lut[invocation] @@ -203,6 +225,7 @@ def wrap_invocation( pipeline_database=self.pipeline_database, pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, + kernel_type="source", ) else: node = KernelNode( @@ -211,6 +234,7 @@ def wrap_invocation( pipeline_database=self.pipeline_database, pipeline_path_prefix=self.pipeline_store_path_prefix, label=invocation.label, + kernel_type="operator", ) return node @@ -240,79 +264,206 @@ def rename(self, old_name: str, new_name: str) -> None: logger.info(f"Node '{old_name}' renamed to '{new_name}'") -# import networkx as nx -# # import graphviz -# import matplotlib.pyplot as plt -# import matplotlib.image as mpimg -# import tempfile -# import os +from typing import Optional, Dict, Any class GraphRenderer: - """Simple renderer for NetworkX graphs using Graphviz DOT format""" + """Improved GraphRenderer with centralized 
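The kernel_type carried by each node ("source", "operator", or "pod") is what the renderer's style rules key on, and GraphNode identity deliberately ignores the display label. A small sketch of that identity behavior (assuming GraphNode is importable from orcapod.pipeline.graph; labels and ids are hypothetical):

from orcapod.pipeline.graph import GraphNode

a = GraphNode(label="load_data", id=1, kernel_type="source")
b = GraphNode(label="load_data_1", id=1, kernel_type="source")
c = GraphNode(label="load_data", id=1, kernel_type="pod")

assert a == b            # equality ignores the label; (id, kernel_type) decide identity
assert a != c            # same id, different kernel_type -> different node
assert hash(a) == hash(b)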
default styling""" + + # ==================== + # CENTRALIZED DEFAULTS + # ==================== + DEFAULT_STYLES = { + "rankdir": "TB", + "node_shape": "box", + "node_style": "filled", + "node_color": "navy", + "font_color": "white", + "type_font_color": "#54508C", # muted navy blue + "font_name": "sans-serif", + "font_path": None, # Set to None by default + # 'font_path': './assets/fonts/LexendDeca-Medium.ttf', + "edge_color": "black", + "dpi": 150, + # HTML Label defaults + "main_font_size": 14, # Main label font size + "type_font_size": 11, # Pod type font size (small) + "type_style": "normal", # Pod type text style + } + + DEFAULT_STYLE_RULES = { + "source": { + "fillcolor": "white", + "shape": "rect", + "fontcolor": "black", + "style": "filled", + "typefontcolor": "#3A3737", # dark gray + }, + "operator": { + "fillcolor": "#DFD6CF", # pale beige + "shape": "diamond", + "fontcolor": "black", + "style": "filled", + "typefontcolor": "#3A3737", # dark gray + }, + "pod": { + "fillcolor": "#f5f5f5", # off white + "shape": "cylinder", + "fontcolor": "#090271", # darker navy blue + "style": "filled", + "typefontcolor": "#3A3737", # dark gray + }, + } + + DARK_THEME_RULES = { + "source": { + "fillcolor": "black", + "shape": "rect", + "fontcolor": "white", + "style": "filled", + "typefontcolor": "lightgray", # Light text for dark background + }, + "operator": { + "fillcolor": "#026e8e", # ocean blue + "shape": "diamond", + "fontcolor": "white", + "style": "filled", + "typefontcolor": "lightgray", # Light text for dark background + }, + "pod": { + "fillcolor": "#090271", # darker navy blue + "shape": "cylinder", + "fontcolor": "white", + "style": "filled", + "typefontcolor": "lightgray", # Light text for dark background + }, + } def __init__(self): - """Initialize the renderer""" pass def _sanitize_node_id(self, node_id: Any) -> str: - """Convert node_id to a valid DOT identifier using hash""" return f"node_{hash(node_id)}" + def _create_default_html_label(self, node, node_attrs) -> str: + """ + Create HTML for the label (text) section of the node + + Format: + kernel_type (11pt, small text) + main_label (14pt, normal text) + """ + + main_label = str(node.label) if hasattr(node, "label") else str(node) + kernel_type = str(node.kernel_type) if hasattr(node, "kernel_type") else "" + + if not kernel_type: + # No kernel_type, just return main label + return f'{main_label}' + + # Create HTML label: small kernel_type above, main label below + main_size = self.DEFAULT_STYLES["main_font_size"] + type_size = self.DEFAULT_STYLES["type_font_size"] + font_name = self.DEFAULT_STYLES["font_name"] + type_font_color = node_attrs.get( + "typefontcolor", self.DEFAULT_STYLES["type_font_color"] + ) + + html_label = f'''< + + + +
            <FONT POINT-SIZE="{type_size}" COLOR="{type_font_color}" FACE="{font_name}">{kernel_type}</FONT><BR/>
            <FONT POINT-SIZE="{main_size}" FACE="{font_name}">{main_label}</FONT>
+ >''' + + return html_label + def _get_node_label( - self, node_id: Any, label_lut: dict[Any, str] | None = None + self, node_id: Any, label_lut: Optional[Dict[Any, str]] = None ) -> str: - """Get label for a node""" if label_lut and node_id in label_lut: return label_lut[node_id] return str(node_id) + def _get_node_attributes( + self, node_id: Any, style_rules: Dict | None = None + ) -> Dict[str, str]: + """ + Get styling attributes for a specific node based on its properties + """ + # Use provided rules or defaults + rules = style_rules or self.DEFAULT_STYLE_RULES + + # Default attributes + default_attrs = { + "fillcolor": self.DEFAULT_STYLES["node_color"], + "shape": self.DEFAULT_STYLES["node_shape"], + "fontcolor": self.DEFAULT_STYLES["font_color"], + "fontname": self.DEFAULT_STYLES["font_name"], + "fontsize": self.DEFAULT_STYLES.get("fontsize", "14"), + "style": self.DEFAULT_STYLES["node_style"], + "typefontcolor": self.DEFAULT_STYLES["type_font_color"], + } + + # Check if node has kernel_type attribute + if hasattr(node_id, "kernel_type"): + kernel_type = node_id.kernel_type + if kernel_type in rules: + # Override defaults with rule-specific attributes + rule_attrs = rules[kernel_type].copy() + default_attrs.update(rule_attrs) + + return default_attrs + + def _merge_styles(self, **override_styles) -> dict: + """ + CENTRAL STYLE MERGING + Takes the default styles and overrides them with any user-provided styles. + """ + merged = self.DEFAULT_STYLES.copy() + merged.update(override_styles) # Override defaults with user choices + return merged + def generate_dot( self, graph: "nx.DiGraph", - label_lut: dict[Any, str] | None = None, - rankdir: str = "TB", - node_shape: str = "box", - node_style: str = "filled", - node_color: str = "lightblue", - edge_color: str = "black", - dpi: int = 150, + label_lut: Optional[Dict[Any, str]] = None, + style_rules: Optional[Dict] = None, + **style_overrides, ) -> str: - """ - Generate DOT syntax from NetworkX graph - - Args: - graph: NetworkX DiGraph to render - label_lut: Optional dictionary mapping node_id -> display_label - rankdir: Graph direction ('TB', 'BT', 'LR', 'RL') - node_shape: Shape for all nodes - node_style: Style for all nodes - node_color: Fill color for all nodes - edge_color: Color for all edges - dpi: Resolution for rendered image (default 150) + # Get final styles (defaults + overrides) + styles = self._merge_styles(**style_overrides) - Returns: - DOT format string - """ - try: - import graphviz - except ImportError as e: - raise ImportError( - "Graphviz is not installed. Please install graphviz to render graph of the pipeline." 
- ) from e + import graphviz dot = graphviz.Digraph(comment="NetworkX Graph") - # Set graph attributes - dot.attr(rankdir=rankdir, dpi=str(dpi)) - dot.attr("node", shape=node_shape, style=node_style, fillcolor=node_color) - dot.attr("edge", color=edge_color) + # Apply global styles + dot.attr(rankdir=styles["rankdir"], dpi=str(styles["dpi"])) + dot.attr(fontname=styles["font_name"]) + if styles.get("font_size"): + dot.attr(fontsize=styles["fontsize"]) + if styles["font_path"]: + dot.attr(fontpath=styles["font_path"]) - # Add nodes + # Set default edge attributes + dot.attr("edge", color=styles["edge_color"]) + + # Add nodes with default attribute specific styling for node_id in graph.nodes(): sanitized_id = self._sanitize_node_id(node_id) - label = self._get_node_label(node_id, label_lut) - dot.node(sanitized_id, label=label) + + node_attrs = self._get_node_attributes(node_id, style_rules) + + if label_lut and node_id in label_lut: + # Use custom label if provided + label = label_lut[node_id] + else: + # Use default HTML label with kernel_type above main label + label = self._create_default_html_label(node_id, node_attrs) + + # Add nodes with its specific attributes + dot.node(sanitized_id, label=label, **node_attrs) # Add edges for source, target in graph.edges(): @@ -324,61 +475,51 @@ def generate_dot( def render_graph( self, - graph: nx.DiGraph, - label_lut: dict[Any, str] | None = None, + graph: "nx.DiGraph", + label_lut: Optional[Dict[Any, str]] = None, show: bool = True, - output_path: str | None = None, + output_path: Optional[str] = None, raw_output: bool = False, - rankdir: str = "TB", - figsize: tuple = (6, 4), - dpi: int = 150, - **style_kwargs, - ) -> str | None: - """ - Render NetworkX graph using Graphviz + figsize: tuple = (12, 8), + dpi: int = 200, + style_rules: Optional[Dict] = None, + **style_overrides, + ) -> Optional[str]: + # Always generate DOT first + dot_text = self.generate_dot(graph, label_lut, style_rules, **style_overrides) - Args: - graph: NetworkX DiGraph to render - label_lut: Optional dictionary mapping node_id -> display_label - show: Display the graph using matplotlib - output_path: Save graph to file (e.g., 'graph.png', 'graph.pdf') - raw_output: Return DOT syntax instead of rendering - rankdir: Graph direction ('TB', 'BT', 'LR', 'RL') - figsize: Figure size for matplotlib display - dpi: Resolution for rendered image (default 150) - **style_kwargs: Additional styling (node_color, edge_color, node_shape, etc.) + if raw_output: + return dot_text - Returns: - DOT syntax if raw_output=True, None otherwise - """ - try: - import graphviz - except ImportError as e: - raise ImportError( - "Graphviz is not installed. Please install graphviz to render graph of the pipeline." 
- ) from e + # For rendering, continue with the existing logic but return DOT text + styles = self._merge_styles(**style_overrides) - if raw_output: - return self.generate_dot(graph, label_lut, rankdir, dpi=dpi, **style_kwargs) + import graphviz - # Create Graphviz object dot = graphviz.Digraph(comment="NetworkX Graph") - dot.attr(rankdir=rankdir, dpi=str(dpi)) - # Apply styling - node_shape = style_kwargs.get("node_shape", "box") - node_style = style_kwargs.get("node_style", "filled") - node_color = style_kwargs.get("node_color", "lightblue") - edge_color = style_kwargs.get("edge_color", "black") + # Apply styles directly + dot.attr(rankdir=styles["rankdir"], dpi=str(dpi)) + dot.attr(fontname=styles["font_name"]) + if styles.get("fontsize"): + dot.attr(fontsize=styles["fontsize"]) + if styles["font_path"]: + dot.attr(fontpath=styles["font_path"]) - dot.attr("node", shape=node_shape, style=node_style, fillcolor=node_color) - dot.attr("edge", color=edge_color) + # Set default edge attributes + dot.attr("edge", color=styles["edge_color"]) - # Add nodes with labels + # Add nodes with specific styling for node_id in graph.nodes(): sanitized_id = self._sanitize_node_id(node_id) - label = self._get_node_label(node_id, label_lut) - dot.node(sanitized_id, label=label) + node_attrs = self._get_node_attributes(node_id, style_rules) + + if label_lut and node_id in label_lut: + label = label_lut[node_id] + else: + label = self._create_default_html_label(node_id, node_attrs) + + dot.node(sanitized_id, label=label, **node_attrs) # Add edges for source, target in graph.edges(): @@ -386,51 +527,313 @@ def render_graph( target_id = self._sanitize_node_id(target) dot.edge(source_id, target_id) - # Handle output if output_path: - # Save to file name, ext = os.path.splitext(output_path) format_type = ext[1:] if ext else "png" dot.render(name, format=format_type, cleanup=True) print(f"Graph saved to {output_path}") + import matplotlib.pyplot as plt + import matplotlib.image as mpimg + if show: - # Display with matplotlib with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: dot.render(tmp.name[:-4], format="png", cleanup=True) - - import matplotlib.pyplot as plt - import matplotlib.image as mpimg - - # Display with matplotlib img = mpimg.imread(tmp.name) - plt.figure(figsize=figsize) - plt.imshow(img) + plt.figure(figsize=figsize, dpi=dpi) + plt.imshow(img, interpolation="none") plt.axis("off") - plt.title("Graph Visualization") + # plt.title("Example Graph") plt.tight_layout() plt.show() - - # Clean up os.unlink(tmp.name) - return None + # Always return DOT text (like the spec) + return dot_text -# Convenience function for quick rendering +# ===================== +# CONVENIENCE FUNCTION +# ===================== def render_graph( - graph: nx.DiGraph, label_lut: dict[Any, str] | None = None, **kwargs -) -> str | None: + graph: "nx.DiGraph", + label_lut: Optional[Dict[Any, str]] = None, + style_rules: Optional[Dict] = None, + **kwargs, +) -> Optional[str]: """ - Convenience function to quickly render a NetworkX graph + Convenience function with conditional node styling Args: - graph: NetworkX DiGraph to render - label_lut: Optional dictionary mapping node_id -> display_label - **kwargs: All other arguments passed to GraphRenderer.render_graph() + graph: NetworkX DiGraph + label_lut: Optional node labels + style_rules: Dict mapping node attributes to styling rules + **kwargs: Other styling arguments + """ + renderer = GraphRenderer() + return renderer.render_graph(graph, label_lut, 
style_rules=style_rules, **kwargs) + - Returns: - DOT syntax if raw_output=True, None otherwise +def render_graph_dark_theme( + graph: "nx.DiGraph", label_lut: Optional[Dict[Any, str]] = None, **kwargs +) -> Optional[str]: + """ + Render with dark theme - all backgrounds dark, all pod type fonts light + Perfect for dark themed presentations or displays """ renderer = GraphRenderer() - return renderer.render_graph(graph, label_lut, **kwargs) + return renderer.render_graph( + graph, label_lut, style_rules=renderer.DARK_THEME_RULES, **kwargs + ) + + +# ============================================= +# STYLE RULE SETS +# ============================================= + + +class StyleRuleSets: + """Access to different theme style rules""" + + @staticmethod + def get_default_rules(): + """Mixed theme - light node fill colors with dark colored fonts""" + return GraphRenderer.DEFAULT_STYLE_RULES + + @staticmethod + def get_dark_rules(): + """Dark theme - dark node fill colors with light colored fonts""" + return GraphRenderer.DARK_THEME_RULES + + @staticmethod + def create_custom_rules( + source_bg="lightgreen", + operator_bg="orange", + pod_bg="darkslateblue", + source_main_fcolor="black", + operator_main_fcolor="black", + pod_main_fcolor="white", + source_type_fcolor="darkgray", + operator_type_fcolor="darkgray", + kernel_type_fcolor="lightgray", + ): + """Create custom theme rules""" + return { + "source": { + "fillcolor": source_bg, + "shape": "ellipse", + "fontcolor": source_main_fcolor, + "style": "filled", + "type_font_color": source_type_fcolor, + }, + "operator": { + "fillcolor": operator_bg, + "shape": "diamond", + "fontcolor": operator_main_fcolor, + "style": "filled", + "type_font_color": operator_type_fcolor, + }, + "pod": { + "fillcolor": pod_bg, + "shape": "box", + "fontcolor": pod_main_fcolor, + "style": "filled,rounded", + "type_font_color": kernel_type_fcolor, + }, + } + + +# import networkx as nx +# # import graphviz +# import matplotlib.pyplot as plt +# import matplotlib.image as mpimg +# import tempfile +# import os + + +# class GraphRenderer: +# """Simple renderer for NetworkX graphs using Graphviz DOT format""" + +# def __init__(self): +# """Initialize the renderer""" +# pass + +# def _sanitize_node_id(self, node_id: Any) -> str: +# """Convert node_id to a valid DOT identifier using hash""" +# return f"node_{hash(node_id)}" + +# def _get_node_label( +# self, node_id: Any, label_lut: dict[Any, str] | None = None +# ) -> str: +# """Get label for a node""" +# if label_lut and node_id in label_lut: +# return label_lut[node_id] +# return str(node_id) + +# def generate_dot( +# self, +# graph: "nx.DiGraph", +# label_lut: dict[Any, str] | None = None, +# rankdir: str = "TB", +# node_shape: str = "box", +# node_style: str = "filled", +# node_color: str = "lightblue", +# edge_color: str = "black", +# dpi: int = 150, +# ) -> str: +# """ +# Generate DOT syntax from NetworkX graph + +# Args: +# graph: NetworkX DiGraph to render +# label_lut: Optional dictionary mapping node_id -> display_label +# rankdir: Graph direction ('TB', 'BT', 'LR', 'RL') +# node_shape: Shape for all nodes +# node_style: Style for all nodes +# node_color: Fill color for all nodes +# edge_color: Color for all edges +# dpi: Resolution for rendered image (default 150) + +# Returns: +# DOT format string +# """ +# try: +# import graphviz +# except ImportError as e: +# raise ImportError( +# "Graphviz is not installed. Please install graphviz to render graph of the pipeline." 
+# ) from e + +# dot = graphviz.Digraph(comment="NetworkX Graph") + +# # Set graph attributes +# dot.attr(rankdir=rankdir, dpi=str(dpi)) +# dot.attr("node", shape=node_shape, style=node_style, fillcolor=node_color) +# dot.attr("edge", color=edge_color) + +# # Add nodes +# for node_id in graph.nodes(): +# sanitized_id = self._sanitize_node_id(node_id) +# label = self._get_node_label(node_id, label_lut) +# dot.node(sanitized_id, label=label) + +# # Add edges +# for source, target in graph.edges(): +# source_id = self._sanitize_node_id(source) +# target_id = self._sanitize_node_id(target) +# dot.edge(source_id, target_id) + +# return dot.source + +# def render_graph( +# self, +# graph: nx.DiGraph, +# label_lut: dict[Any, str] | None = None, +# show: bool = True, +# output_path: str | None = None, +# raw_output: bool = False, +# rankdir: str = "TB", +# figsize: tuple = (6, 4), +# dpi: int = 150, +# **style_kwargs, +# ) -> str | None: +# """ +# Render NetworkX graph using Graphviz + +# Args: +# graph: NetworkX DiGraph to render +# label_lut: Optional dictionary mapping node_id -> display_label +# show: Display the graph using matplotlib +# output_path: Save graph to file (e.g., 'graph.png', 'graph.pdf') +# raw_output: Return DOT syntax instead of rendering +# rankdir: Graph direction ('TB', 'BT', 'LR', 'RL') +# figsize: Figure size for matplotlib display +# dpi: Resolution for rendered image (default 150) +# **style_kwargs: Additional styling (node_color, edge_color, node_shape, etc.) + +# Returns: +# DOT syntax if raw_output=True, None otherwise +# """ +# try: +# import graphviz +# except ImportError as e: +# raise ImportError( +# "Graphviz is not installed. Please install graphviz to render graph of the pipeline." +# ) from e + +# if raw_output: +# return self.generate_dot(graph, label_lut, rankdir, dpi=dpi, **style_kwargs) + +# # Create Graphviz object +# dot = graphviz.Digraph(comment="NetworkX Graph") +# dot.attr(rankdir=rankdir, dpi=str(dpi)) + +# # Apply styling +# node_shape = style_kwargs.get("node_shape", "box") +# node_style = style_kwargs.get("node_style", "filled") +# node_color = style_kwargs.get("node_color", "lightblue") +# edge_color = style_kwargs.get("edge_color", "black") + +# dot.attr("node", shape=node_shape, style=node_style, fillcolor=node_color) +# dot.attr("edge", color=edge_color) + +# # Add nodes with labels +# for node_id in graph.nodes(): +# sanitized_id = self._sanitize_node_id(node_id) +# label = self._get_node_label(node_id, label_lut) +# dot.node(sanitized_id, label=label) + +# # Add edges +# for source, target in graph.edges(): +# source_id = self._sanitize_node_id(source) +# target_id = self._sanitize_node_id(target) +# dot.edge(source_id, target_id) + +# # Handle output +# if output_path: +# # Save to file +# name, ext = os.path.splitext(output_path) +# format_type = ext[1:] if ext else "png" +# dot.render(name, format=format_type, cleanup=True) +# print(f"Graph saved to {output_path}") + +# if show: +# # Display with matplotlib +# with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: +# dot.render(tmp.name[:-4], format="png", cleanup=True) + +# import matplotlib.pyplot as plt +# import matplotlib.image as mpimg + +# # Display with matplotlib +# img = mpimg.imread(tmp.name) +# plt.figure(figsize=figsize) +# plt.imshow(img) +# plt.axis("off") +# plt.title("Graph Visualization") +# plt.tight_layout() +# plt.show() + +# # Clean up +# os.unlink(tmp.name) + +# return None + + +# # Convenience function for quick rendering +# def render_graph( +# 
graph: nx.DiGraph, label_lut: dict[Any, str] | None = None, **kwargs +# ) -> str | None: +# """ +# Convenience function to quickly render a NetworkX graph + +# Args: +# graph: NetworkX DiGraph to render +# label_lut: Optional dictionary mapping node_id -> display_label +# **kwargs: All other arguments passed to GraphRenderer.render_graph() + +# Returns: +# DOT syntax if raw_output=True, None otherwise +# """ +# renderer = GraphRenderer() +# return renderer.render_graph(graph, label_lut, **kwargs) diff --git a/src/orcapod/pipeline/nodes.py b/src/orcapod/pipeline/nodes.py index 6518aed..27f2abb 100644 --- a/src/orcapod/pipeline/nodes.py +++ b/src/orcapod/pipeline/nodes.py @@ -32,9 +32,11 @@ def __init__( input_streams: Collection[cp.Stream], pipeline_database: dbp.ArrowDatabase, pipeline_path_prefix: tuple[str, ...] = (), + kernel_type: str = "operator", **kwargs, ): super().__init__(**kwargs) + self.kernel_type = kernel_type self._cached_stream: KernelStream | None = None self._input_streams = tuple(input_streams) self._pipeline_path_prefix = pipeline_path_prefix @@ -55,6 +57,10 @@ def __init__( self.pipeline_database = pipeline_database + @property + def id(self) -> str: + return self.content_hash().to_string() + @property def upstreams(self) -> tuple[cp.Stream, ...]: return self._input_streams diff --git a/src/orcapod/semantic_types/semantic_registry.py b/src/orcapod/semantic_types/semantic_registry.py index ab334ba..aa1c604 100644 --- a/src/orcapod/semantic_types/semantic_registry.py +++ b/src/orcapod/semantic_types/semantic_registry.py @@ -4,7 +4,7 @@ from orcapod.utils.lazy_module import LazyModule # from orcapod.semantic_types.type_inference import infer_python_schema_from_pylist_data -from orcapod.types import PythonSchema +from orcapod.types import DataType, PythonSchema from orcapod.semantic_types import pydata_utils if TYPE_CHECKING: @@ -38,8 +38,8 @@ def infer_python_schema_from_pydict(data: dict[str, list[Any]]) -> PythonSchema: def __init__(self, converters: Mapping[str, SemanticStructConverter] | None = None): # Bidirectional mappings between Python types and struct signatures - self._python_to_struct: dict[type, "pa.StructType"] = {} - self._struct_to_python: dict["pa.StructType", type] = {} + self._python_to_struct: dict[DataType, "pa.StructType"] = {} + self._struct_to_python: dict["pa.StructType", DataType] = {} self._struct_to_converter: dict["pa.StructType", SemanticStructConverter] = {} # Name mapping for convenience @@ -101,7 +101,7 @@ def register_converter( self._struct_to_name[struct_signature] = semantic_type_name def get_converter_for_python_type( - self, python_type: type + self, python_type: DataType ) -> SemanticStructConverter | None: """Get converter registered to the Python type.""" # Direct lookup first @@ -140,7 +140,7 @@ def get_converter_for_struct_signature( def get_python_type_for_semantic_struct_signature( self, struct_signature: "pa.StructType" - ) -> type | None: + ) -> DataType | None: """ Get Python type registered to the Arrow struct signature. 
""" @@ -168,7 +168,7 @@ def list_semantic_types(self) -> list[str]: """Get all registered semantic type names.""" return list(self._name_to_converter.keys()) - def list_python_types(self) -> list[type]: + def list_python_types(self) -> list[DataType]: """Get all registered Python types.""" return list(self._python_to_struct.keys()) diff --git a/src/orcapod/semantic_types/type_inference.py b/src/orcapod/semantic_types/type_inference.py index 3a55c03..b51c267 100644 --- a/src/orcapod/semantic_types/type_inference.py +++ b/src/orcapod/semantic_types/type_inference.py @@ -1,11 +1,12 @@ from types import UnionType from typing import Any, Union, get_origin, get_args +from collections.abc import Collection, Mapping from orcapod.types import PythonSchema def infer_python_schema_from_pylist_data( - data: list[dict], + data: Collection[Mapping[str, Any]], default_type: type = str, ) -> PythonSchema: """ diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index b07eb7f..c3ba97e 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -11,8 +11,8 @@ import types from typing import TypedDict, Any -from collections.abc import Callable -import pyarrow as pa +import typing +from collections.abc import Callable, Mapping import hashlib import logging from orcapod.contexts import DataContext, resolve_context @@ -21,7 +21,16 @@ # Handle generic types from typing import get_origin, get_args -import typing + +from typing import TYPE_CHECKING +from orcapod.types import DataType, PythonSchemaLike +from orcapod.utils.lazy_module import LazyModule + +if TYPE_CHECKING: + import pyarrow as pa +else: + pa = LazyModule("pyarrow") + logger = logging.getLogger(__name__) @@ -97,19 +106,19 @@ def __init__(self, semantic_registry: SemanticTypeRegistry | None = None): self.semantic_registry = semantic_registry # Cache for created TypedDict classes - self._struct_signature_to_typeddict: dict[pa.StructType, type] = {} - self._typeddict_to_struct_signature: dict[type, pa.StructType] = {} + self._struct_signature_to_typeddict: dict[pa.StructType, DataType] = {} + self._typeddict_to_struct_signature: dict[DataType, pa.StructType] = {} self._created_type_names: set[str] = set() # Cache for conversion functions - self._python_to_arrow_converters: dict[type, Callable] = {} + self._python_to_arrow_converters: dict[DataType, Callable] = {} self._arrow_to_python_converters: dict[pa.DataType, Callable] = {} # Cache for type mappings - self._python_to_arrow_types: dict[type, pa.DataType] = {} - self._arrow_to_python_types: dict[pa.DataType, type] = {} + self._python_to_arrow_types: dict[DataType, pa.DataType] = {} + self._arrow_to_python_types: dict[pa.DataType, DataType] = {} - def python_type_to_arrow_type(self, python_type: type) -> pa.DataType: + def python_type_to_arrow_type(self, python_type: DataType) -> pa.DataType: """ Convert Python type hint to Arrow type with caching. @@ -127,7 +136,7 @@ def python_type_to_arrow_type(self, python_type: type) -> pa.DataType: return arrow_type def python_schema_to_arrow_schema( - self, python_schema: dict[str, type] + self, python_schema: PythonSchemaLike ) -> pa.Schema: """ Convert a Python schema (dict of field names to types) to an Arrow schema. 
@@ -141,7 +150,7 @@ def python_schema_to_arrow_schema( return pa.schema(fields) - def arrow_type_to_python_type(self, arrow_type: pa.DataType) -> type: + def arrow_type_to_python_type(self, arrow_type: pa.DataType) -> DataType: """ Convert Arrow type to Python type hint with caching. @@ -174,7 +183,7 @@ def arrow_schema_to_python_schema(self, arrow_schema: pa.Schema) -> dict[str, ty def python_dicts_to_struct_dicts( self, python_dicts: list[dict[str, Any]], - python_schema: dict[str, type] | None = None, + python_schema: PythonSchemaLike | None = None, ) -> list[dict[str, Any]]: """ Convert a list of Python dictionaries to an Arrow table. @@ -232,7 +241,7 @@ def struct_dict_to_python_dict( def python_dicts_to_arrow_table( self, python_dicts: list[dict[str, Any]], - python_schema: dict[str, type] | None = None, + python_schema: PythonSchemaLike | None = None, arrow_schema: "pa.Schema | None" = None, ) -> pa.Table: """ @@ -292,7 +301,9 @@ def arrow_table_to_python_dicts( return python_dicts - def get_python_to_arrow_converter(self, python_type: type) -> Callable[[Any], Any]: + def get_python_to_arrow_converter( + self, python_type: DataType + ) -> Callable[[Any], Any]: """ Get cached conversion function for Python value → Arrow value. @@ -326,7 +337,7 @@ def get_arrow_to_python_converter( return converter - def _convert_python_to_arrow(self, python_type: type) -> pa.DataType: + def _convert_python_to_arrow(self, python_type: DataType) -> pa.DataType: """Core Python → Arrow type conversion logic.""" if python_type in _PYTHON_TO_ARROW_MAP: @@ -351,7 +362,7 @@ def _convert_python_to_arrow(self, python_type: type) -> pa.DataType: if origin is None: # Handle string type names if hasattr(python_type, "__name__"): - type_name = python_type.__name__ + type_name = getattr(python_type, "__name__") if type_name in _PYTHON_TO_ARROW_MAP: return _PYTHON_TO_ARROW_MAP[type_name] raise ValueError(f"Unsupported Python type: {python_type}") @@ -526,7 +537,9 @@ def _convert_arrow_to_python(self, arrow_type: pa.DataType) -> type | Any: # Default case for unsupported types return Any - def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> type: + def _get_or_create_typeddict_for_struct( + self, struct_type: pa.StructType + ) -> DataType: """Get or create a TypedDict class for an Arrow struct type.""" # Check cache first @@ -534,7 +547,7 @@ def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> typ return self._struct_signature_to_typeddict[struct_type] # Create field specifications for TypedDict - field_specs: dict[str, type] = {} + field_specs: dict[str, DataType] = {} for field in struct_type: field_name = field.name python_type = self.arrow_type_to_python_type(field.type) @@ -552,7 +565,8 @@ def _get_or_create_typeddict_for_struct(self, struct_type: pa.StructType) -> typ return typeddict_class - def _generate_unique_type_name(self, field_specs: dict[str, type]) -> str: + # TODO: consider setting type of field_specs to PythonSchema + def _generate_unique_type_name(self, field_specs: Mapping[str, DataType]) -> str: """Generate a unique name for TypedDict based on field specifications.""" # Create deterministic signature that includes both names and types @@ -591,7 +605,7 @@ def _generate_unique_type_name(self, field_specs: dict[str, type]) -> str: return base_name def _create_python_to_arrow_converter( - self, python_type: type + self, python_type: DataType ) -> Callable[[Any], Any]: """Create a cached conversion function for Python → Arrow values.""" diff --git 
a/src/orcapod/utils/arrow_utils.py b/src/orcapod/utils/arrow_utils.py index 7d5376c..deeabcb 100644 --- a/src/orcapod/utils/arrow_utils.py +++ b/src/orcapod/utils/arrow_utils.py @@ -1,7 +1,6 @@ # TODO: move this to a separate module from collections import defaultdict -import pyarrow as pa from collections.abc import Mapping, Collection from typing import Any @@ -313,7 +312,7 @@ def pydict_to_pylist(pydict: dict) -> list[dict]: return result -def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: +def join_arrow_schemas(*schemas: "pa.Schema") -> "pa.Schema": """Join multiple Arrow schemas into a single schema, ensuring compatibility of fields. In particular, no field names should collide.""" merged_fields = [] @@ -323,8 +322,8 @@ def join_arrow_schemas(*schemas: pa.Schema) -> pa.Schema: def select_table_columns_with_prefixes( - table: pa.Table, prefix: str | Collection[str] -) -> pa.Table: + table: "pa.Table", prefix: str | Collection[str] +) -> "pa.Table": """ Select columns from a PyArrow table that start with a specific prefix. @@ -344,8 +343,8 @@ def select_table_columns_with_prefixes( def select_schema_columns_with_prefixes( - schema: pa.Schema, prefix: str | Collection[str] -) -> pa.Schema: + schema: "pa.Schema", prefix: str | Collection[str] +) -> "pa.Schema": """ Select columns from an Arrow schema that start with a specific prefix. @@ -364,7 +363,7 @@ def select_schema_columns_with_prefixes( return pa.schema(selected_fields) -def select_arrow_schema(schema: pa.Schema, columns: Collection[str]) -> pa.Schema: +def select_arrow_schema(schema: "pa.Schema", columns: Collection[str]) -> "pa.Schema": """ Select specific columns from an Arrow schema. @@ -379,7 +378,7 @@ def select_arrow_schema(schema: pa.Schema, columns: Collection[str]) -> pa.Schem return pa.schema(selected_fields) -def hstack_tables(*tables: pa.Table) -> pa.Table: +def hstack_tables(*tables: "pa.Table") -> "pa.Table": """ Horizontally stack multiple PyArrow tables by concatenating their columns. @@ -429,7 +428,7 @@ def hstack_tables(*tables: pa.Table) -> pa.Table: def check_arrow_schema_compatibility( - incoming_schema: pa.Schema, target_schema: pa.Schema, strict: bool = False + incoming_schema: "pa.Schema", target_schema: "pa.Schema", strict: bool = False ) -> tuple[bool, list[str]]: # TODO: add strict comparison """ @@ -501,7 +500,7 @@ def check_arrow_schema_compatibility( def split_by_column_groups( table, *column_groups: Collection[str], -) -> tuple[pa.Table | None, ...]: +) -> tuple["pa.Table | None", ...]: """ Split the table into multiple tables based on the provided column groups. Each group is a collection of column names that should be included in the same table. @@ -531,13 +530,13 @@ def split_by_column_groups( def prepare_prefixed_columns( - table: pa.Table | pa.RecordBatch, + table: "pa.Table | pa.RecordBatch", prefix_info: Collection[str] | Mapping[str, Any | None] | Mapping[str, Mapping[str, Any | None]], exclude_columns: Collection[str] = (), exclude_prefixes: Collection[str] = (), -) -> tuple[pa.Table, dict[str, pa.Table]]: +) -> tuple["pa.Table", dict[str, "pa.Table"]]: """ """ all_prefix_info = {} if isinstance(prefix_info, Mapping): @@ -636,7 +635,7 @@ def prepare_prefixed_columns( return data_table, result_tables -def drop_schema_columns(schema: pa.Schema, columns: Collection[str]) -> pa.Schema: +def drop_schema_columns(schema: "pa.Schema", columns: Collection[str]) -> "pa.Schema": """ Drop specified columns from a PyArrow schema. 
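Note on the arrow_utils.py hunks above: every pyarrow annotation is quoted ("pa.Schema", "pa.Table") and the module-level `import pyarrow as pa` is dropped, matching the TYPE_CHECKING/LazyModule pattern added to universal_converter.py, where pyarrow is imported only for the type checker and presumably bound to a lazily-loaded module at runtime. The sketch below is a minimal, self-contained illustration of why the string annotations are needed once the eager import is gone; `_LazyModule` and `drop_columns_example` are hypothetical stand-ins (orcapod's real `LazyModule` lives in orcapod.utils.lazy_module and may differ), not the library's actual implementation.

    # A minimal sketch, assuming pyarrow is installed but should not be imported
    # until something actually uses it. "_LazyModule" is a hypothetical stand-in
    # for orcapod.utils.lazy_module.LazyModule; "drop_columns_example" is
    # illustrative only.
    import importlib
    from typing import TYPE_CHECKING, Any

    if TYPE_CHECKING:
        import pyarrow as pa  # real import exists only for the type checker
    else:
        class _LazyModule:
            """Load the named module on first attribute access."""

            def __init__(self, name: str) -> None:
                self._name = name
                self._module = None

            def __getattr__(self, attr: str) -> Any:
                if self._module is None:
                    self._module = importlib.import_module(self._name)
                return getattr(self._module, attr)

        pa = _LazyModule("pyarrow")


    def drop_columns_example(schema: "pa.Schema", columns: set[str]) -> "pa.Schema":
        # The quoted annotations are plain strings and are never evaluated at
        # runtime, so this module imports cleanly before pyarrow is loaded;
        # the pa.schema(...) call below is what finally triggers the real
        # import through the lazy proxy.
        return pa.schema([f for f in schema if f.name not in columns])

Unquoted annotations such as `schema: pa.Schema` would be evaluated when the function is defined, forcing the pyarrow import at module load time (or failing outright when pyarrow is not installed), which is exactly what the quoting in these hunks avoids.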
diff --git a/src/orcapod/utils/types_utils.py b/src/orcapod/utils/types_utils.py index 372d15e..5c25d03 100644 --- a/src/orcapod/utils/types_utils.py +++ b/src/orcapod/utils/types_utils.py @@ -5,7 +5,7 @@ from orcapod.types import PythonSchema, PythonSchemaLike import inspect import logging - +import sys logger = logging.getLogger(__name__) @@ -230,14 +230,79 @@ def get_typespec_from_dict( } +# def get_compatible_type(type1: Any, type2: Any) -> Any: +# if type1 is type2: +# return type1 +# if issubclass(type1, type2): +# return type2 +# if issubclass(type2, type1): +# return type1 +# raise TypeError(f"Types {type1} and {type2} are not compatible") + + def get_compatible_type(type1: Any, type2: Any) -> Any: + # Handle identical types if type1 is type2: return type1 - if issubclass(type1, type2): + + # Handle None/NoneType + if type1 is type(None) or type2 is type(None): + # You might want to handle Optional types here + if type1 is type(None): + return type2 + return type1 + + # Get origins for generic types (e.g., list from list[int]) + origin1 = get_origin(type1) or type1 + origin2 = get_origin(type2) or type2 + + # If origins are different, check basic subclass relationship + if origin1 != origin2: + try: + if issubclass(origin1, origin2): + return type2 + if issubclass(origin2, origin1): + return type1 + except TypeError: + # issubclass fails on some special forms + pass + raise TypeError(f"Types {type1} and {type2} are not compatible") + + # Same origin - check type arguments + args1 = get_args(type1) + args2 = get_args(type2) + + # If no type arguments, return the origin + if not args1 and not args2: + return origin1 + + # If only one has type arguments, prefer the more specific one + if not args1: return type2 - if issubclass(type2, type1): + if not args2: return type1 - raise TypeError(f"Types {type1} and {type2} are not compatible") + + # Both have type arguments - recursively check compatibility + if len(args1) != len(args2): + raise TypeError(f"Types {type1} and {type2} have incompatible argument counts") + + compatible_args = [] + for arg1, arg2 in zip(args1, args2): + try: + compatible_args.append(get_compatible_type(arg1, arg2)) + except TypeError: + raise TypeError( + f"Types {type1} and {type2} have incompatible type arguments" + ) + + # Reconstruct the generic type + if sys.version_info >= (3, 9): + return origin1[tuple(compatible_args)] + else: + # For Python < 3.9, you might need to use typing._GenericAlias + from typing import _GenericAlias + + return _GenericAlias(origin1, tuple(compatible_args)) def union_typespecs(*typespecs: PythonSchema) -> PythonSchema: From 4c397d634d1c902ef521f396034cd11693dc5453 Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Mon, 1 Sep 2025 12:21:50 -0700 Subject: [PATCH 223/224] buid: remove pixi and cleanup pyproject.toml --- pixi.lock | 3431 ------------------------------------------------ pyproject.toml | 20 +- 2 files changed, 1 insertion(+), 3450 deletions(-) delete mode 100644 pixi.lock diff --git a/pixi.lock b/pixi.lock deleted file mode 100644 index 54acc8c..0000000 --- a/pixi.lock +++ /dev/null @@ -1,3431 +0,0 @@ -version: 6 -environments: - all: - channels: - - url: https://conda.anaconda.org/conda-forge/ - indexes: - - https://pypi.org/simple - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda - - pypi: https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl - - pypi: ./ - default: - channels: - - url: https://conda.anaconda.org/conda-forge/ - indexes: - - https://pypi.org/simple - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda - - pypi: 
https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: ./ - dev: - channels: - - url: https://conda.anaconda.org/conda-forge/ - indexes: - - https://pypi.org/simple - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.2-py313h8060acc_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda - - pypi: https://files.pythonhosted.org/packages/cb/ed/d1bf75c089857d38332cf45416e419b47382b345ba5dfc4fae69397830d9/adlfs-2024.12.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e2/68/b29577197aa2e54b50d6f214524790cc1cb27d289585ad7c7bdfe5125285/aiobotocore-2.24.0-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/85/13/58b70a580de00893223d61de8fea167877a3aed97d4a5e1405c9159ef925/aioitertools-0.12.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4f/d3/a8b22fa575b297cd6e3e3b0155c7e25db170edf1c74783d6a31a2490b8d9/argon2_cffi-25.1.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/09/52/94108adfdd6e2ddf58be64f959a0b9c7d4ef2fa71086c38356d22dc501ea/argon2_cffi_bindings-25.1.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d4/78/bf94897361fdd650850f0f2e405b2293e2f12808239046232bdedf554301/azure_core-1.35.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/88/2a/75f56b14f115189155cf12e46b366ad1fe3357af5a1a7c09f7446662d617/azure_datalake_store-0.0.53-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a9/74/17428cb429e8d52f6d0d69ed685f4760a545cb0156594963a9337b53b6c9/azure_identity-1.24.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5b/64/63dbfdd83b31200ac58820a7951ddfdeed1fbee9285b0f3eae12d1357155/azure_storage_blob-12.26.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/72/66/88566a6484e746c0b075f7c9bb248e8548eda0a486de4460d150a41e2d57/boto3-1.39.11-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/1c/2c/8a0b02d60a1dbbae7faa5af30484b016aa3023f9833dfc0d19b0b770dd6a/botocore-1.39.11-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/ea/2f/6ae1db51dc34db499bfe340e89f79a63bd115fc32513a7bacdf17d33cd86/coverage-7.10.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/79/62/120842ab20d9150a9d3a6bdc07fe2870384e82f5266d41c53b08a3a96b34/cryptography-45.0.6-cp311-abi3-manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4d/a0/c95baae08a75bceabb79868d663a0736655e427ab9c81fb848da29edaeac/debugpy-1.8.16-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/2f/e0/014d5d9d7a4564cf1c40b5039bc882db69fd881111e03ab3657ac0b218e2/fsspec-2025.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/21/f5/54bccbee01efbc25581db6aafefb6f6c277d880930f7a083b10052382463/gcsfs-2025.7.0-py2.py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/41/9d/2814a2c47429dc2e197e176de25a946d4538422b081ade8638e585e4006f/google_cloud_storage-3.3.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/03/b6/39bcf01e1185882f34bc9fb77d1fb4a27911a55f60ab407de34abc8a2347/httpie-3.2.4-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fc/c7/b445faca8deb954fe536abebff4ece5b097b923de482b26e78448c89d1dd/ipykernel-6.30.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/11/85/b0394e0b6fcccd2c1eeefc230978a6f8cb0c5df1e4cd3e7625735a0d7d1e/jupyter_client-8.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/96/2b/34cc11786bc00d0f04d0f5fdc3a2b1ae0b6239eef72d3d345805f9ad92a1/markdown-3.8.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/89/a3/00260f8df72b51afa1f182dd609533c77fa2407918c4c2813d87b4a56725/minio-7.2.16-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9f/d4/029f984e8d3f3b6b726bd33cafc473b75e9e44c0f7e80a5b29abc466bdea/mkdocs_get_deps-0.2.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/34/82/fc5ce89006389a6426ef28e326fc065b0fbaaed230373b62d14c889f47ea/mmh3-5.2.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/86/5b/fbc73e91f7727ae1e79b21ed833308e99dc11cc1cd3d4717f579775de5e9/msal-1.33.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5e/75/bd9b7bb966668920f06b200e84454c8f3566b102183bc55c5473d96cb2b9/msal_extensions-1.3.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/ca/a1/d0c333111d801c77a83a32f793222c4b9aef7de0fdb2ceb73a1980a6c98b/pyarrow_stubs-20.0.0.20250716-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5f/e9/a09476d436d0ff1402ac3867d933c61805ec2326c6ea557aeeac3825604e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/bd/6a/6c1ac381ff0b8e03a9abc2f05722f6002d7452a2c05118697b3f3910e171/pyiceberg-0.9.1.tar.gz - - pypi: https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/bc/16/4ea354101abb1287856baa4af2732be351c7bee728065aed451b678153fd/pytest_cov-6.2.1-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7e/0a/2356305c423a975000867de56888b79e44ec2192c690ff93c3109fd78081/pyzmq-27.0.1-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e8/67/0c3c9179a3ad19791ef1b8f7138aa27d4578c78700551c60d9260b2c660d/ruff-0.12.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/ff/c7/30d13b7fd4f866ca3f30e9a6e7ae038f0c45226f6e26b3cc98d6d197f93b/s3fs-2025.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/6d/4f/d073e09df851cfa251ef7840007d04db3293a0482ce607d2b993926089be/s3transfer-0.13.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/96/7c/a81ef5ef10978dd073a854e0fa93b5d8021d0594b639cc8f6453c3c78a1d/strictyaml-1.7.3-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f9/41/fb15f06e33d7430ca89420283a8762a4e6b8025b800ea51796ab5e6d9559/tornado-6.5.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl - - pypi: ./ - ray: - channels: - - url: https://conda.anaconda.org/conda-forge/ - indexes: - - https://pypi.org/simple - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda - - pypi: https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/85/b8/9e7175e1fa0ac8e56baa83bf3c214823ce250d0028955dfb23f43d5e61fd/aiohttp-3.12.15-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/98/3b/40a68de458904bcc143622015fff2352b6461cd92fd66d3527bf1c6f5716/aiohttp_cors-0.8.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7e/95/42aa2156235cbc8fa61208aded06ef46111c4d3f0de233107b3f38631803/charset_normalizer-3.4.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e2/98/0d791b3d1eaed89d7d370b5cf9b8079b124da0545559417f394ba21b5532/colorful-0.5.7-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/72/31/bc8c5c99c7818293458fe745dab4fd5730ff49697ccc82b554eb69f16a24/frozenlist-1.7.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/3f/ca/4fdc7bf59bf6994aa45cbd4ef1055cd65e2884de6113dbd49f75498ddb08/grpcio-1.74.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/63/f8/0031ee2b906a15a33d6bfc12dd09c3dfa966b3cb5b284ecfb7549e6ac3c4/ipython-9.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fe/54/c86cd8e011fe98803d7e382fd67c0df5ceab8d2b7ad8c5a81524f791551c/jsonschema-4.25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/01/0e/b27cdbaccf30b890c40ed1da9fd4a3593a5cf94dae54fb34f8a4b74fcd3f/jsonschema_specifications-2025.4.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/22/2ebae7ae43cd8f2debc35c631172ddf14e2a87ffcc04cf43ff9df9fff0d3/msgpack-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/80/e5/5e22c5bf96a64bdd43518b1834c6d95a4922cc2066b7d8e467dae9b6cee6/multidict-6.6.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b5/ed/9fbdeb23a09e430d87b7d72d430484b88184633dc50f6bfb792354b6f661/opencensus-0.11.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/10/68/162c97ea78c957d68ecf78a5c5041d2e25bd5562bdf5d89a6cbf7f8429bf/opencensus_context-0.1.3-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/c1/1c/40fb93a7b7e495985393bbc734104d5d20e470811644dd56c2402d683739/opentelemetry_exporter_prometheus-0.57b0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl - - pypi: 
https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl - - pypi: ./ - redis: - channels: - - url: https://conda.anaconda.org/conda-forge/ - indexes: - - https://pypi.org/simple - packages: - linux-64: - - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - - conda: 
https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - - conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2025.8.3-hbd8a1cb_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.44-h1423503_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.1-hecca717_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.6-h2dba641_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.1.0-h69a702a_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.1.0-h767d61c_4.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb9d3cd8_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.50.4-h0c1763c_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.5.2-h26f9b46_0.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - - conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda - - pypi: https://files.pythonhosted.org/packages/b4/e1/6ab0dd6f362f95ef855d2ba7aacf55c9dd08c55a3d8c5339eafa20f3e0f3/arro3_core-0.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/94/31/87045d1c66ee10a52486c9d2047bc69f00f2689f69401bb1e998afb4b205/beartype-0.21.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/24/00/24dbce2a5c13c69b04dba718e64e4f74d5882ac94350228a004a27e5975c/deltalake-1.1.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/75/b4/b96bb66f6f8cc4669de44a158099b249c8159231d254ab6b092909388be5/fonttools-4.59.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: 
https://files.pythonhosted.org/packages/52/1b/233e3094b749df16e3e6cd5a44849fd33852e692ad009cf7de00cf58ddf6/matplotlib-3.10.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/1d/0f/571b2c7a3833ae419fe69ff7b479a78d313581785203cc70a8db90121b9a/numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl - - pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - - pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - - pypi: ./ -packages: -- conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 - sha256: fe51de6107f9edc7aa4f786a70f4a883943bc9d39b3bb7307c04c41410990726 - md5: d7c89558ba9fa0495403155b64376d81 - license: None - purls: [] - size: 2562 - timestamp: 1578324546067 -- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2 - build_number: 16 - sha256: fbe2c5e56a653bebb982eda4876a9178aedfc2b545f25d0ce9c4c0b508253d22 - md5: 73aaf86a425cc6e73fcf236a5a46396d - depends: - - _libgcc_mutex 0.1 conda_forge - - libgomp >=7.5.0 - constrains: - - openmp_impl 9999 - license: 
[pixi.lock hunk elided: the removed lines continue the lock file, enumerating pinned pypi wheels and conda packages (adlfs, aiobotocore, aiohappyeyeballs, aiohttp, aiosignal, annotated-types, argon2-cffi, arro3-core, asttokens, attrs, azure-core/azure-datalake-store/azure-identity/azure-storage-blob, beartype, boto3/botocore, bzip2, ca-certificates, cachetools, certifi, cffi, charset-normalizer, click, comm, contourpy, coverage, cryptography, cycler, debugpy, decorator, defusedxml, deltalake, deprecated, distlib, executing, filelock, fonttools, frozenlist, fsspec, gcsfs, ghp-import, google-api-core/google-auth/google-cloud-storage/google-crc32c, googleapis-common-protos, grpcio, httpie, idna, importlib-metadata, iniconfig, ipykernel, ipython, ipywidgets, isodate, jedi, jinja2, jmespath, jsonschema, jupyter-client, jupyter-core, jupyterlab-widgets, kiwisolver, ld_impl_linux-64, libexpat, libffi, libgcc, libgomp, liblzma, libmpdec, libsqlite, libuuid, libzlib, markdown, markdown-it-py, markupsafe, matplotlib, matplotlib-inline, mdurl, mergedeep, minio, mkdocs, mkdocs-get-deps, mmh3, msal, msal-extensions, msgpack, multidict, ncurses, nest-asyncio, networkx, numpy, oauthlib, opencensus, openssl, opentelemetry-api, opentelemetry-exporter-prometheus, ...), each entry carrying its download URL, version, sha256, requires_dist/depends, and requires_python metadata.]
https://files.pythonhosted.org/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl - name: opentelemetry-proto - version: 1.36.0 - sha256: 151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e - requires_dist: - - protobuf>=5.0,<7.0 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl - name: opentelemetry-sdk - version: 1.36.0 - sha256: 19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb - requires_dist: - - opentelemetry-api==1.36.0 - - opentelemetry-semantic-conventions==0.57b0 - - typing-extensions>=4.5.0 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl - name: opentelemetry-semantic-conventions - version: 0.57b0 - sha256: 757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78 - requires_dist: - - opentelemetry-api==1.36.0 - - typing-extensions>=4.5.0 - requires_python: '>=3.9' -- pypi: ./ - name: orcapod - version: 0.0.3a2.dev30+g144ef6a.d20250815 - sha256: ca2f39ccbaf238434a839d578fbcb67fdf7d345a5ee936cb60d5f025d4bc7d81 - requires_dist: - - xxhash - - networkx - - typing-extensions - - matplotlib>=3.10.3 - - pandas>=2.2.3 - - pyyaml>=6.0.2 - - pyarrow>=20.0.0 - - polars>=1.31.0 - - beartype>=0.21.0 - - deltalake>=1.0.2 - - redis>=6.2.0 ; extra == 'redis' - - ray[default]==2.48.0 ; extra == 'ray' - - ipywidgets>=8.1.7 ; extra == 'ray' - - orcapod[redis] ; extra == 'all' - - orcapod[ray] ; extra == 'all' - requires_python: '>=3.12.0' - editable: true -- pypi: https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl - name: packaging - version: '25.0' - sha256: 29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/e9/e2/20a317688435470872885e7fc8f95109ae9683dec7c50be29b56911515a5/pandas-2.3.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: pandas - version: 2.3.1 - sha256: 2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9 - requires_dist: - - numpy>=1.22.4 ; python_full_version < '3.11' - - numpy>=1.23.2 ; python_full_version == '3.11.*' - - numpy>=1.26.0 ; python_full_version >= '3.12' - - python-dateutil>=2.8.2 - - pytz>=2020.1 - - tzdata>=2022.7 - - hypothesis>=6.46.1 ; extra == 'test' - - pytest>=7.3.2 ; extra == 'test' - - pytest-xdist>=2.2.0 ; extra == 'test' - - pyarrow>=10.0.1 ; extra == 'pyarrow' - - bottleneck>=1.3.6 ; extra == 'performance' - - numba>=0.56.4 ; extra == 'performance' - - numexpr>=2.8.4 ; extra == 'performance' - - scipy>=1.10.0 ; extra == 'computation' - - xarray>=2022.12.0 ; extra == 'computation' - - fsspec>=2022.11.0 ; extra == 'fss' - - s3fs>=2022.11.0 ; extra == 'aws' - - gcsfs>=2022.11.0 ; extra == 'gcp' - - pandas-gbq>=0.19.0 ; extra == 'gcp' - - odfpy>=1.4.1 ; extra == 'excel' - - openpyxl>=3.1.0 ; extra == 'excel' - - python-calamine>=0.1.7 ; extra == 'excel' - - pyxlsb>=1.0.10 ; extra == 'excel' - - xlrd>=2.0.1 ; extra == 'excel' - - xlsxwriter>=3.0.5 ; extra == 'excel' - - pyarrow>=10.0.1 ; extra == 'parquet' - - pyarrow>=10.0.1 ; extra == 'feather' - - tables>=3.8.0 ; extra == 'hdf5' - - pyreadstat>=1.2.0 ; extra == 'spss' - - sqlalchemy>=2.0.0 ; extra == 'postgresql' - 
- psycopg2>=2.9.6 ; extra == 'postgresql' - - adbc-driver-postgresql>=0.8.0 ; extra == 'postgresql' - - sqlalchemy>=2.0.0 ; extra == 'mysql' - - pymysql>=1.0.2 ; extra == 'mysql' - - sqlalchemy>=2.0.0 ; extra == 'sql-other' - - adbc-driver-postgresql>=0.8.0 ; extra == 'sql-other' - - adbc-driver-sqlite>=0.8.0 ; extra == 'sql-other' - - beautifulsoup4>=4.11.2 ; extra == 'html' - - html5lib>=1.1 ; extra == 'html' - - lxml>=4.9.2 ; extra == 'html' - - lxml>=4.9.2 ; extra == 'xml' - - matplotlib>=3.6.3 ; extra == 'plot' - - jinja2>=3.1.2 ; extra == 'output-formatting' - - tabulate>=0.9.0 ; extra == 'output-formatting' - - pyqt5>=5.15.9 ; extra == 'clipboard' - - qtpy>=2.3.0 ; extra == 'clipboard' - - zstandard>=0.19.0 ; extra == 'compression' - - dataframe-api-compat>=0.1.7 ; extra == 'consortium-standard' - - adbc-driver-postgresql>=0.8.0 ; extra == 'all' - - adbc-driver-sqlite>=0.8.0 ; extra == 'all' - - beautifulsoup4>=4.11.2 ; extra == 'all' - - bottleneck>=1.3.6 ; extra == 'all' - - dataframe-api-compat>=0.1.7 ; extra == 'all' - - fastparquet>=2022.12.0 ; extra == 'all' - - fsspec>=2022.11.0 ; extra == 'all' - - gcsfs>=2022.11.0 ; extra == 'all' - - html5lib>=1.1 ; extra == 'all' - - hypothesis>=6.46.1 ; extra == 'all' - - jinja2>=3.1.2 ; extra == 'all' - - lxml>=4.9.2 ; extra == 'all' - - matplotlib>=3.6.3 ; extra == 'all' - - numba>=0.56.4 ; extra == 'all' - - numexpr>=2.8.4 ; extra == 'all' - - odfpy>=1.4.1 ; extra == 'all' - - openpyxl>=3.1.0 ; extra == 'all' - - pandas-gbq>=0.19.0 ; extra == 'all' - - psycopg2>=2.9.6 ; extra == 'all' - - pyarrow>=10.0.1 ; extra == 'all' - - pymysql>=1.0.2 ; extra == 'all' - - pyqt5>=5.15.9 ; extra == 'all' - - pyreadstat>=1.2.0 ; extra == 'all' - - pytest>=7.3.2 ; extra == 'all' - - pytest-xdist>=2.2.0 ; extra == 'all' - - python-calamine>=0.1.7 ; extra == 'all' - - pyxlsb>=1.0.10 ; extra == 'all' - - qtpy>=2.3.0 ; extra == 'all' - - scipy>=1.10.0 ; extra == 'all' - - s3fs>=2022.11.0 ; extra == 'all' - - sqlalchemy>=2.0.0 ; extra == 'all' - - tables>=3.8.0 ; extra == 'all' - - tabulate>=0.9.0 ; extra == 'all' - - xarray>=2022.12.0 ; extra == 'all' - - xlrd>=2.0.1 ; extra == 'all' - - xlsxwriter>=3.0.5 ; extra == 'all' - - zstandard>=0.19.0 ; extra == 'all' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl - name: parso - version: 0.8.4 - sha256: a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18 - requires_dist: - - flake8==5.0.4 ; extra == 'qa' - - mypy==0.971 ; extra == 'qa' - - types-setuptools==67.2.0.1 ; extra == 'qa' - - docopt ; extra == 'testing' - - pytest ; extra == 'testing' - requires_python: '>=3.6' -- pypi: https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl - name: pathspec - version: 0.12.1 - sha256: a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl - name: pexpect - version: 4.9.0 - sha256: 7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523 - requires_dist: - - ptyprocess>=0.5 -- pypi: https://files.pythonhosted.org/packages/d5/1c/a2a29649c0b1983d3ef57ee87a66487fdeb45132df66ab30dd37f7dbe162/pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl - name: pillow - version: 11.3.0 - sha256: 
13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8 - requires_dist: - - furo ; extra == 'docs' - - olefile ; extra == 'docs' - - sphinx>=8.2 ; extra == 'docs' - - sphinx-autobuild ; extra == 'docs' - - sphinx-copybutton ; extra == 'docs' - - sphinx-inline-tabs ; extra == 'docs' - - sphinxext-opengraph ; extra == 'docs' - - olefile ; extra == 'fpx' - - olefile ; extra == 'mic' - - pyarrow ; extra == 'test-arrow' - - check-manifest ; extra == 'tests' - - coverage>=7.4.2 ; extra == 'tests' - - defusedxml ; extra == 'tests' - - markdown2 ; extra == 'tests' - - olefile ; extra == 'tests' - - packaging ; extra == 'tests' - - pyroma ; extra == 'tests' - - pytest ; extra == 'tests' - - pytest-cov ; extra == 'tests' - - pytest-timeout ; extra == 'tests' - - pytest-xdist ; extra == 'tests' - - trove-classifiers>=2024.10.12 ; extra == 'tests' - - typing-extensions ; python_full_version < '3.10' and extra == 'typing' - - defusedxml ; extra == 'xmp' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl - name: pip - version: '25.2' - sha256: 6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl - name: platformdirs - version: 4.3.8 - sha256: ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4 - requires_dist: - - furo>=2024.8.6 ; extra == 'docs' - - proselint>=0.14 ; extra == 'docs' - - sphinx-autodoc-typehints>=3 ; extra == 'docs' - - sphinx>=8.1.3 ; extra == 'docs' - - appdirs==1.4.4 ; extra == 'test' - - covdefaults>=2.3 ; extra == 'test' - - pytest-cov>=6 ; extra == 'test' - - pytest-mock>=3.14 ; extra == 'test' - - pytest>=8.3.4 ; extra == 'test' - - mypy>=1.14.1 ; extra == 'type' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl - name: pluggy - version: 1.6.0 - sha256: e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746 - requires_dist: - - pre-commit ; extra == 'dev' - - tox ; extra == 'dev' - - pytest ; extra == 'testing' - - pytest-benchmark ; extra == 'testing' - - coverage ; extra == 'testing' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/ec/14/ee34ebe3eb842c83ca1d2d3af6ee02b08377e056ffad156c9a2b15a6d05c/polars-1.32.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: polars - version: 1.32.2 - sha256: a711a750cfc19f1f883d2b46895dd698abf4d446ca41c3bf510ced0ff1178057 - requires_dist: - - polars-cloud>=0.0.1a1 ; extra == 'polars-cloud' - - numpy>=1.16.0 ; extra == 'numpy' - - pandas ; extra == 'pandas' - - polars[pyarrow] ; extra == 'pandas' - - pyarrow>=7.0.0 ; extra == 'pyarrow' - - pydantic ; extra == 'pydantic' - - fastexcel>=0.9 ; extra == 'calamine' - - openpyxl>=3.0.0 ; extra == 'openpyxl' - - xlsx2csv>=0.8.0 ; extra == 'xlsx2csv' - - xlsxwriter ; extra == 'xlsxwriter' - - polars[calamine,openpyxl,xlsx2csv,xlsxwriter] ; extra == 'excel' - - adbc-driver-manager[dbapi] ; extra == 'adbc' - - adbc-driver-sqlite[dbapi] ; extra == 'adbc' - - connectorx>=0.3.2 ; extra == 'connectorx' - - sqlalchemy ; extra == 'sqlalchemy' - - polars[pandas] ; extra == 'sqlalchemy' - - polars[adbc,connectorx,sqlalchemy] ; extra == 'database' - - fsspec ; extra == 'fsspec' - - deltalake>=1.0.0 ; extra == 
'deltalake' - - pyiceberg>=0.7.1 ; extra == 'iceberg' - - gevent ; extra == 'async' - - cloudpickle ; extra == 'cloudpickle' - - matplotlib ; extra == 'graph' - - altair>=5.4.0 ; extra == 'plot' - - great-tables>=0.8.0 ; extra == 'style' - - tzdata ; sys_platform == 'win32' and extra == 'timezone' - - cudf-polars-cu12 ; extra == 'gpu' - - polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone] ; extra == 'all' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/32/ae/ec06af4fe3ee72d16973474f122541746196aaa16cea6f66d18b963c6177/prometheus_client-0.22.1-py3-none-any.whl - name: prometheus-client - version: 0.22.1 - sha256: cca895342e308174341b2cbf99a56bef291fbc0ef7b9e5412a0f26d653ba7094 - requires_dist: - - twisted ; extra == 'twisted' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/ce/4f/5249960887b1fbe561d9ff265496d170b55a735b76724f10ef19f9e40716/prompt_toolkit-3.0.51-py3-none-any.whl - name: prompt-toolkit - version: 3.0.51 - sha256: 52742911fde84e2d423e2f9a4cf1de7d7ac4e51958f648d9540e0fb8db077b07 - requires_dist: - - wcwidth - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/af/81/b324c44ae60c56ef12007105f1460d5c304b0626ab0cc6b07c8f2a9aa0b8/propcache-0.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: propcache - version: 0.3.2 - sha256: 4c1396592321ac83157ac03a2023aa6cc4a3cc3cfdecb71090054c09e5a7cce3 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl - name: proto-plus - version: 1.26.1 - sha256: 13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66 - requires_dist: - - protobuf>=3.19.0,<7.0.0 - - google-api-core>=1.31.5 ; extra == 'testing' - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl - name: protobuf - version: 6.31.1 - sha256: 4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: psutil - version: 7.0.0 - sha256: 4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34 - requires_dist: - - pytest ; extra == 'dev' - - pytest-xdist ; extra == 'dev' - - setuptools ; extra == 'dev' - - abi3audit ; extra == 'dev' - - black==24.10.0 ; extra == 'dev' - - check-manifest ; extra == 'dev' - - coverage ; extra == 'dev' - - packaging ; extra == 'dev' - - pylint ; extra == 'dev' - - pyperf ; extra == 'dev' - - pypinfo ; extra == 'dev' - - pytest-cov ; extra == 'dev' - - requests ; extra == 'dev' - - rstcheck ; extra == 'dev' - - ruff ; extra == 'dev' - - sphinx ; extra == 'dev' - - sphinx-rtd-theme ; extra == 'dev' - - toml-sort ; extra == 'dev' - - twine ; extra == 'dev' - - virtualenv ; extra == 'dev' - - vulture ; extra == 'dev' - - wheel ; extra == 'dev' - - pytest ; extra == 'test' - - pytest-xdist ; extra == 'test' - - setuptools ; extra == 'test' - requires_python: '>=3.6' -- pypi: https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl - name: ptyprocess - version: 0.7.0 - sha256: 
4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 -- pypi: https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl - name: pure-eval - version: 0.2.3 - sha256: 1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0 - requires_dist: - - pytest ; extra == 'tests' -- pypi: https://files.pythonhosted.org/packages/68/fb/bc7f639aed026bca6e7beb1e33f6951e16b7d315594e7635a4f7d21d63f4/py_spy-0.4.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl - name: py-spy - version: 0.4.1 - sha256: 6a80ec05eb8a6883863a367c6a4d4f2d57de68466f7956b6367d4edd5c61bb29 - requires_dist: - - numpy ; extra == 'test' -- pypi: https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl - name: pyarrow - version: 21.0.0 - sha256: 69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61 - requires_dist: - - pytest ; extra == 'test' - - hypothesis ; extra == 'test' - - cffi ; extra == 'test' - - pytz ; extra == 'test' - - pandas ; extra == 'test' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/ca/a1/d0c333111d801c77a83a32f793222c4b9aef7de0fdb2ceb73a1980a6c98b/pyarrow_stubs-20.0.0.20250716-py3-none-any.whl - name: pyarrow-stubs - version: 20.0.0.20250716 - sha256: 8ecfdd215af468d6b993e2290da7f3d51a32991c1d230b90682f7ee4bc5ee7cd - requires_dist: - - pyarrow>=20 - requires_python: '>=3.9,<4' -- pypi: https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl - name: pyasn1 - version: 0.6.1 - sha256: 0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl - name: pyasn1-modules - version: 0.4.2 - sha256: 29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a - requires_dist: - - pyasn1>=0.6.1,<0.7.0 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl - name: pycparser - version: '2.22' - sha256: c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/5f/e9/a09476d436d0ff1402ac3867d933c61805ec2326c6ea557aeeac3825604e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: pycryptodome - version: 3.23.0 - sha256: c8987bd3307a39bc03df5c8e0e3d8be0c4c3518b7f044b0f4c15d1aa78f52575 - requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*' -- pypi: https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl - name: pydantic - version: 2.11.7 - sha256: dde5df002701f6de26248661f6835bbe296a47bf73990135c7d07ce741b9623b - requires_dist: - - annotated-types>=0.6.0 - - pydantic-core==2.33.2 - - typing-extensions>=4.12.2 - - typing-inspection>=0.4.0 - - email-validator>=2.0.0 ; extra == 'email' - - tzdata ; python_full_version >= '3.9' and sys_platform == 'win32' and extra == 'timezone' - requires_python: '>=3.9' -- pypi: 
https://files.pythonhosted.org/packages/eb/3c/f4abd740877a35abade05e437245b192f9d0ffb48bbbbd708df33d3cda37/pydantic_core-2.33.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: pydantic-core - version: 2.33.2 - sha256: 9fdac5d6ffa1b5a83bca06ffe7583f5576555e6c8b3a91fbd25ea7780f825f7d - requires_dist: - - typing-extensions>=4.6.0,!=4.7.0 - requires_python: '>=3.9' -- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda - sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a - md5: 6b6ece66ebcae2d5f326c77ef2c5a066 - depends: - - python >=3.9 - license: BSD-2-Clause - license_family: BSD - purls: - - pkg:pypi/pygments?source=hash-mapping - size: 889287 - timestamp: 1750615908735 -- pypi: https://files.pythonhosted.org/packages/bd/6a/6c1ac381ff0b8e03a9abc2f05722f6002d7452a2c05118697b3f3910e171/pyiceberg-0.9.1.tar.gz - name: pyiceberg - version: 0.9.1 - sha256: 3634134ce33859a441768b39df179b2c6f3de2bbbf506622884f553b013ee799 - requires_dist: - - adlfs>=2023.1.0 ; extra == 'adlfs' - - boto3>=1.24.59 ; extra == 'dynamodb' or extra == 'glue' or extra == 'rest-sigv4' - - cachetools>=5.5.0,<6.0.0 - - click>=7.1.1,<9.0.0 - - duckdb>=0.5.0,<2.0.0 ; extra == 'duckdb' - - fsspec>=2023.1.0 - - gcsfs>=2023.1.0 ; extra == 'gcsfs' - - getdaft>=0.2.12 ; extra == 'daft' - - kerberos>=1.3.1,<2.0.0 ; extra == 'hive-kerberos' - - mmh3>=4.0.0,<6.0.0 - - mypy-boto3-glue>=1.28.18 ; extra == 'glue' - - pandas>=1.0.0,<3.0.0 ; extra == 'pandas' or extra == 'ray' - - polars>=1.21.0,<2.0.0 ; extra == 'polars' - - psycopg2-binary>=2.9.6 ; extra == 'sql-postgres' - - pyarrow>=17.0.0,<20.0.0 ; extra == 'duckdb' or extra == 'pandas' or extra == 'pyarrow' or extra == 'ray' - - pydantic>=2.0,!=2.4.0,!=2.4.1,<3.0 - - pyiceberg-core>=0.4.0,<0.5.0 ; extra == 'pyiceberg-core' - - pyparsing>=3.1.0,<4.0.0 - - python-snappy>=0.6.0,<1.0.0 ; extra == 'snappy' - - ray==2.10.0 ; python_full_version < '3.9' and extra == 'ray' - - ray>=2.10.0,<3.0.0 ; python_full_version >= '3.9' and extra == 'ray' - - requests>=2.20.0,<3.0.0 - - rich>=10.11.0,<14.0.0 - - s3fs>=2023.1.0 ; extra == 's3fs' - - sortedcontainers==2.4.0 - - sqlalchemy>=2.0.18,<3.0.0 ; extra == 'sql-postgres' or extra == 'sql-sqlite' - - strictyaml>=1.7.0,<2.0.0 - - tenacity>=8.2.3,<10.0.0 - - thrift>=0.13.0,<1.0.0 ; extra == 'hive' or extra == 'hive-kerberos' - - thrift-sasl>=0.4.3 ; extra == 'hive-kerberos' - - zstandard>=0.13.0,<1.0.0 ; extra == 'zstandard' - requires_python: '!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9' -- pypi: https://files.pythonhosted.org/packages/61/ad/689f02752eeec26aed679477e80e632ef1b682313be70793d798c1d5fc8f/PyJWT-2.10.1-py3-none-any.whl - name: pyjwt - version: 2.10.1 - sha256: dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb - requires_dist: - - cryptography>=3.4.0 ; extra == 'crypto' - - coverage[toml]==5.0.4 ; extra == 'dev' - - cryptography>=3.4.0 ; extra == 'dev' - - pre-commit ; extra == 'dev' - - pytest>=6.0.0,<7.0.0 ; extra == 'dev' - - sphinx ; extra == 'dev' - - sphinx-rtd-theme ; extra == 'dev' - - zope-interface ; extra == 'dev' - - sphinx ; extra == 'docs' - - sphinx-rtd-theme ; extra == 'docs' - - zope-interface ; extra == 'docs' - - coverage[toml]==5.0.4 ; extra == 'tests' - - pytest>=6.0.0,<7.0.0 ; extra == 'tests' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl - name: pyparsing 
- version: 3.2.3 - sha256: a749938e02d6fd0b59b356ca504a24982314bb090c383e3cf201c95ef7e2bfcf - requires_dist: - - railroad-diagrams ; extra == 'diagrams' - - jinja2 ; extra == 'diagrams' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl - name: pysocks - version: 1.7.1 - sha256: 2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5 - requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*' -- pypi: https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl - name: pytest - version: 8.4.1 - sha256: 539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7 - requires_dist: - - colorama>=0.4 ; sys_platform == 'win32' - - exceptiongroup>=1 ; python_full_version < '3.11' - - iniconfig>=1 - - packaging>=20 - - pluggy>=1.5,<2 - - pygments>=2.7.2 - - tomli>=1 ; python_full_version < '3.11' - - argcomplete ; extra == 'dev' - - attrs>=19.2 ; extra == 'dev' - - hypothesis>=3.56 ; extra == 'dev' - - mock ; extra == 'dev' - - requests ; extra == 'dev' - - setuptools ; extra == 'dev' - - xmlschema ; extra == 'dev' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/bc/16/4ea354101abb1287856baa4af2732be351c7bee728065aed451b678153fd/pytest_cov-6.2.1-py3-none-any.whl - name: pytest-cov - version: 6.2.1 - sha256: f5bc4c23f42f1cdd23c70b1dab1bbaef4fc505ba950d53e0081d0730dd7e86d5 - requires_dist: - - pytest>=6.2.5 - - coverage[toml]>=7.5 - - pluggy>=1.2 - - fields ; extra == 'testing' - - hunter ; extra == 'testing' - - process-tests ; extra == 'testing' - - pytest-xdist ; extra == 'testing' - - virtualenv ; extra == 'testing' - requires_python: '>=3.9' -- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.13.5-hec9711d_102_cp313.conda - build_number: 102 - sha256: c2cdcc98ea3cbf78240624e4077e164dc9d5588eefb044b4097c3df54d24d504 - md5: 89e07d92cf50743886f41638d58c4328 - depends: - - __glibc >=2.17,<3.0.a0 - - bzip2 >=1.0.8,<2.0a0 - - ld_impl_linux-64 >=2.36.1 - - libexpat >=2.7.0,<3.0a0 - - libffi >=3.4.6,<3.5.0a0 - - libgcc >=13 - - liblzma >=5.8.1,<6.0a0 - - libmpdec >=4.0.0,<5.0a0 - - libsqlite >=3.50.1,<4.0a0 - - libuuid >=2.38.1,<3.0a0 - - libzlib >=1.3.1,<2.0a0 - - ncurses >=6.5,<7.0a0 - - openssl >=3.5.0,<4.0a0 - - python_abi 3.13.* *_cp313 - - readline >=8.2,<9.0a0 - - tk >=8.6.13,<8.7.0a0 - - tzdata - license: Python-2.0 - purls: [] - size: 33273132 - timestamp: 1750064035176 - python_site_packages_path: lib/python3.13/site-packages -- pypi: https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl - name: python-dateutil - version: 2.9.0.post0 - sha256: a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 - requires_dist: - - six>=1.5 - requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*' -- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.13-8_cp313.conda - build_number: 8 - sha256: 210bffe7b121e651419cb196a2a63687b087497595c9be9d20ebe97dd06060a7 - md5: 94305520c52a4aa3f6c2b1ff6008d9f8 - constrains: - - python 3.13.* *_cp313 - license: BSD-3-Clause - license_family: BSD - purls: [] - size: 7002 - timestamp: 1752805902938 -- pypi: https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl - name: pytz - version: '2025.2' - sha256: 
5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00 -- pypi: https://files.pythonhosted.org/packages/04/24/b7721e4845c2f162d26f50521b825fb061bc0a5afcf9a386840f23ea19fa/PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: pyyaml - version: 6.0.2 - sha256: 70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl - name: pyyaml-env-tag - version: '1.1' - sha256: 17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04 - requires_dist: - - pyyaml - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/7e/0a/2356305c423a975000867de56888b79e44ec2192c690ff93c3109fd78081/pyzmq-27.0.1-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl - name: pyzmq - version: 27.0.1 - sha256: f5b6133c8d313bde8bd0d123c169d22525300ff164c2189f849de495e1344577 - requires_dist: - - cffi ; implementation_name == 'pypy' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/dd/4f/bb511598091f06cc7d781868caf833a0c3459b4f51c0b36cfb75dfaa7e4e/ray-2.48.0-cp313-cp313-manylinux2014_x86_64.whl - name: ray - version: 2.48.0 - sha256: 25e4b79fcc8f849d72db1acc4f03f37008c5c0b745df63d8a30cd35676b6545e - requires_dist: - - click>=7.0 - - filelock - - jsonschema - - msgpack>=1.0.0,<2.0.0 - - packaging - - protobuf>=3.15.3,!=3.19.5 - - pyyaml - - requests - - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'cgraph' - - grpcio!=1.56.0 ; sys_platform == 'darwin' and extra == 'client' - - grpcio ; extra == 'client' - - numpy>=1.20 ; extra == 'data' - - pandas>=1.3 ; extra == 'data' - - pyarrow>=9.0.0 ; extra == 'data' - - fsspec ; extra == 'data' - - aiohttp>=3.7 ; extra == 'default' - - aiohttp-cors ; extra == 'default' - - colorful ; extra == 'default' - - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'default' - - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'default' - - requests ; extra == 'default' - - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'default' - - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'default' - - opencensus ; extra == 'default' - - opentelemetry-sdk>=1.30.0 ; extra == 'default' - - opentelemetry-exporter-prometheus ; extra == 'default' - - opentelemetry-proto ; extra == 'default' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'default' - - prometheus-client>=0.7.1 ; extra == 'default' - - smart-open ; extra == 'default' - - virtualenv>=20.0.24,!=20.21.1 ; extra == 'default' - - memray ; sys_platform != 'win32' and extra == 'observability' - - colorful ; extra == 'serve' - - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'serve' - - opencensus ; extra == 'serve' - - aiohttp>=3.7 ; extra == 'serve' - - prometheus-client>=0.7.1 ; extra == 'serve' - - aiohttp-cors ; extra == 'serve' - - opentelemetry-exporter-prometheus ; extra == 'serve' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'serve' - - virtualenv>=20.0.24,!=20.21.1 ; extra == 'serve' - - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'serve' - - uvicorn[standard] ; extra == 'serve' - - fastapi ; extra == 'serve' - - requests ; extra == 'serve' - - opentelemetry-sdk>=1.30.0 ; extra == 'serve' - - smart-open ; extra == 'serve' - - opentelemetry-proto ; extra == 'serve' - - starlette ; extra == 'serve' - - grpcio>=1.42.0 ; python_full_version >= '3.10' and 
extra == 'serve' - - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'serve' - - watchfiles ; extra == 'serve' - - pandas ; extra == 'tune' - - tensorboardx>=1.9 ; extra == 'tune' - - requests ; extra == 'tune' - - pyarrow>=9.0.0 ; extra == 'tune' - - fsspec ; extra == 'tune' - - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'adag' - - colorful ; extra == 'serve-grpc' - - opencensus ; extra == 'serve-grpc' - - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'serve-grpc' - - aiohttp>=3.7 ; extra == 'serve-grpc' - - prometheus-client>=0.7.1 ; extra == 'serve-grpc' - - aiohttp-cors ; extra == 'serve-grpc' - - opentelemetry-exporter-prometheus ; extra == 'serve-grpc' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'serve-grpc' - - virtualenv>=20.0.24,!=20.21.1 ; extra == 'serve-grpc' - - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'serve-grpc' - - uvicorn[standard] ; extra == 'serve-grpc' - - fastapi ; extra == 'serve-grpc' - - requests ; extra == 'serve-grpc' - - opentelemetry-sdk>=1.30.0 ; extra == 'serve-grpc' - - smart-open ; extra == 'serve-grpc' - - opentelemetry-proto ; extra == 'serve-grpc' - - starlette ; extra == 'serve-grpc' - - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'serve-grpc' - - pyopenssl ; extra == 'serve-grpc' - - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'serve-grpc' - - watchfiles ; extra == 'serve-grpc' - - ray-cpp==2.48.0 ; extra == 'cpp' - - pandas ; extra == 'rllib' - - tensorboardx>=1.9 ; extra == 'rllib' - - requests ; extra == 'rllib' - - pyarrow>=9.0.0 ; extra == 'rllib' - - fsspec ; extra == 'rllib' - - dm-tree ; extra == 'rllib' - - gymnasium==1.0.0 ; extra == 'rllib' - - lz4 ; extra == 'rllib' - - ormsgpack==1.7.0 ; extra == 'rllib' - - pyyaml ; extra == 'rllib' - - scipy ; extra == 'rllib' - - pandas ; extra == 'train' - - tensorboardx>=1.9 ; extra == 'train' - - requests ; extra == 'train' - - pyarrow>=9.0.0 ; extra == 'train' - - fsspec ; extra == 'train' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'train' - - colorful ; extra == 'air' - - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'air' - - opencensus ; extra == 'air' - - aiohttp>=3.7 ; extra == 'air' - - prometheus-client>=0.7.1 ; extra == 'air' - - aiohttp-cors ; extra == 'air' - - tensorboardx>=1.9 ; extra == 'air' - - opentelemetry-exporter-prometheus ; extra == 'air' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'air' - - virtualenv>=20.0.24,!=20.21.1 ; extra == 'air' - - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'air' - - pandas>=1.3 ; extra == 'air' - - uvicorn[standard] ; extra == 'air' - - fsspec ; extra == 'air' - - fastapi ; extra == 'air' - - requests ; extra == 'air' - - opentelemetry-sdk>=1.30.0 ; extra == 'air' - - smart-open ; extra == 'air' - - opentelemetry-proto ; extra == 'air' - - pyarrow>=9.0.0 ; extra == 'air' - - starlette ; extra == 'air' - - pandas ; extra == 'air' - - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'air' - - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'air' - - numpy>=1.20 ; extra == 'air' - - watchfiles ; extra == 'air' - - colorful ; extra == 'all' - - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'all' - - opencensus ; extra == 'all' - - aiohttp>=3.7 ; extra == 'all' - - grpcio!=1.56.0 ; sys_platform == 'darwin' and extra == 'all' - - scipy ; extra == 'all' - - prometheus-client>=0.7.1 ; extra == 'all' - - aiohttp-cors ; extra == 'all' - - 
opentelemetry-exporter-prometheus ; extra == 'all' - - tensorboardx>=1.9 ; extra == 'all' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'all' - - virtualenv>=20.0.24,!=20.21.1 ; extra == 'all' - - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'all' - - pandas>=1.3 ; extra == 'all' - - uvicorn[standard] ; extra == 'all' - - ormsgpack==1.7.0 ; extra == 'all' - - fsspec ; extra == 'all' - - fastapi ; extra == 'all' - - requests ; extra == 'all' - - opentelemetry-sdk>=1.30.0 ; extra == 'all' - - gymnasium==1.0.0 ; extra == 'all' - - smart-open ; extra == 'all' - - memray ; sys_platform != 'win32' and extra == 'all' - - dm-tree ; extra == 'all' - - lz4 ; extra == 'all' - - opentelemetry-proto ; extra == 'all' - - pyarrow>=9.0.0 ; extra == 'all' - - starlette ; extra == 'all' - - pandas ; extra == 'all' - - pyyaml ; extra == 'all' - - grpcio ; extra == 'all' - - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'all' - - pyopenssl ; extra == 'all' - - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'all' - - numpy>=1.20 ; extra == 'all' - - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'all' - - watchfiles ; extra == 'all' - - colorful ; extra == 'all-cpp' - - opencensus ; extra == 'all-cpp' - - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'all-cpp' - - aiohttp>=3.7 ; extra == 'all-cpp' - - grpcio!=1.56.0 ; sys_platform == 'darwin' and extra == 'all-cpp' - - ray-cpp==2.48.0 ; extra == 'all-cpp' - - scipy ; extra == 'all-cpp' - - prometheus-client>=0.7.1 ; extra == 'all-cpp' - - aiohttp-cors ; extra == 'all-cpp' - - opentelemetry-exporter-prometheus ; extra == 'all-cpp' - - tensorboardx>=1.9 ; extra == 'all-cpp' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'all-cpp' - - virtualenv>=20.0.24,!=20.21.1 ; extra == 'all-cpp' - - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'all-cpp' - - pandas>=1.3 ; extra == 'all-cpp' - - uvicorn[standard] ; extra == 'all-cpp' - - ormsgpack==1.7.0 ; extra == 'all-cpp' - - fsspec ; extra == 'all-cpp' - - fastapi ; extra == 'all-cpp' - - requests ; extra == 'all-cpp' - - opentelemetry-sdk>=1.30.0 ; extra == 'all-cpp' - - gymnasium==1.0.0 ; extra == 'all-cpp' - - smart-open ; extra == 'all-cpp' - - memray ; sys_platform != 'win32' and extra == 'all-cpp' - - dm-tree ; extra == 'all-cpp' - - lz4 ; extra == 'all-cpp' - - opentelemetry-proto ; extra == 'all-cpp' - - pyarrow>=9.0.0 ; extra == 'all-cpp' - - starlette ; extra == 'all-cpp' - - pandas ; extra == 'all-cpp' - - pyyaml ; extra == 'all-cpp' - - grpcio ; extra == 'all-cpp' - - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'all-cpp' - - pyopenssl ; extra == 'all-cpp' - - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'all-cpp' - - numpy>=1.20 ; extra == 'all-cpp' - - cupy-cuda12x ; sys_platform != 'darwin' and extra == 'all-cpp' - - watchfiles ; extra == 'all-cpp' - - colorful ; extra == 'llm' - - grpcio>=1.32.0 ; python_full_version < '3.10' and extra == 'llm' - - opencensus ; extra == 'llm' - - aiohttp>=3.7 ; extra == 'llm' - - prometheus-client>=0.7.1 ; extra == 'llm' - - aiohttp-cors ; extra == 'llm' - - opentelemetry-exporter-prometheus ; extra == 'llm' - - vllm>=0.9.2 ; extra == 'llm' - - typer ; extra == 'llm' - - pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 ; extra == 'llm' - - virtualenv>=20.0.24,!=20.21.1 ; extra == 'llm' - - jsonschema ; extra == 'llm' - - py-spy>=0.4.0 ; python_full_version >= '3.12' and extra == 'llm' - - pandas>=1.3 ; extra == 'llm' - - 
uvicorn[standard] ; extra == 'llm' - - ninja ; extra == 'llm' - - fsspec ; extra == 'llm' - - fastapi ; extra == 'llm' - - requests ; extra == 'llm' - - opentelemetry-sdk>=1.30.0 ; extra == 'llm' - - smart-open ; extra == 'llm' - - jsonref>=1.1.0 ; extra == 'llm' - - opentelemetry-proto ; extra == 'llm' - - pyarrow>=9.0.0 ; extra == 'llm' - - starlette ; extra == 'llm' - - async-timeout ; python_full_version < '3.11' and extra == 'llm' - - grpcio>=1.42.0 ; python_full_version >= '3.10' and extra == 'llm' - - py-spy>=0.2.0 ; python_full_version < '3.12' and extra == 'llm' - - numpy>=1.20 ; extra == 'llm' - - watchfiles ; extra == 'llm' - requires_python: '>=3.9' -- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8c095d6_2.conda - sha256: 2d6d0c026902561ed77cd646b5021aef2d4db22e57a5b0178dfc669231e06d2c - md5: 283b96675859b20a825f8fa30f311446 - depends: - - libgcc >=13 - - ncurses >=6.5,<7.0a0 - license: GPL-3.0-only - license_family: GPL - purls: [] - size: 282480 - timestamp: 1740379431762 -- pypi: https://files.pythonhosted.org/packages/e8/02/89e2ed7e85db6c93dfa9e8f691c5087df4e3551ab39081a4d7c6d1f90e05/redis-6.4.0-py3-none-any.whl - name: redis - version: 6.4.0 - sha256: f0544fa9604264e9464cdf4814e7d4830f74b165d52f2a330a760a88dd248b7f - requires_dist: - - async-timeout>=4.0.3 ; python_full_version < '3.11.3' - - hiredis>=3.2.0 ; extra == 'hiredis' - - pyjwt>=2.9.0 ; extra == 'jwt' - - cryptography>=36.0.1 ; extra == 'ocsp' - - pyopenssl>=20.0.1 ; extra == 'ocsp' - - requests>=2.31.0 ; extra == 'ocsp' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/c1/b1/3baf80dc6d2b7bc27a95a67752d0208e410351e3feb4eb78de5f77454d8d/referencing-0.36.2-py3-none-any.whl - name: referencing - version: 0.36.2 - sha256: e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0 - requires_dist: - - attrs>=22.2.0 - - rpds-py>=0.7.0 - - typing-extensions>=4.4.0 ; python_full_version < '3.13' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl - name: requests - version: 2.32.4 - sha256: 27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c - requires_dist: - - charset-normalizer>=2,<4 - - idna>=2.5,<4 - - urllib3>=1.21.1,<3 - - certifi>=2017.4.17 - - pysocks>=1.5.6,!=1.5.7 ; extra == 'socks' - - chardet>=3.0.2,<6 ; extra == 'use-chardet-on-py3' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl - name: requests-oauthlib - version: 2.0.0 - sha256: 7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36 - requires_dist: - - oauthlib>=3.0.0 - - requests>=2.0.0 - - oauthlib[signedtoken]>=3.0.0 ; extra == 'rsa' - requires_python: '>=3.4' -- pypi: https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl - name: requests-toolbelt - version: 1.0.0 - sha256: cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 - requires_dist: - - requests>=2.0.1,<3.0.0 - requires_python: '>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*' -- pypi: https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl - name: rich - version: 13.9.4 - sha256: 6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 - requires_dist: - - 
ipywidgets>=7.5.1,<9 ; extra == 'jupyter' - - markdown-it-py>=2.2.0 - - pygments>=2.13.0,<3.0.0 - - typing-extensions>=4.0.0,<5.0 ; python_full_version < '3.11' - requires_python: '>=3.8.0' -- pypi: https://files.pythonhosted.org/packages/33/cc/6b0ee8f0ba3f2df2daac1beda17fde5cf10897a7d466f252bd184ef20162/rpds_py-0.27.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: rpds-py - version: 0.27.0 - sha256: be0744661afbc4099fef7f4e604e7f1ea1be1dd7284f357924af12a705cc7d5c - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl - name: rsa - version: 4.9.1 - sha256: 68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 - requires_dist: - - pyasn1>=0.1.3 - requires_python: '>=3.6,<4' -- pypi: https://files.pythonhosted.org/packages/e8/67/0c3c9179a3ad19791ef1b8f7138aa27d4578c78700551c60d9260b2c660d/ruff-0.12.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: ruff - version: 0.12.8 - sha256: 560e0cd641e45591a3e42cb50ef61ce07162b9c233786663fdce2d8557d99818 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/ff/c7/30d13b7fd4f866ca3f30e9a6e7ae038f0c45226f6e26b3cc98d6d197f93b/s3fs-2025.7.0-py3-none-any.whl - name: s3fs - version: 2025.7.0 - sha256: b6b2d3f84b6aa1c2ba5e62e39dd9410cf54f10a2cce1ea6db1ba0d1a6bcce685 - requires_dist: - - aiobotocore>=2.5.4,<3.0.0 - - fsspec==2025.7.0 - - aiohttp!=4.0.0a0,!=4.0.0a1 - - aiobotocore[awscli]>=2.5.4,<3.0.0 ; extra == 'awscli' - - aiobotocore[boto3]>=2.5.4,<3.0.0 ; extra == 'boto3' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/6d/4f/d073e09df851cfa251ef7840007d04db3293a0482ce607d2b993926089be/s3transfer-0.13.1-py3-none-any.whl - name: s3transfer - version: 0.13.1 - sha256: a981aa7429be23fe6dfc13e80e4020057cbab622b08c0315288758d67cabc724 - requires_dist: - - botocore>=1.37.4,<2.0a0 - - botocore[crt]>=1.37.4,<2.0a0 ; extra == 'crt' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl - name: setuptools - version: 80.9.0 - sha256: 062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922 - requires_dist: - - pytest>=6,!=8.1.* ; extra == 'test' - - virtualenv>=13.0.0 ; extra == 'test' - - wheel>=0.44.0 ; extra == 'test' - - pip>=19.1 ; extra == 'test' - - packaging>=24.2 ; extra == 'test' - - jaraco-envs>=2.2 ; extra == 'test' - - pytest-xdist>=3 ; extra == 'test' - - jaraco-path>=3.7.2 ; extra == 'test' - - build[virtualenv]>=1.0.3 ; extra == 'test' - - filelock>=3.4.0 ; extra == 'test' - - ini2toml[lite]>=0.14 ; extra == 'test' - - tomli-w>=1.0.0 ; extra == 'test' - - pytest-timeout ; extra == 'test' - - pytest-perf ; sys_platform != 'cygwin' and extra == 'test' - - jaraco-develop>=7.21 ; python_full_version >= '3.9' and sys_platform != 'cygwin' and extra == 'test' - - pytest-home>=0.5 ; extra == 'test' - - pytest-subprocess ; extra == 'test' - - pyproject-hooks!=1.1 ; extra == 'test' - - jaraco-test>=5.5 ; extra == 'test' - - sphinx>=3.5 ; extra == 'doc' - - jaraco-packaging>=9.3 ; extra == 'doc' - - rst-linker>=1.9 ; extra == 'doc' - - furo ; extra == 'doc' - - sphinx-lint ; extra == 'doc' - - jaraco-tidelift>=1.4 ; extra == 'doc' - - pygments-github-lexers==0.0.5 ; extra == 'doc' - - sphinx-favicon ; extra == 'doc' - - sphinx-inline-tabs ; extra == 'doc' - - sphinx-reredirects ; extra == 'doc' - - sphinxcontrib-towncrier ; 
extra == 'doc' - - sphinx-notfound-page>=1,<2 ; extra == 'doc' - - pyproject-hooks!=1.1 ; extra == 'doc' - - towncrier<24.7 ; extra == 'doc' - - packaging>=24.2 ; extra == 'core' - - more-itertools>=8.8 ; extra == 'core' - - jaraco-text>=3.7 ; extra == 'core' - - importlib-metadata>=6 ; python_full_version < '3.10' and extra == 'core' - - tomli>=2.0.1 ; python_full_version < '3.11' and extra == 'core' - - wheel>=0.43.0 ; extra == 'core' - - platformdirs>=4.2.2 ; extra == 'core' - - jaraco-functools>=4 ; extra == 'core' - - more-itertools ; extra == 'core' - - pytest-checkdocs>=2.4 ; extra == 'check' - - pytest-ruff>=0.2.1 ; sys_platform != 'cygwin' and extra == 'check' - - ruff>=0.8.0 ; sys_platform != 'cygwin' and extra == 'check' - - pytest-cov ; extra == 'cover' - - pytest-enabler>=2.2 ; extra == 'enabler' - - pytest-mypy ; extra == 'type' - - mypy==1.14.* ; extra == 'type' - - importlib-metadata>=7.0.2 ; python_full_version < '3.10' and extra == 'type' - - jaraco-develop>=7.21 ; sys_platform != 'cygwin' and extra == 'type' - requires_python: '>=3.9' -- conda: https://conda.anaconda.org/conda-forge/noarch/six-1.17.0-pyhe01879c_1.conda - sha256: 458227f759d5e3fcec5d9b7acce54e10c9e1f4f4b7ec978f3bfd54ce4ee9853d - md5: 3339e3b65d58accf4ca4fb8748ab16b3 - depends: - - python >=3.9 - - python - license: MIT - license_family: MIT - purls: - - pkg:pypi/six?source=hash-mapping - size: 18455 - timestamp: 1753199211006 -- pypi: https://files.pythonhosted.org/packages/08/5b/a2a3d4514c64818925f4e886d39981f1926eeb5288a4549c6b3c17ed66bb/smart_open-7.3.0.post1-py3-none-any.whl - name: smart-open - version: 7.3.0.post1 - sha256: c73661a2c24bf045c1e04e08fffc585b59af023fe783d57896f590489db66fb4 - requires_dist: - - wrapt - - boto3 ; extra == 's3' - - google-cloud-storage>=2.6.0 ; extra == 'gcs' - - azure-storage-blob ; extra == 'azure' - - azure-common ; extra == 'azure' - - azure-core ; extra == 'azure' - - requests ; extra == 'http' - - requests ; extra == 'webhdfs' - - paramiko ; extra == 'ssh' - - zstandard ; extra == 'zst' - - smart-open[azure,gcs,http,s3,ssh,webhdfs,zst] ; extra == 'all' - - smart-open[all] ; extra == 'test' - - moto[server] ; extra == 'test' - - responses ; extra == 'test' - - pytest ; extra == 'test' - - pytest-rerunfailures ; extra == 'test' - - pytest-benchmark ; extra == 'test' - - awscli ; extra == 'test' - - pyopenssl ; extra == 'test' - - numpy ; extra == 'test' - requires_python: '>=3.8,<4.0' -- pypi: https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl - name: sortedcontainers - version: 2.4.0 - sha256: a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0 -- pypi: https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl - name: stack-data - version: 0.6.3 - sha256: d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695 - requires_dist: - - executing>=1.2.0 - - asttokens>=2.1.0 - - pure-eval - - pytest ; extra == 'tests' - - typeguard ; extra == 'tests' - - pygments ; extra == 'tests' - - littleutils ; extra == 'tests' - - cython ; extra == 'tests' -- pypi: https://files.pythonhosted.org/packages/96/7c/a81ef5ef10978dd073a854e0fa93b5d8021d0594b639cc8f6453c3c78a1d/strictyaml-1.7.3-py3-none-any.whl - name: strictyaml - version: 1.7.3 - sha256: fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7 - requires_dist: - - python-dateutil>=2.6.0 - requires_python: 
'>=3.7.0' -- pypi: https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl - name: tenacity - version: 9.1.2 - sha256: f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138 - requires_dist: - - reno ; extra == 'doc' - - sphinx ; extra == 'doc' - - pytest ; extra == 'test' - - tornado>=4.5 ; extra == 'test' - - typeguard ; extra == 'test' - requires_python: '>=3.9' -- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_hd72426e_102.conda - sha256: a84ff687119e6d8752346d1d408d5cf360dee0badd487a472aa8ddedfdc219e1 - md5: a0116df4f4ed05c303811a837d5b39d8 - depends: - - __glibc >=2.17,<3.0.a0 - - libgcc >=13 - - libzlib >=1.3.1,<2.0a0 - license: TCL - license_family: BSD - purls: [] - size: 3285204 - timestamp: 1748387766691 -- pypi: https://files.pythonhosted.org/packages/f9/41/fb15f06e33d7430ca89420283a8762a4e6b8025b800ea51796ab5e6d9559/tornado-6.5.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: tornado - version: 6.5.2 - sha256: e792706668c87709709c18b353da1f7662317b563ff69f00bab83595940c7108 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl - name: tqdm - version: 4.67.1 - sha256: 26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 - requires_dist: - - colorama ; sys_platform == 'win32' - - pytest>=6 ; extra == 'dev' - - pytest-cov ; extra == 'dev' - - pytest-timeout ; extra == 'dev' - - pytest-asyncio>=0.24 ; extra == 'dev' - - nbval ; extra == 'dev' - - requests ; extra == 'discord' - - slack-sdk ; extra == 'slack' - - requests ; extra == 'telegram' - - ipywidgets>=6 ; extra == 'notebook' - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl - name: traitlets - version: 5.14.3 - sha256: b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f - requires_dist: - - myst-parser ; extra == 'docs' - - pydata-sphinx-theme ; extra == 'docs' - - sphinx ; extra == 'docs' - - argcomplete>=3.0.3 ; extra == 'test' - - mypy>=1.7.0 ; extra == 'test' - - pre-commit ; extra == 'test' - - pytest-mock ; extra == 'test' - - pytest-mypy-testing ; extra == 'test' - - pytest>=7.0,<8.2 ; extra == 'test' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl - name: typing-extensions - version: 4.14.1 - sha256: d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl - name: typing-inspection - version: 0.4.1 - sha256: 389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51 - requires_dist: - - typing-extensions>=4.12.0 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl - name: tzdata - version: '2025.2' - sha256: 1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8 - requires_python: '>=2' -- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025b-h78e105d_0.conda - sha256: 
5aaa366385d716557e365f0a4e9c3fca43ba196872abbbe3d56bb610d131e192 - md5: 4222072737ccff51314b5ece9c7d6f5a - license: LicenseRef-Public-Domain - purls: [] - size: 122968 - timestamp: 1742727099393 -- pypi: https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl - name: urllib3 - version: 2.5.0 - sha256: e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc - requires_dist: - - brotli>=1.0.9 ; platform_python_implementation == 'CPython' and extra == 'brotli' - - brotlicffi>=0.8.0 ; platform_python_implementation != 'CPython' and extra == 'brotli' - - h2>=4,<5 ; extra == 'h2' - - pysocks>=1.5.6,!=1.5.7,<2.0 ; extra == 'socks' - - zstandard>=0.18.0 ; extra == 'zstd' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/ca/ff/ded57ac5ff40a09e6e198550bab075d780941e0b0f83cbeabd087c59383a/virtualenv-20.33.1-py3-none-any.whl - name: virtualenv - version: 20.33.1 - sha256: 07c19bc66c11acab6a5958b815cbcee30891cd1c2ccf53785a28651a0d8d8a67 - requires_dist: - - distlib>=0.3.7,<1 - - filelock>=3.12.2,<4 - - importlib-metadata>=6.6 ; python_full_version < '3.8' - - platformdirs>=3.9.1,<5 - - furo>=2023.7.26 ; extra == 'docs' - - proselint>=0.13 ; extra == 'docs' - - sphinx>=7.1.2,!=7.3 ; extra == 'docs' - - sphinx-argparse>=0.4 ; extra == 'docs' - - sphinxcontrib-towncrier>=0.2.1a0 ; extra == 'docs' - - towncrier>=23.6 ; extra == 'docs' - - covdefaults>=2.3 ; extra == 'test' - - coverage-enable-subprocess>=1 ; extra == 'test' - - coverage>=7.2.7 ; extra == 'test' - - flaky>=3.7 ; extra == 'test' - - packaging>=23.1 ; extra == 'test' - - pytest-env>=0.8.2 ; extra == 'test' - - pytest-freezer>=0.4.8 ; (python_full_version >= '3.13' and platform_python_implementation == 'CPython' and sys_platform == 'win32' and extra == 'test') or (platform_python_implementation == 'GraalVM' and extra == 'test') or (platform_python_implementation == 'PyPy' and extra == 'test') - - pytest-mock>=3.11.1 ; extra == 'test' - - pytest-randomly>=3.12 ; extra == 'test' - - pytest-timeout>=2.1 ; extra == 'test' - - pytest>=7.4 ; extra == 'test' - - setuptools>=68 ; extra == 'test' - - time-machine>=2.10 ; platform_python_implementation == 'CPython' and extra == 'test' - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl - name: watchdog - version: 6.0.0 - sha256: 20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2 - requires_dist: - - pyyaml>=3.10 ; extra == 'watchmedo' - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl - name: wcwidth - version: 0.2.13 - sha256: 3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859 - requires_dist: - - backports-functools-lru-cache>=1.2.1 ; python_full_version < '3.2' -- pypi: https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl - name: widgetsnbextension - version: 4.0.14 - sha256: 4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575 - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl - name: wrapt - version: 1.17.3 
- sha256: 6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277 - requires_python: '>=3.8' -- pypi: https://files.pythonhosted.org/packages/b4/0e/15c243775342ce840b9ba34aceace06a1148fa1630cd8ca269e3223987f5/xxhash-3.5.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: xxhash - version: 3.5.0 - sha256: 07fda5de378626e502b42b311b049848c2ef38784d0d67b6f30bb5008642f8eb - requires_python: '>=3.7' -- pypi: https://files.pythonhosted.org/packages/af/44/46407d7f7a56e9a85a4c207724c9f2c545c060380718eea9088f222ba697/yarl-1.20.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl - name: yarl - version: 1.20.1 - sha256: d1a4fbb50e14396ba3d375f68bfe02215d8e7bc3ec49da8341fe3157f59d2ff5 - requires_dist: - - idna>=2.0 - - multidict>=4.0 - - propcache>=0.2.1 - requires_python: '>=3.9' -- pypi: https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl - name: zipp - version: 3.23.0 - sha256: 071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e - requires_dist: - - pytest>=6,!=8.1.* ; extra == 'test' - - jaraco-itertools ; extra == 'test' - - jaraco-functools ; extra == 'test' - - more-itertools ; extra == 'test' - - big-o ; extra == 'test' - - pytest-ignore-flaky ; extra == 'test' - - jaraco-test ; extra == 'test' - - sphinx>=3.5 ; extra == 'doc' - - jaraco-packaging>=9.3 ; extra == 'doc' - - rst-linker>=1.9 ; extra == 'doc' - - furo ; extra == 'doc' - - sphinx-lint ; extra == 'doc' - - jaraco-tidelift>=1.4 ; extra == 'doc' - - pytest-checkdocs>=2.4 ; extra == 'check' - - pytest-ruff>=0.2.1 ; sys_platform != 'cygwin' and extra == 'check' - - pytest-cov ; extra == 'cover' - - pytest-enabler>=2.2 ; extra == 'enabler' - - pytest-mypy ; extra == 'type' - requires_python: '>=3.9' diff --git a/pyproject.toml b/pyproject.toml index 5c243d6..5a23cad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "orcapod" -description = "simple yet powerful pipeline library for building reproducible scientific pipeline" +description = "Intuitive and powerful library for highly reproducible scientific data pipeline" dynamic = ["version"] dependencies = [ "xxhash", @@ -76,24 +76,6 @@ dev = [ "tqdm>=4.67.1", ] -[tool.pixi.workspace] -channels = ["conda-forge"] -platforms = ["linux-64"] - -[tool.pixi.pypi-dependencies] -orcapod = { path = ".", editable = true } - -[tool.pixi.environments] -default = { solve-group = "default" } -all = { features = ["all", "redis", "ray"], solve-group = "default" } -dev = { features = ["dev"], solve-group = "default" } -ray = { features = ["ray"], solve-group = "default" } -redis = { features = ["redis"], solve-group = "default" } - -[tool.pixi.tasks] - -[tool.pixi.dependencies] -python = ">=3.12" [tool.uv.sources] selection-pipeline = { git = "https://github.com/enigma-brain/selection_pipeline" } #, rev = "6218290" } From a158e17b170192027ea7a08d63f198bb471e978d Mon Sep 17 00:00:00 2001 From: "Edgar Y. 
Walker" Date: Mon, 1 Sep 2025 16:15:36 -0700 Subject: [PATCH 224/224] feat: add more operators --- src/orcapod/core/operators/__init__.py | 11 +- .../core/operators/column_selection.py | 168 +++++++++++++++++- src/orcapod/core/operators/filters.py | 166 +++++++++++++++++ src/orcapod/core/sources/data_frame_source.py | 15 +- src/orcapod/core/streams/base.py | 71 ++++++-- src/orcapod/pipeline/graph.py | 4 +- .../protocols/core_protocols/streams.py | 56 +++++- 7 files changed, 459 insertions(+), 32 deletions(-) create mode 100644 src/orcapod/core/operators/filters.py diff --git a/src/orcapod/core/operators/__init__.py b/src/orcapod/core/operators/__init__.py index 6cc8ee3..b1f0544 100644 --- a/src/orcapod/core/operators/__init__.py +++ b/src/orcapod/core/operators/__init__.py @@ -2,7 +2,13 @@ from .semijoin import SemiJoin from .mappers import MapTags, MapPackets from .batch import Batch -from .column_selection import DropTagColumns, DropPacketColumns +from .column_selection import ( + SelectTagColumns, + SelectPacketColumns, + DropTagColumns, + DropPacketColumns, +) +from .filters import PolarsFilter __all__ = [ "Join", @@ -10,6 +16,9 @@ "MapTags", "MapPackets", "Batch", + "SelectTagColumns", + "SelectPacketColumns", "DropTagColumns", "DropPacketColumns", + "PolarsFilter", ] diff --git a/src/orcapod/core/operators/column_selection.py b/src/orcapod/core/operators/column_selection.py index 46f1612..4140db8 100644 --- a/src/orcapod/core/operators/column_selection.py +++ b/src/orcapod/core/operators/column_selection.py @@ -18,6 +18,155 @@ logger = logging.getLogger(__name__) +class SelectTagColumns(UnaryOperator): + """ + Operator that selects specified columns from a stream. + """ + + def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs): + if isinstance(columns, str): + columns = [columns] + self.columns = columns + self.strict = strict + super().__init__(**kwargs) + + def op_forward(self, stream: cp.Stream) -> cp.Stream: + tag_columns, packet_columns = stream.keys() + tags_to_drop = [c for c in tag_columns if c not in self.columns] + new_tag_columns = [c for c in tag_columns if c not in tags_to_drop] + + if len(new_tag_columns) == len(tag_columns): + logger.info("All tag columns are selected. Returning stream unaltered.") + return stream + + table = stream.as_table( + include_source=True, include_system_tags=True, sort_by_tags=False + ) + + modified_table = table.drop_columns(list(tags_to_drop)) + + return TableStream( + modified_table, + tag_columns=new_tag_columns, + source=self, + upstreams=(stream,), + ) + + def op_validate_inputs(self, stream: cp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # TODO: remove redundant logic + tag_columns, packet_columns = stream.keys() + columns_to_select = self.columns + missing_columns = set(columns_to_select) - set(tag_columns) + if missing_columns and self.strict: + raise InputValidationError( + f"Missing tag columns: {missing_columns}. 
Make sure all specified columns to select are present or use strict=False to ignore missing columns" + ) + + def op_output_types( + self, stream: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_schema, packet_schema = stream.types( + include_system_tags=include_system_tags + ) + tag_columns, _ = stream.keys() + tags_to_drop = [tc for tc in tag_columns if tc not in self.columns] + + # this ensures all system tag columns are preserved + new_tag_schema = {k: v for k, v in tag_schema.items() if k not in tags_to_drop} + + return new_tag_schema, packet_schema + + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.columns, + self.strict, + ) + ((stream,) if stream is not None else ()) + + +class SelectPacketColumns(UnaryOperator): + """ + Operator that selects specified columns from a stream. + """ + + def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs): + if isinstance(columns, str): + columns = [columns] + self.columns = columns + self.strict = strict + super().__init__(**kwargs) + + def op_forward(self, stream: cp.Stream) -> cp.Stream: + tag_columns, packet_columns = stream.keys() + packet_columns_to_drop = [c for c in packet_columns if c not in self.columns] + new_packet_columns = [ + c for c in packet_columns if c not in packet_columns_to_drop + ] + + if len(new_packet_columns) == len(packet_columns): + logger.info("All packet columns are selected. Returning stream unaltered.") + return stream + + table = stream.as_table( + include_source=True, include_system_tags=True, sort_by_tags=False + ) + # make sure to drop associated source fields + associated_source_fields = [ + f"{constants.SOURCE_PREFIX}{c}" for c in packet_columns_to_drop + ] + packet_columns_to_drop.extend(associated_source_fields) + + modified_table = table.drop_columns(packet_columns_to_drop) + + return TableStream( + modified_table, + tag_columns=tag_columns, + source=self, + upstreams=(stream,), + ) + + def op_validate_inputs(self, stream: cp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # TODO: remove redundant logic + tag_columns, packet_columns = stream.keys() + columns_to_select = self.columns + missing_columns = set(columns_to_select) - set(packet_columns) + if missing_columns and self.strict: + raise InputValidationError( + f"Missing packet columns: {missing_columns}. Make sure all specified columns to select are present or use strict=False to ignore missing columns" + ) + + def op_output_types( + self, stream: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_schema, packet_schema = stream.types( + include_system_tags=include_system_tags + ) + _, packet_columns = stream.keys() + packets_to_drop = [pc for pc in packet_columns if pc not in self.columns] + + # this ensures all system tag columns are preserved + new_packet_schema = { + k: v for k, v in packet_schema.items() if k not in packets_to_drop + } + + return tag_schema, new_packet_schema + + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.columns, + self.strict, + ) + ((stream,) if stream is not None else ()) + + class DropTagColumns(UnaryOperator): """ Operator that drops specified columns from a stream. 
@@ -64,11 +213,10 @@ def op_validate_inputs(self, stream: cp.Stream) -> None: tag_columns, packet_columns = stream.keys() columns_to_drop = self.columns missing_columns = set(columns_to_drop) - set(tag_columns) - if missing_columns: - if self.strict: - raise InputValidationError( - f"Missing tag columns: {missing_columns}. Make sure all specified columns to drop are present or use strict=False to ignore missing columns" - ) + if missing_columns and self.strict: + raise InputValidationError( + f"Missing tag columns: {missing_columns}. Make sure all specified columns to drop are present or use strict=False to ignore missing columns" + ) def op_output_types( self, stream: cp.Stream, include_system_tags: bool = False @@ -105,7 +253,7 @@ def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs def op_forward(self, stream: cp.Stream) -> cp.Stream: tag_columns, packet_columns = stream.keys() - columns_to_drop = self.columns + columns_to_drop = list(self.columns) if not self.strict: columns_to_drop = [c for c in columns_to_drop if c in packet_columns] @@ -113,11 +261,17 @@ def op_forward(self, stream: cp.Stream) -> cp.Stream: logger.info("No packet columns to drop. Returning stream unaltered.") return stream + # make sure all associated source columns are dropped too + associated_source_columns = [ + f"{constants.SOURCE_PREFIX}{c}" for c in columns_to_drop + ] + columns_to_drop.extend(associated_source_columns) + table = stream.as_table( include_source=True, include_system_tags=True, sort_by_tags=False ) - modified_table = table.drop_columns(list(columns_to_drop)) + modified_table = table.drop_columns(columns_to_drop) return TableStream( modified_table, diff --git a/src/orcapod/core/operators/filters.py b/src/orcapod/core/operators/filters.py new file mode 100644 index 0000000..2edf4f7 --- /dev/null +++ b/src/orcapod/core/operators/filters.py @@ -0,0 +1,166 @@ +from orcapod.protocols import core_protocols as cp +from orcapod.core.streams import TableStream +from orcapod.types import PythonSchema +from typing import Any, TYPE_CHECKING, TypeAlias +from orcapod.utils.lazy_module import LazyModule +from collections.abc import Collection, Mapping +from orcapod.errors import InputValidationError +from orcapod.core.system_constants import constants +from orcapod.core.operators.base import UnaryOperator +import logging +from collections.abc import Iterable + + +if TYPE_CHECKING: + import pyarrow as pa + import polars as pl + import polars._typing as pl_type + import numpy as np +else: + pa = LazyModule("pyarrow") + pl = LazyModule("polars") + pl_type = LazyModule("polars._typing") + +logger = logging.getLogger(__name__) + +polars_predicate: TypeAlias = "pl_type.IntoExprColumn| Iterable[pl_type.IntoExprColumn]| bool| list[bool]| np.ndarray[Any, Any]" + + +class PolarsFilter(UnaryOperator): + """ + Operator that applies Polars filtering to a stream + """ + + def __init__( + self, + predicates: Collection[ + "pl_type.IntoExprColumn| Iterable[pl_type.IntoExprColumn]| bool| list[bool]| np.ndarray[Any, Any]" + ] = (), + constraints: Mapping[str, Any] | None = None, + **kwargs, + ): + self.predicates = predicates + self.constraints = constraints if constraints is not None else {} + super().__init__(**kwargs) + + def op_forward(self, stream: cp.Stream) -> cp.Stream: + if len(self.predicates) == 0 and len(self.constraints) == 0: + logger.info( + "No predicates or constraints specified. Returning stream unaltered." + ) + return stream + + # TODO: improve efficiency here... 
+ table = stream.as_table( + include_source=True, include_system_tags=True, sort_by_tags=False + ) + df = pl.DataFrame(table) + filtered_table = df.filter(*self.predicates, **self.constraints).to_arrow() + + return TableStream( + filtered_table, + tag_columns=stream.tag_keys(), + source=self, + upstreams=(stream,), + ) + + def op_validate_inputs(self, stream: cp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + + # Any valid stream would work + return + + def op_output_types( + self, stream: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + # data types are not modified + return stream.types(include_system_tags=include_system_tags) + + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.predicates, + self.constraints, + ) + ((stream,) if stream is not None else ()) + + +class SelectPacketColumns(UnaryOperator): + """ + Operator that selects specified columns from a stream. + """ + + def __init__(self, columns: str | Collection[str], strict: bool = True, **kwargs): + if isinstance(columns, str): + columns = [columns] + self.columns = columns + self.strict = strict + super().__init__(**kwargs) + + def op_forward(self, stream: cp.Stream) -> cp.Stream: + tag_columns, packet_columns = stream.keys() + packet_columns_to_drop = [c for c in packet_columns if c not in self.columns] + new_packet_columns = [ + c for c in packet_columns if c not in packet_columns_to_drop + ] + + if len(new_packet_columns) == len(packet_columns): + logger.info("All packet columns are selected. Returning stream unaltered.") + return stream + + table = stream.as_table( + include_source=True, include_system_tags=True, sort_by_tags=False + ) + # make sure to drop associated source fields + associated_source_fields = [ + f"{constants.SOURCE_PREFIX}{c}" for c in packet_columns_to_drop + ] + packet_columns_to_drop.extend(associated_source_fields) + + modified_table = table.drop_columns(packet_columns_to_drop) + + return TableStream( + modified_table, + tag_columns=tag_columns, + source=self, + upstreams=(stream,), + ) + + def op_validate_inputs(self, stream: cp.Stream) -> None: + """ + This method should be implemented by subclasses to validate the inputs to the operator. + It takes two streams as input and raises an error if the inputs are not valid. + """ + # TODO: remove redundant logic + tag_columns, packet_columns = stream.keys() + columns_to_select = self.columns + missing_columns = set(columns_to_select) - set(packet_columns) + if missing_columns and self.strict: + raise InputValidationError( + f"Missing packet columns: {missing_columns}. 
Make sure all specified columns to select are present or use strict=False to ignore missing columns" + ) + + def op_output_types( + self, stream: cp.Stream, include_system_tags: bool = False + ) -> tuple[PythonSchema, PythonSchema]: + tag_schema, packet_schema = stream.types( + include_system_tags=include_system_tags + ) + _, packet_columns = stream.keys() + packets_to_drop = [pc for pc in packet_columns if pc not in self.columns] + + # this ensures all system tag columns are preserved + new_packet_schema = { + k: v for k, v in packet_schema.items() if k not in packets_to_drop + } + + return tag_schema, new_packet_schema + + def op_identity_structure(self, stream: cp.Stream | None = None) -> Any: + return ( + self.__class__.__name__, + self.columns, + self.strict, + ) + ((stream,) if stream is not None else ()) diff --git a/src/orcapod/core/sources/data_frame_source.py b/src/orcapod/core/sources/data_frame_source.py index 35af11d..2fb4a78 100644 --- a/src/orcapod/core/sources/data_frame_source.py +++ b/src/orcapod/core/sources/data_frame_source.py @@ -8,7 +8,7 @@ from orcapod.core.system_constants import constants from orcapod.core import polars_data_utils from orcapod.core.sources.source_registry import GLOBAL_SOURCE_REGISTRY, SourceRegistry - +import logging from orcapod.core.sources.base import SourceBase if TYPE_CHECKING: @@ -20,6 +20,9 @@ pl = LazyModule("polars") +logger = logging.getLogger(__name__) + + class DataFrameSource(SourceBase): """Construct source from a dataframe and any Polars dataframe compatible data structure""" @@ -42,6 +45,16 @@ def __init__( # TODO: work with LazyFrame df = pl.DataFrame(data) + object_columns = [c for c in df.columns if df[c].dtype == pl.Object] + if len(object_columns) > 0: + logger.info( + f"Converting {len(object_columns)}object columns to Arrow format" + ) + sub_table = self.data_context.type_converter.python_dicts_to_arrow_table( + df.select(object_columns).to_dicts() + ) + df = df.with_columns([pl.from_arrow(c) for c in sub_table]) + if isinstance(tag_columns, str): tag_columns = [tag_columns] diff --git a/src/orcapod/core/streams/base.py b/src/orcapod/core/streams/base.py index 082225a..a216cdc 100644 --- a/src/orcapod/core/streams/base.py +++ b/src/orcapod/core/streams/base.py @@ -96,19 +96,70 @@ def batch( self, label=label ) # type: ignore + def polars_filter( + self, + *predicates: Any, + constraint_map: Mapping[str, Any] | None = None, + label: str | None = None, + **constraints: Any, + ) -> cp.Stream: + from orcapod.core.operators import PolarsFilter + + total_constraints = dict(constraint_map) if constraint_map is not None else {} + + total_constraints.update(constraints) + + return PolarsFilter(predicates=predicates, constraints=total_constraints)( + self, label=label + ) + + def select_tag_columns( + self: cp.Stream, + tag_columns: str | Collection[str], + strict: bool = True, + label: str | None = None, + ) -> cp.Stream: + """ + Select the specified tag columns from the stream. A ValueError is raised + if one or more specified tag columns do not exist in the stream unless strict = False. + """ + from orcapod.core.operators import SelectTagColumns + + return SelectTagColumns(tag_columns, strict=strict)(self, label=label) + + def select_packet_columns( + self: cp.Stream, + packet_columns: str | Collection[str], + strict: bool = True, + label: str | None = None, + ) -> cp.Stream: + """ + Select the specified packet columns from the stream. 
A ValueError is raised + if one or more specified packet columns do not exist in the stream unless strict = False. + """ + from orcapod.core.operators import SelectPacketColumns + + return SelectPacketColumns(packet_columns, strict=strict)(self, label=label) + def drop_tag_columns( - self: cp.Stream, tag_columns: str | Collection[str], label: str | None = None + self: cp.Stream, + tag_columns: str | Collection[str], + strict: bool = True, + label: str | None = None, ) -> cp.Stream: from orcapod.core.operators import DropTagColumns - return DropTagColumns(tag_columns)(self, label=label) + return DropTagColumns(tag_columns, strict=strict)(self, label=label) def drop_packet_columns( - self: cp.Stream, packet_columns: str | Collection[str], label: str | None = None + self: cp.Stream, + packet_columns: str | Collection[str], + strict: bool = True, + label: str | None = None, ) -> cp.Stream: from orcapod.core.operators import DropPacketColumns - return DropPacketColumns(packet_columns)(self, label=label) + return DropPacketColumns(packet_columns, strict=strict)(self, label=label) class StatefulStreamBase(OperatorStreamBaseMixin, LabeledContentIdentifiableBase): @@ -303,7 +354,7 @@ def as_polars_df( include_content_hash: bool | str = False, sort_by_tags: bool = True, execution_engine: cp.ExecutionEngine | None = None, - ) -> "pl.DataFrame | None": + ) -> "pl.DataFrame": """ Convert the entire stream to a Polars DataFrame. """ @@ -326,7 +377,7 @@ def as_df( include_content_hash: bool | str = False, sort_by_tags: bool = True, execution_engine: cp.ExecutionEngine | None = None, - ) -> "pl.DataFrame | None": + ) -> "pl.DataFrame": """ Convert the entire stream to a Polars DataFrame. """ @@ -347,7 +398,7 @@ def as_lazy_frame( include_content_hash: bool | str = False, sort_by_tags: bool = True, execution_engine: cp.ExecutionEngine | None = None, - ) -> "pl.LazyFrame | None": + ) -> "pl.LazyFrame": """ Convert the entire stream to a Polars LazyFrame. 
""" @@ -359,8 +410,6 @@ def as_lazy_frame( sort_by_tags=sort_by_tags, execution_engine=execution_engine, ) - if df is None: - return None return df.lazy() def as_pandas_df( @@ -372,7 +421,7 @@ def as_pandas_df( sort_by_tags: bool = True, index_by_tags: bool = True, execution_engine: cp.ExecutionEngine | None = None, - ) -> "pd.DataFrame | None": + ) -> "pd.DataFrame": df = self.as_polars_df( include_data_context=include_data_context, include_source=include_source, @@ -381,8 +430,6 @@ def as_pandas_df( sort_by_tags=sort_by_tags, execution_engine=execution_engine, ) - if df is None: - return None tag_keys, _ = self.keys() pdf = df.to_pandas() if index_by_tags: diff --git a/src/orcapod/pipeline/graph.py b/src/orcapod/pipeline/graph.py index 8e2bb67..84b9eab 100644 --- a/src/orcapod/pipeline/graph.py +++ b/src/orcapod/pipeline/graph.py @@ -481,7 +481,7 @@ def render_graph( output_path: Optional[str] = None, raw_output: bool = False, figsize: tuple = (12, 8), - dpi: int = 200, + dpi: int = 150, style_rules: Optional[Dict] = None, **style_overrides, ) -> Optional[str]: @@ -541,7 +541,7 @@ def render_graph( dot.render(tmp.name[:-4], format="png", cleanup=True) img = mpimg.imread(tmp.name) plt.figure(figsize=figsize, dpi=dpi) - plt.imshow(img, interpolation="none") + plt.imshow(img) plt.axis("off") # plt.title("Example Graph") plt.tight_layout() diff --git a/src/orcapod/protocols/core_protocols/streams.py b/src/orcapod/protocols/core_protocols/streams.py index bada8c9..f7e692a 100644 --- a/src/orcapod/protocols/core_protocols/streams.py +++ b/src/orcapod/protocols/core_protocols/streams.py @@ -1,6 +1,6 @@ from collections.abc import Collection, Iterator, Mapping from datetime import datetime -from typing import TYPE_CHECKING, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable from orcapod.protocols.core_protocols.base import ExecutionEngine, Labelable from orcapod.protocols.core_protocols.datagrams import Packet, Tag @@ -291,7 +291,7 @@ def as_df( include_content_hash: bool | str = False, sort_by_tags: bool = True, execution_engine: ExecutionEngine | None = None, - ) -> "pl.DataFrame | None": + ) -> "pl.DataFrame": """ Convert the entire stream to a Polars DataFrame. """ @@ -305,7 +305,7 @@ def as_lazy_frame( include_content_hash: bool | str = False, sort_by_tags: bool = True, execution_engine: ExecutionEngine | None = None, - ) -> "pl.LazyFrame | None": + ) -> "pl.LazyFrame": """ Load the entire stream to a Polars LazyFrame. """ @@ -319,7 +319,7 @@ def as_polars_df( include_content_hash: bool | str = False, sort_by_tags: bool = True, execution_engine: ExecutionEngine | None = None, - ) -> "pl.DataFrame | None": ... + ) -> "pl.DataFrame": ... def as_pandas_df( self, @@ -330,7 +330,7 @@ def as_pandas_df( sort_by_tags: bool = True, index_by_tags: bool = True, execution_engine: ExecutionEngine | None = None, - ) -> "pd.DataFrame | None": ... + ) -> "pd.DataFrame": ... def as_table( self, @@ -428,22 +428,60 @@ def map_packets( """ ... + def polars_filter( + self, + *predicates: Any, + constraint_map: Mapping[str, Any] | None = None, + label: str | None = None, + **constraints: Any, + ) -> "Stream": ... + + def select_tag_columns( + self, + tag_columns: str | Collection[str], + strict: bool = True, + label: str | None = None, + ) -> "Stream": + """ + Select the specified tag columns from the stream. A ValueError is raised + if one or more specified tag columns do not exist in the stream unless strict = False. + """ + ... 
+
+    def select_packet_columns(
+        self,
+        packet_columns: str | Collection[str],
+        strict: bool = True,
+        label: str | None = None,
+    ) -> "Stream":
+        """
+        Select the specified packet columns from the stream. A ValueError is raised
+        if one or more specified packet columns do not exist in the stream unless strict = False.
+        """
+        ...
+
     def drop_tag_columns(
-        self, tag_columns: str | Collection[str], label: str | None = None
+        self,
+        tag_columns: str | Collection[str],
+        strict: bool = True,
+        label: str | None = None,
     ) -> "Stream":
         """
         Drop the specified tag columns from the stream. A ValueError is raised
-        if one or more specified tag columns do not exist in the stream.
+        if one or more specified tag columns do not exist in the stream unless strict = False.
         """
         ...

     # TODO: check to make sure source columns are also dropped
     def drop_packet_columns(
-        self, packet_columns: str | Collection[str], label: str | None = None
+        self,
+        packet_columns: str | Collection[str],
+        strict: bool = True,
+        label: str | None = None,
     ) -> "Stream":
         """
         Drop the specified packet columns from the stream. A ValueError is raised
-        if one or more specified packet columns do not exist in the stream.
+        if one or more specified packet columns do not exist in the stream unless strict = False.
         """
         ...
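
The following is a minimal usage sketch of the operators introduced in PATCH 224, assuming an already constructed `stream` object (how it is produced, for example from a DataFrameSource, is not shown in this patch excerpt) and hypothetical column names (`subject`, `score`, `session_id`, `image_path`). The operator call pattern mirrors the convenience methods added to streams above, where an instantiated operator is applied to a stream and returns a new stream.

    import polars as pl

    from orcapod.core.operators import PolarsFilter, SelectPacketColumns, SelectTagColumns

    # `stream` is any orcapod Stream produced upstream; its construction is out of
    # scope here. All column names below are illustrative placeholders.

    # Operator-style usage: instantiate, then apply to the stream.
    filtered = PolarsFilter(
        predicates=(pl.col("score") > 0.5,),   # positional Polars predicates
        constraints={"subject": "A"},          # equality constraints (column == value)
    )(stream)

    # Equivalent fluent usage via the stream helper methods added in this patch.
    result = (
        stream
        .polars_filter(pl.col("score") > 0.5, subject="A")
        .select_tag_columns("session_id")
        .select_packet_columns(["image_path", "score"], strict=False)
    )

    # Materialize as a Polars DataFrame; per this patch, as_polars_df() now always
    # returns a DataFrame rather than DataFrame | None.
    df = result.as_polars_df()

Note the design choice visible in SelectPacketColumns and DropPacketColumns: when a packet column is removed, its associated `{constants.SOURCE_PREFIX}<column>` field is dropped as well, so source metadata never outlives the column it describes, and `strict=False` silently ignores columns that are not present.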